1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
29 */
30
31#include <sys/param.h>
32#include <sys/types.h>
33#include <sys/systm.h>
34#include <sys/cred.h>
35#include <sys/proc.h>
36#include <sys/user.h>
37#include <sys/time.h>
38#include <sys/buf.h>
39#include <sys/vfs.h>
40#include <sys/vnode.h>
41#include <sys/socket.h>
42#include <sys/uio.h>
43#include <sys/tiuser.h>
44#include <sys/swap.h>
45#include <sys/errno.h>
46#include <sys/debug.h>
47#include <sys/kmem.h>
48#include <sys/kstat.h>
49#include <sys/cmn_err.h>
50#include <sys/vtrace.h>
51#include <sys/session.h>
52#include <sys/dnlc.h>
53#include <sys/bitmap.h>
54#include <sys/acl.h>
55#include <sys/ddi.h>
56#include <sys/pathname.h>
57#include <sys/flock.h>
58#include <sys/dirent.h>
59#include <sys/flock.h>
60#include <sys/callb.h>
61#include <sys/atomic.h>
62#include <sys/list.h>
63#include <sys/tsol/tnet.h>
64#include <sys/priv.h>
65#include <sys/sdt.h>
66#include <sys/attr.h>
67
68#include <inet/ip6.h>
69
70#include <rpc/types.h>
71#include <rpc/xdr.h>
72#include <rpc/auth.h>
73#include <rpc/clnt.h>
74
75#include <nfs/nfs.h>
76#include <nfs/nfs4.h>
77#include <nfs/nfs_clnt.h>
78#include <nfs/rnode.h>
79#include <nfs/nfs_acl.h>
80
81#include <sys/tsol/label.h>
82
83/*
84 * The hash queues for the access to active and cached rnodes
85 * are organized as doubly linked lists.  A reader/writer lock
86 * for each hash bucket is used to control access and to synchronize
87 * lookups, additions, and deletions from the hash queue.
88 *
89 * The rnode freelist is organized as a doubly linked list with
90 * a head pointer.  Additions and deletions are synchronized via
91 * a single mutex.
92 *
93 * In order to add an rnode to the free list, it must be hashed into
94 * a hash queue and the exclusive lock to the hash queue be held.
95 * If an rnode is not hashed into a hash queue, then it is destroyed
96 * because it represents no valuable information that can be reused
97 * about the file.  The exclusive lock to the hash queue must be
98 * held in order to prevent a lookup in the hash queue from finding
99 * the rnode and using it and assuming that the rnode is not on the
100 * freelist.  The lookup in the hash queue will have the hash queue
101 * locked, either exclusive or shared.
102 *
103 * The vnode reference count for each rnode is not allowed to drop
104 * below 1.  This prevents external entities, such as the VM
105 * subsystem, from acquiring references to vnodes already on the
106 * freelist and then trying to place them back on the freelist
107 * when their reference is released.  This means that the when an
108 * rnode is looked up in the hash queues, then either the rnode
109 * is removed from the freelist and that reference is transferred to
110 * the new reference or the vnode reference count must be incremented
111 * accordingly.  The mutex for the freelist must be held in order to
112 * accurately test to see if the rnode is on the freelist or not.
113 * The hash queue lock might be held shared and it is possible that
114 * two different threads may race to remove the rnode from the
115 * freelist.  This race can be resolved by holding the mutex for the
116 * freelist.  Please note that the mutex for the freelist does not
117 * need to held if the rnode is not on the freelist.  It can not be
118 * placed on the freelist due to the requirement that the thread
119 * putting the rnode on the freelist must hold the exclusive lock
120 * to the hash queue and the thread doing the lookup in the hash
121 * queue is holding either a shared or exclusive lock to the hash
122 * queue.
123 *
124 * The lock ordering is:
125 *
126 *	hash bucket lock -> vnode lock
127 *	hash bucket lock -> freelist lock
128 */
129static rhashq_t *rtable;
130
131static kmutex_t rpfreelist_lock;
132static rnode_t *rpfreelist = NULL;
133static long rnew = 0;
134long nrnode = 0;
135
136static int rtablesize;
137static int rtablemask;
138
139static int hashlen = 4;
140
141static struct kmem_cache *rnode_cache;
142
143/*
144 * Mutex to protect the following variables:
145 *	nfs_major
146 *	nfs_minor
147 */
148kmutex_t nfs_minor_lock;
149int nfs_major;
150int nfs_minor;
151
152/* Do we allow preepoch (negative) time values otw? */
153bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
154
155/*
156 * Access cache
157 */
158static acache_hash_t *acache;
159static long nacache;	/* used strictly to size the number of hash queues */
160
161static int acachesize;
162static int acachemask;
163static struct kmem_cache *acache_cache;
164
165/*
166 * Client side utilities
167 */
168
169/*
170 * client side statistics
171 */
172static const struct clstat clstat_tmpl = {
173	{ "calls",	KSTAT_DATA_UINT64 },
174	{ "badcalls",	KSTAT_DATA_UINT64 },
175	{ "clgets",	KSTAT_DATA_UINT64 },
176	{ "cltoomany",	KSTAT_DATA_UINT64 },
177#ifdef DEBUG
178	{ "clalloc",	KSTAT_DATA_UINT64 },
179	{ "noresponse",	KSTAT_DATA_UINT64 },
180	{ "failover",	KSTAT_DATA_UINT64 },
181	{ "remap",	KSTAT_DATA_UINT64 },
182#endif
183};
184
185/*
186 * The following are statistics that describe behavior of the system as a whole
187 * and doesn't correspond to any one particular zone.
188 */
189#ifdef DEBUG
190static struct clstat_debug {
191	kstat_named_t	nrnode;			/* number of allocated rnodes */
192	kstat_named_t	access;			/* size of access cache */
193	kstat_named_t	dirent;			/* size of readdir cache */
194	kstat_named_t	dirents;		/* size of readdir buf cache */
195	kstat_named_t	reclaim;		/* number of reclaims */
196	kstat_named_t	clreclaim;		/* number of cl reclaims */
197	kstat_named_t	f_reclaim;		/* number of free reclaims */
198	kstat_named_t	a_reclaim;		/* number of active reclaims */
199	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
200	kstat_named_t	rpath;			/* bytes used to store rpaths */
201} clstat_debug = {
202	{ "nrnode",	KSTAT_DATA_UINT64 },
203	{ "access",	KSTAT_DATA_UINT64 },
204	{ "dirent",	KSTAT_DATA_UINT64 },
205	{ "dirents",	KSTAT_DATA_UINT64 },
206	{ "reclaim",	KSTAT_DATA_UINT64 },
207	{ "clreclaim",	KSTAT_DATA_UINT64 },
208	{ "f_reclaim",	KSTAT_DATA_UINT64 },
209	{ "a_reclaim",	KSTAT_DATA_UINT64 },
210	{ "r_reclaim",	KSTAT_DATA_UINT64 },
211	{ "r_path",	KSTAT_DATA_UINT64 },
212};
213#endif	/* DEBUG */
214
215/*
216 * We keep a global list of per-zone client data, so we can clean up all zones
217 * if we get low on memory.
218 */
219static list_t nfs_clnt_list;
220static kmutex_t nfs_clnt_list_lock;
221static zone_key_t nfsclnt_zone_key;
222
223static struct kmem_cache *chtab_cache;
224
225/*
226 * Some servers do not properly update the attributes of the
227 * directory when changes are made.  To allow interoperability
228 * with these broken servers, the nfs_disable_rddir_cache
229 * parameter must be set in /etc/system
230 */
231int nfs_disable_rddir_cache = 0;
232
233int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234		    struct chtab **);
235void		clfree(CLIENT *, struct chtab *);
236static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
237		    struct chtab **, struct nfs_clnt *);
238static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
239		    struct chtab **, struct nfs_clnt *);
240static void	clreclaim(void *);
241static int	nfs_feedback(int, int, mntinfo_t *);
242static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
244		    failinfo_t *);
245static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
246		    caddr_t, cred_t *, int *, int, failinfo_t *);
247static void	rinactive(rnode_t *, cred_t *);
248static int	rtablehash(nfs_fhandle *);
249static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
250		    struct vnodeops *,
251		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
252			cred_t *),
253		    int (*)(const void *, const void *), int *, cred_t *,
254		    char *, char *);
255static void	rp_rmfree(rnode_t *);
256static void	rp_addhash(rnode_t *);
257static void	rp_rmhash_locked(rnode_t *);
258static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
259static void	destroy_rnode(rnode_t *);
260static void	rddir_cache_free(rddir_cache *);
261static int	nfs_free_data_reclaim(rnode_t *);
262static int	nfs_active_data_reclaim(rnode_t *);
263static int	nfs_free_reclaim(void);
264static int	nfs_active_reclaim(void);
265static int	nfs_rnode_reclaim(void);
266static void	nfs_reclaim(void *);
267static int	failover_safe(failinfo_t *);
268static void	failover_newserver(mntinfo_t *mi);
269static void	failover_thread(mntinfo_t *mi);
270static int	failover_wait(mntinfo_t *);
271static int	failover_remap(failinfo_t *);
272static int	failover_lookup(char *, vnode_t *,
273		    int (*)(vnode_t *, char *, vnode_t **,
274			struct pathname *, int, vnode_t *, cred_t *, int),
275		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
276		    vnode_t **);
277static void	nfs_free_r_path(rnode_t *);
278static void	nfs_set_vroot(vnode_t *);
279static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
280
281/*
282 * from rpcsec module (common/rpcsec)
283 */
284extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
285extern void sec_clnt_freeh(AUTH *);
286extern void sec_clnt_freeinfo(struct sec_data *);
287
288/*
289 * used in mount policy
290 */
291extern ts_label_t *getflabel_cipso(vfs_t *);
292
293/*
294 * EIO or EINTR are not recoverable errors.
295 */
296#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))
297
298#ifdef DEBUG
299#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
300#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
301#else
302#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
303#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
304#endif
305/*
306 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
307 */
308static int
309clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
310    struct chtab **chp, struct nfs_clnt *nfscl)
311{
312	struct chhead *ch, *newch;
313	struct chhead **plistp;
314	struct chtab *cp;
315	int error;
316	k_sigset_t smask;
317
318	if (newcl == NULL || chp == NULL || ci == NULL)
319		return (EINVAL);
320
321	*newcl = NULL;
322	*chp = NULL;
323
324	/*
325	 * Find an unused handle or create one
326	 */
327	newch = NULL;
328	nfscl->nfscl_stat.clgets.value.ui64++;
329top:
330	/*
331	 * Find the correct entry in the cache to check for free
332	 * client handles.  The search is based on the RPC program
333	 * number, program version number, dev_t for the transport
334	 * device, and the protocol family.
335	 */
336	mutex_enter(&nfscl->nfscl_chtable_lock);
337	plistp = &nfscl->nfscl_chtable;
338	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
339		if (ch->ch_prog == ci->cl_prog &&
340		    ch->ch_vers == ci->cl_vers &&
341		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
342		    (strcmp(ch->ch_protofmly,
343		    svp->sv_knconf->knc_protofmly) == 0))
344			break;
345		plistp = &ch->ch_next;
346	}
347
348	/*
349	 * If we didn't find a cache entry for this quadruple, then
350	 * create one.  If we don't have one already preallocated,
351	 * then drop the cache lock, create one, and then start over.
352	 * If we did have a preallocated entry, then just add it to
353	 * the front of the list.
354	 */
355	if (ch == NULL) {
356		if (newch == NULL) {
357			mutex_exit(&nfscl->nfscl_chtable_lock);
358			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
359			newch->ch_timesused = 0;
360			newch->ch_prog = ci->cl_prog;
361			newch->ch_vers = ci->cl_vers;
362			newch->ch_dev = svp->sv_knconf->knc_rdev;
363			newch->ch_protofmly = kmem_alloc(
364			    strlen(svp->sv_knconf->knc_protofmly) + 1,
365			    KM_SLEEP);
366			(void) strcpy(newch->ch_protofmly,
367			    svp->sv_knconf->knc_protofmly);
368			newch->ch_list = NULL;
369			goto top;
370		}
371		ch = newch;
372		newch = NULL;
373		ch->ch_next = nfscl->nfscl_chtable;
374		nfscl->nfscl_chtable = ch;
375	/*
376	 * We found a cache entry, but if it isn't on the front of the
377	 * list, then move it to the front of the list to try to take
378	 * advantage of locality of operations.
379	 */
380	} else if (ch != nfscl->nfscl_chtable) {
381		*plistp = ch->ch_next;
382		ch->ch_next = nfscl->nfscl_chtable;
383		nfscl->nfscl_chtable = ch;
384	}
385
386	/*
387	 * If there was a free client handle cached, then remove it
388	 * from the list, init it, and use it.
389	 */
390	if (ch->ch_list != NULL) {
391		cp = ch->ch_list;
392		ch->ch_list = cp->ch_list;
393		mutex_exit(&nfscl->nfscl_chtable_lock);
394		if (newch != NULL) {
395			kmem_free(newch->ch_protofmly,
396			    strlen(newch->ch_protofmly) + 1);
397			kmem_free(newch, sizeof (*newch));
398		}
399		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
400		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
401		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
402		    &cp->ch_client->cl_auth);
403		if (error || cp->ch_client->cl_auth == NULL) {
404			CLNT_DESTROY(cp->ch_client);
405			kmem_cache_free(chtab_cache, cp);
406			return ((error != 0) ? error : EINTR);
407		}
408		ch->ch_timesused++;
409		*newcl = cp->ch_client;
410		*chp = cp;
411		return (0);
412	}
413
414	/*
415	 * There weren't any free client handles which fit, so allocate
416	 * a new one and use that.
417	 */
418#ifdef DEBUG
419	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
420#endif
421	mutex_exit(&nfscl->nfscl_chtable_lock);
422
423	nfscl->nfscl_stat.cltoomany.value.ui64++;
424	if (newch != NULL) {
425		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
426		kmem_free(newch, sizeof (*newch));
427	}
428
429	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
430	cp->ch_head = ch;
431
432	sigintr(&smask, (int)ci->cl_flags & MI_INT);
433	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
434	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
435	sigunintr(&smask);
436
437	if (error != 0) {
438		kmem_cache_free(chtab_cache, cp);
439#ifdef DEBUG
440		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
441#endif
442		/*
443		 * Warning is unnecessary if error is EINTR.
444		 */
445		if (error != EINTR) {
446			nfs_cmn_err(error, CE_WARN,
447			    "clget: couldn't create handle: %m\n");
448		}
449		return (error);
450	}
451	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
452	auth_destroy(cp->ch_client->cl_auth);
453	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
454	    &cp->ch_client->cl_auth);
455	if (error || cp->ch_client->cl_auth == NULL) {
456		CLNT_DESTROY(cp->ch_client);
457		kmem_cache_free(chtab_cache, cp);
458#ifdef DEBUG
459		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
460#endif
461		return ((error != 0) ? error : EINTR);
462	}
463	ch->ch_timesused++;
464	*newcl = cp->ch_client;
465	ASSERT(cp->ch_client->cl_nosignal == FALSE);
466	*chp = cp;
467	return (0);
468}
469
470int
471clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
472    struct chtab **chp)
473{
474	struct nfs_clnt *nfscl;
475
476	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
477	ASSERT(nfscl != NULL);
478
479	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
480}
481
482static int
483acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
484    struct chtab **chp, struct nfs_clnt *nfscl)
485{
486	clinfo_t ci;
487	int error;
488
489	/*
490	 * Set read buffer size to rsize
491	 * and add room for RPC headers.
492	 */
493	ci.cl_readsize = mi->mi_tsize;
494	if (ci.cl_readsize != 0)
495		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
496
497	/*
498	 * If soft mount and server is down just try once.
499	 * meaning: do not retransmit.
500	 */
501	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
502		ci.cl_retrans = 0;
503	else
504		ci.cl_retrans = mi->mi_retrans;
505
506	ci.cl_prog = NFS_ACL_PROGRAM;
507	ci.cl_vers = mi->mi_vers;
508	ci.cl_flags = mi->mi_flags;
509
510	/*
511	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
512	 * security flavor, the client tries to establish a security context
513	 * by contacting the server. If the connection is timed out or reset,
514	 * e.g. server reboot, we will try again.
515	 */
516	do {
517		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
518
519		if (error == 0)
520			break;
521
522		/*
523		 * For forced unmount or zone shutdown, bail out, no retry.
524		 */
525		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
526			error = EIO;
527			break;
528		}
529
530		/* do not retry for softmount */
531		if (!(mi->mi_flags & MI_HARD))
532			break;
533
534		/* let the caller deal with the failover case */
535		if (FAILOVER_MOUNT(mi))
536			break;
537
538	} while (error == ETIMEDOUT || error == ECONNRESET);
539
540	return (error);
541}
542
543static int
544nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
545    struct chtab **chp, struct nfs_clnt *nfscl)
546{
547	clinfo_t ci;
548	int error;
549
550	/*
551	 * Set read buffer size to rsize
552	 * and add room for RPC headers.
553	 */
554	ci.cl_readsize = mi->mi_tsize;
555	if (ci.cl_readsize != 0)
556		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
557
558	/*
559	 * If soft mount and server is down just try once.
560	 * meaning: do not retransmit.
561	 */
562	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
563		ci.cl_retrans = 0;
564	else
565		ci.cl_retrans = mi->mi_retrans;
566
567	ci.cl_prog = mi->mi_prog;
568	ci.cl_vers = mi->mi_vers;
569	ci.cl_flags = mi->mi_flags;
570
571	/*
572	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
573	 * security flavor, the client tries to establish a security context
574	 * by contacting the server. If the connection is timed out or reset,
575	 * e.g. server reboot, we will try again.
576	 */
577	do {
578		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
579
580		if (error == 0)
581			break;
582
583		/*
584		 * For forced unmount or zone shutdown, bail out, no retry.
585		 */
586		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
587			error = EIO;
588			break;
589		}
590
591		/* do not retry for softmount */
592		if (!(mi->mi_flags & MI_HARD))
593			break;
594
595		/* let the caller deal with the failover case */
596		if (FAILOVER_MOUNT(mi))
597			break;
598
599	} while (error == ETIMEDOUT || error == ECONNRESET);
600
601	return (error);
602}
603
604static void
605clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
606{
607	if (cl->cl_auth != NULL) {
608		sec_clnt_freeh(cl->cl_auth);
609		cl->cl_auth = NULL;
610	}
611
612	/*
613	 * Timestamp this cache entry so that we know when it was last
614	 * used.
615	 */
616	cp->ch_freed = gethrestime_sec();
617
618	/*
619	 * Add the free client handle to the front of the list.
620	 * This way, the list will be sorted in youngest to oldest
621	 * order.
622	 */
623	mutex_enter(&nfscl->nfscl_chtable_lock);
624	cp->ch_list = cp->ch_head->ch_list;
625	cp->ch_head->ch_list = cp;
626	mutex_exit(&nfscl->nfscl_chtable_lock);
627}
628
629void
630clfree(CLIENT *cl, struct chtab *cp)
631{
632	struct nfs_clnt *nfscl;
633
634	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
635	ASSERT(nfscl != NULL);
636
637	clfree_impl(cl, cp, nfscl);
638}
639
640#define	CL_HOLDTIME	60	/* time to hold client handles */
641
642static void
643clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
644{
645	struct chhead *ch;
646	struct chtab *cp;	/* list of objects that can be reclaimed */
647	struct chtab *cpe;
648	struct chtab *cpl;
649	struct chtab **cpp;
650#ifdef DEBUG
651	int n = 0;
652#endif
653
654	/*
655	 * Need to reclaim some memory, so step through the cache
656	 * looking through the lists for entries which can be freed.
657	 */
658	cp = NULL;
659
660	mutex_enter(&nfscl->nfscl_chtable_lock);
661
662	/*
663	 * Here we step through each non-NULL quadruple and start to
664	 * construct the reclaim list pointed to by cp.  Note that
665	 * cp will contain all eligible chtab entries.  When this traversal
666	 * completes, chtab entries from the last quadruple will be at the
667	 * front of cp and entries from previously inspected quadruples have
668	 * been appended to the rear of cp.
669	 */
670	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
671		if (ch->ch_list == NULL)
672			continue;
673		/*
674		 * Search each list for entries older then
675		 * cl_holdtime seconds.  The lists are maintained
676		 * in youngest to oldest order so that when the
677		 * first entry is found which is old enough, then
678		 * all of the rest of the entries on the list will
679		 * be old enough as well.
680		 */
681		cpl = ch->ch_list;
682		cpp = &ch->ch_list;
683		while (cpl != NULL &&
684		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
685			cpp = &cpl->ch_list;
686			cpl = cpl->ch_list;
687		}
688		if (cpl != NULL) {
689			*cpp = NULL;
690			if (cp != NULL) {
691				cpe = cpl;
692				while (cpe->ch_list != NULL)
693					cpe = cpe->ch_list;
694				cpe->ch_list = cp;
695			}
696			cp = cpl;
697		}
698	}
699
700	mutex_exit(&nfscl->nfscl_chtable_lock);
701
702	/*
703	 * If cp is empty, then there is nothing to reclaim here.
704	 */
705	if (cp == NULL)
706		return;
707
708	/*
709	 * Step through the list of entries to free, destroying each client
710	 * handle and kmem_free'ing the memory for each entry.
711	 */
712	while (cp != NULL) {
713#ifdef DEBUG
714		n++;
715#endif
716		CLNT_DESTROY(cp->ch_client);
717		cpl = cp->ch_list;
718		kmem_cache_free(chtab_cache, cp);
719		cp = cpl;
720	}
721
722#ifdef DEBUG
723	/*
724	 * Update clalloc so that nfsstat shows the current number
725	 * of allocated client handles.
726	 */
727	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
728#endif
729}
730
731/* ARGSUSED */
732static void
733clreclaim(void *all)
734{
735	struct nfs_clnt *nfscl;
736
737#ifdef DEBUG
738	clstat_debug.clreclaim.value.ui64++;
739#endif
740	/*
741	 * The system is low on memory; go through and try to reclaim some from
742	 * every zone on the system.
743	 */
744	mutex_enter(&nfs_clnt_list_lock);
745	nfscl = list_head(&nfs_clnt_list);
746	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
747		clreclaim_zone(nfscl, CL_HOLDTIME);
748	mutex_exit(&nfs_clnt_list_lock);
749}
750
751/*
752 * Minimum time-out values indexed by call type
753 * These units are in "eights" of a second to avoid multiplies
754 */
755static unsigned int minimum_timeo[] = {
756	6, 7, 10
757};
758
759/*
760 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
761 */
762#define	MAXTIMO	(20*hz)
763#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
764#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
765
766#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
767#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
768#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
769
770/*
771 * Function called when rfscall notices that we have been
772 * re-transmitting, or when we get a response without retransmissions.
773 * Return 1 if the transfer size was adjusted down - 0 if no change.
774 */
775static int
776nfs_feedback(int flag, int which, mntinfo_t *mi)
777{
778	int kind;
779	int r = 0;
780
781	mutex_enter(&mi->mi_lock);
782	if (flag == FEEDBACK_REXMIT1) {
783		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
784		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
785			goto done;
786		if (mi->mi_curread > MIN_NFS_TSIZE) {
787			mi->mi_curread /= 2;
788			if (mi->mi_curread < MIN_NFS_TSIZE)
789				mi->mi_curread = MIN_NFS_TSIZE;
790			r = 1;
791		}
792
793		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
794			mi->mi_curwrite /= 2;
795			if (mi->mi_curwrite < MIN_NFS_TSIZE)
796				mi->mi_curwrite = MIN_NFS_TSIZE;
797			r = 1;
798		}
799	} else if (flag == FEEDBACK_OK) {
800		kind = mi->mi_timer_type[which];
801		if (kind == 0 ||
802		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
803			goto done;
804		if (kind == 1) {
805			if (mi->mi_curread >= mi->mi_tsize)
806				goto done;
807			mi->mi_curread +=  MIN_NFS_TSIZE;
808			if (mi->mi_curread > mi->mi_tsize/2)
809				mi->mi_curread = mi->mi_tsize;
810		} else if (kind == 2) {
811			if (mi->mi_curwrite >= mi->mi_stsize)
812				goto done;
813			mi->mi_curwrite += MIN_NFS_TSIZE;
814			if (mi->mi_curwrite > mi->mi_stsize/2)
815				mi->mi_curwrite = mi->mi_stsize;
816		}
817	}
818done:
819	mutex_exit(&mi->mi_lock);
820	return (r);
821}
822
823#ifdef DEBUG
824static int rfs2call_hits = 0;
825static int rfs2call_misses = 0;
826#endif
827
828int
829rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
830    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
831    enum nfsstat *statusp, int flags, failinfo_t *fi)
832{
833	int rpcerror;
834	enum clnt_stat rpc_status;
835
836	ASSERT(statusp != NULL);
837
838	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
839	    cr, douprintf, &rpc_status, flags, fi);
840	if (!rpcerror) {
841		/*
842		 * See crnetadjust() for comments.
843		 */
844		if (*statusp == NFSERR_ACCES &&
845		    (cr = crnetadjust(cr)) != NULL) {
846#ifdef DEBUG
847			rfs2call_hits++;
848#endif
849			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
850			    resp, cr, douprintf, NULL, flags, fi);
851			crfree(cr);
852#ifdef DEBUG
853			if (*statusp == NFSERR_ACCES)
854				rfs2call_misses++;
855#endif
856		}
857	} else if (rpc_status == RPC_PROCUNAVAIL) {
858		*statusp = NFSERR_OPNOTSUPP;
859		rpcerror = 0;
860	}
861
862	return (rpcerror);
863}
864
865#define	NFS3_JUKEBOX_DELAY	10 * hz
866
867static clock_t nfs3_jukebox_delay = 0;
868
869#ifdef DEBUG
870static int rfs3call_hits = 0;
871static int rfs3call_misses = 0;
872#endif
873
874int
875rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
876    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
877    nfsstat3 *statusp, int flags, failinfo_t *fi)
878{
879	int rpcerror;
880	int user_informed;
881
882	user_informed = 0;
883	do {
884		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
885		    cr, douprintf, NULL, flags, fi);
886		if (!rpcerror) {
887			cred_t *crr;
888			if (*statusp == NFS3ERR_JUKEBOX) {
889				if (ttoproc(curthread) == &p0) {
890					rpcerror = EAGAIN;
891					break;
892				}
893				if (!user_informed) {
894					user_informed = 1;
895					uprintf(
896		"file temporarily unavailable on the server, retrying...\n");
897				}
898				delay(nfs3_jukebox_delay);
899			}
900			/*
901			 * See crnetadjust() for comments.
902			 */
903			else if (*statusp == NFS3ERR_ACCES &&
904			    (crr = crnetadjust(cr)) != NULL) {
905#ifdef DEBUG
906				rfs3call_hits++;
907#endif
908				rpcerror = rfscall(mi, which, xdrargs, argsp,
909				    xdrres, resp, crr, douprintf,
910				    NULL, flags, fi);
911
912				crfree(crr);
913#ifdef DEBUG
914				if (*statusp == NFS3ERR_ACCES)
915					rfs3call_misses++;
916#endif
917			}
918		}
919	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
920
921	return (rpcerror);
922}
923
924#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
925#define	INC_READERS(mi)		{ \
926	mi->mi_readers++; \
927}
928#define	DEC_READERS(mi)		{ \
929	mi->mi_readers--; \
930	if (mi->mi_readers == 0) \
931		cv_broadcast(&mi->mi_failover_cv); \
932}
933
934static int
935rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
936    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
937    enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
938{
939	CLIENT *client;
940	struct chtab *ch;
941	cred_t *cr = icr;
942	enum clnt_stat status;
943	struct rpc_err rpcerr, rpcerr_tmp;
944	struct timeval wait;
945	int timeo;		/* in units of hz */
946	int my_rsize, my_wsize;
947	bool_t tryagain;
948	bool_t cred_cloned = FALSE;
949	k_sigset_t smask;
950	servinfo_t *svp;
951	struct nfs_clnt *nfscl;
952	zoneid_t zoneid = getzoneid();
953	char *msg;
954#ifdef DEBUG
955	char *bufp;
956#endif
957
958
959	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
960	    "rfscall_start:which %d mi %p", which, mi);
961
962	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
963	ASSERT(nfscl != NULL);
964
965	nfscl->nfscl_stat.calls.value.ui64++;
966	mi->mi_reqs[which].value.ui64++;
967
968	rpcerr.re_status = RPC_SUCCESS;
969
970	/*
971	 * In case of forced unmount or zone shutdown, return EIO.
972	 */
973
974	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
975		rpcerr.re_status = RPC_FAILED;
976		rpcerr.re_errno = EIO;
977		return (rpcerr.re_errno);
978	}
979
980	/*
981	 * Remember the transfer sizes in case
982	 * nfs_feedback changes them underneath us.
983	 */
984	my_rsize = mi->mi_curread;
985	my_wsize = mi->mi_curwrite;
986
987	/*
988	 * NFS client failover support
989	 *
990	 * If this rnode is not in sync with the current server (VALID_FH),
991	 * we'd like to do a remap to get in sync.  We can be interrupted
992	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
993	 * use the best info we have to try the RPC.  Part of that is
994	 * unconditionally updating the filehandle copy kept for V3.
995	 *
996	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
997	 * rw_enter(); we're trying to keep the current server from being
998	 * changed on us until we're done with the remapping and have a
999	 * matching client handle.  We don't want to sending a filehandle
1000	 * to the wrong host.
1001	 */
1002failoverretry:
1003	if (FAILOVER_MOUNT(mi)) {
1004		mutex_enter(&mi->mi_lock);
1005		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1006			if (failover_wait(mi)) {
1007				mutex_exit(&mi->mi_lock);
1008				return (EINTR);
1009			}
1010		}
1011		INC_READERS(mi);
1012		mutex_exit(&mi->mi_lock);
1013		if (fi) {
1014			if (!VALID_FH(fi) &&
1015			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1016				int remaperr;
1017
1018				svp = mi->mi_curr_serv;
1019				remaperr = failover_remap(fi);
1020				if (remaperr != 0) {
1021#ifdef DEBUG
1022					if (remaperr != EINTR)
1023						nfs_cmn_err(remaperr, CE_WARN,
1024					    "rfscall couldn't failover: %m");
1025#endif
1026					mutex_enter(&mi->mi_lock);
1027					DEC_READERS(mi);
1028					mutex_exit(&mi->mi_lock);
1029					/*
1030					 * If failover_remap returns ETIMEDOUT
1031					 * and the filesystem is hard mounted
1032					 * we have to retry the call with a new
1033					 * server.
1034					 */
1035					if ((mi->mi_flags & MI_HARD) &&
1036					    IS_RECOVERABLE_ERROR(remaperr)) {
1037						if (svp == mi->mi_curr_serv)
1038							failover_newserver(mi);
1039						rpcerr.re_status = RPC_SUCCESS;
1040						goto failoverretry;
1041					}
1042					rpcerr.re_errno = remaperr;
1043					return (remaperr);
1044				}
1045			}
1046			if (fi->fhp && fi->copyproc)
1047				(*fi->copyproc)(fi->fhp, fi->vp);
1048		}
1049	}
1050
1051	/* For TSOL, use a new cred which has net_mac_aware flag */
1052	if (!cred_cloned && is_system_labeled()) {
1053		cred_cloned = TRUE;
1054		cr = crdup(icr);
1055		(void) setpflags(NET_MAC_AWARE, 1, cr);
1056	}
1057
1058	/*
1059	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1060	 * are guaranteed to reprocess the retry as a new request.
1061	 */
1062	svp = mi->mi_curr_serv;
1063	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1064
1065	if (FAILOVER_MOUNT(mi)) {
1066		mutex_enter(&mi->mi_lock);
1067		DEC_READERS(mi);
1068		mutex_exit(&mi->mi_lock);
1069
1070		if ((rpcerr.re_errno == ETIMEDOUT ||
1071		    rpcerr.re_errno == ECONNRESET) &&
1072		    failover_safe(fi)) {
1073			if (svp == mi->mi_curr_serv)
1074				failover_newserver(mi);
1075			goto failoverretry;
1076		}
1077	}
1078	if (rpcerr.re_errno != 0)
1079		return (rpcerr.re_errno);
1080
1081	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1082	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1083		timeo = (mi->mi_timeo * hz) / 10;
1084	} else {
1085		mutex_enter(&mi->mi_lock);
1086		timeo = CLNT_SETTIMERS(client,
1087		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1088		    &(mi->mi_timers[NFS_CALLTYPES]),
1089		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1090		    (void (*)())NULL, (caddr_t)mi, 0);
1091		mutex_exit(&mi->mi_lock);
1092	}
1093
1094	/*
1095	 * If hard mounted fs, retry call forever unless hard error occurs.
1096	 */
1097	do {
1098		tryagain = FALSE;
1099
1100		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1101			status = RPC_FAILED;
1102			rpcerr.re_status = RPC_FAILED;
1103			rpcerr.re_errno = EIO;
1104			break;
1105		}
1106
1107		TICK_TO_TIMEVAL(timeo, &wait);
1108
1109		/*
1110		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1111		 * and SIGTERM. (Preserving the existing masks).
1112		 * Mask out SIGINT if mount option nointr is specified.
1113		 */
1114		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1115		if (!(mi->mi_flags & MI_INT))
1116			client->cl_nosignal = TRUE;
1117
1118		/*
1119		 * If there is a current signal, then don't bother
1120		 * even trying to send out the request because we
1121		 * won't be able to block waiting for the response.
1122		 * Simply assume RPC_INTR and get on with it.
1123		 */
1124		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1125			status = RPC_INTR;
1126		else {
1127			status = CLNT_CALL(client, which, xdrargs, argsp,
1128			    xdrres, resp, wait);
1129		}
1130
1131		if (!(mi->mi_flags & MI_INT))
1132			client->cl_nosignal = FALSE;
1133		/*
1134		 * restore original signal mask
1135		 */
1136		sigunintr(&smask);
1137
1138		switch (status) {
1139		case RPC_SUCCESS:
1140			if ((mi->mi_flags & MI_DYNAMIC) &&
1141			    mi->mi_timer_type[which] != 0 &&
1142			    (mi->mi_curread != my_rsize ||
1143			    mi->mi_curwrite != my_wsize))
1144				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1145			break;
1146
1147		case RPC_INTR:
1148			/*
1149			 * There is no way to recover from this error,
1150			 * even if mount option nointr is specified.
1151			 * SIGKILL, for example, cannot be blocked.
1152			 */
1153			rpcerr.re_status = RPC_INTR;
1154			rpcerr.re_errno = EINTR;
1155			break;
1156
1157		case RPC_UDERROR:
1158			/*
1159			 * If the NFS server is local (vold) and
1160			 * it goes away then we get RPC_UDERROR.
1161			 * This is a retryable error, so we would
1162			 * loop, so check to see if the specific
1163			 * error was ECONNRESET, indicating that
1164			 * target did not exist at all.  If so,
1165			 * return with RPC_PROGUNAVAIL and
1166			 * ECONNRESET to indicate why.
1167			 */
1168			CLNT_GETERR(client, &rpcerr);
1169			if (rpcerr.re_errno == ECONNRESET) {
1170				rpcerr.re_status = RPC_PROGUNAVAIL;
1171				rpcerr.re_errno = ECONNRESET;
1172				break;
1173			}
1174			/*FALLTHROUGH*/
1175
1176		default:		/* probably RPC_TIMEDOUT */
1177			if (IS_UNRECOVERABLE_RPC(status))
1178				break;
1179
1180			/*
1181			 * increment server not responding count
1182			 */
1183			mutex_enter(&mi->mi_lock);
1184			mi->mi_noresponse++;
1185			mutex_exit(&mi->mi_lock);
1186#ifdef DEBUG
1187			nfscl->nfscl_stat.noresponse.value.ui64++;
1188#endif
1189
1190			if (!(mi->mi_flags & MI_HARD)) {
1191				if (!(mi->mi_flags & MI_SEMISOFT) ||
1192				    (mi->mi_ss_call_type[which] == 0))
1193					break;
1194			}
1195
1196			/*
1197			 * The call is in progress (over COTS).
1198			 * Try the CLNT_CALL again, but don't
1199			 * print a noisy error message.
1200			 */
1201			if (status == RPC_INPROGRESS) {
1202				tryagain = TRUE;
1203				break;
1204			}
1205
1206			if (flags & RFSCALL_SOFT)
1207				break;
1208
1209			/*
1210			 * On zone shutdown, just move on.
1211			 */
1212			if (zone_status_get(curproc->p_zone) >=
1213			    ZONE_IS_SHUTTING_DOWN) {
1214				rpcerr.re_status = RPC_FAILED;
1215				rpcerr.re_errno = EIO;
1216				break;
1217			}
1218
1219			/*
1220			 * NFS client failover support
1221			 *
1222			 * If the current server just failed us, we'll
1223			 * start the process of finding a new server.
1224			 * After that, we can just retry.
1225			 */
1226			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1227				if (svp == mi->mi_curr_serv)
1228					failover_newserver(mi);
1229				clfree_impl(client, ch, nfscl);
1230				goto failoverretry;
1231			}
1232
1233			tryagain = TRUE;
1234			timeo = backoff(timeo);
1235
1236			CLNT_GETERR(client, &rpcerr_tmp);
1237			if ((status == RPC_CANTSEND) &&
1238			    (rpcerr_tmp.re_errno == ENOBUFS))
1239				msg = SRV_QFULL_MSG;
1240			else
1241				msg = SRV_NOTRESP_MSG;
1242
1243			mutex_enter(&mi->mi_lock);
1244			if (!(mi->mi_flags & MI_PRINTED)) {
1245				mi->mi_flags |= MI_PRINTED;
1246				mutex_exit(&mi->mi_lock);
1247#ifdef DEBUG
1248				zprintf(zoneid, msg, mi->mi_vers,
1249				    svp->sv_hostname);
1250#else
1251				zprintf(zoneid, msg, svp->sv_hostname);
1252#endif
1253			} else
1254				mutex_exit(&mi->mi_lock);
1255			if (*douprintf && nfs_has_ctty()) {
1256				*douprintf = 0;
1257				if (!(mi->mi_flags & MI_NOPRINT))
1258#ifdef DEBUG
1259					uprintf(msg, mi->mi_vers,
1260					    svp->sv_hostname);
1261#else
1262					uprintf(msg, svp->sv_hostname);
1263#endif
1264			}
1265
1266			/*
1267			 * If doing dynamic adjustment of transfer
1268			 * size and if it's a read or write call
1269			 * and if the transfer size changed while
1270			 * retransmitting or if the feedback routine
1271			 * changed the transfer size,
1272			 * then exit rfscall so that the transfer
1273			 * size can be adjusted at the vnops level.
1274			 */
1275			if ((mi->mi_flags & MI_DYNAMIC) &&
1276			    mi->mi_timer_type[which] != 0 &&
1277			    (mi->mi_curread != my_rsize ||
1278			    mi->mi_curwrite != my_wsize ||
1279			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1280				/*
1281				 * On read or write calls, return
1282				 * back to the vnode ops level if
1283				 * the transfer size changed.
1284				 */
1285				clfree_impl(client, ch, nfscl);
1286				if (cred_cloned)
1287					crfree(cr);
1288				return (ENFS_TRYAGAIN);
1289			}
1290		}
1291	} while (tryagain);
1292
1293	if (status != RPC_SUCCESS) {
1294		/*
1295		 * Let soft mounts use the timed out message.
1296		 */
1297		if (status == RPC_INPROGRESS)
1298			status = RPC_TIMEDOUT;
1299		nfscl->nfscl_stat.badcalls.value.ui64++;
1300		if (status != RPC_INTR) {
1301			mutex_enter(&mi->mi_lock);
1302			mi->mi_flags |= MI_DOWN;
1303			mutex_exit(&mi->mi_lock);
1304			CLNT_GETERR(client, &rpcerr);
1305#ifdef DEBUG
1306			bufp = clnt_sperror(client, svp->sv_hostname);
1307			zprintf(zoneid, "NFS%d %s failed for %s\n",
1308			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1309			if (nfs_has_ctty()) {
1310				if (!(mi->mi_flags & MI_NOPRINT)) {
1311					uprintf("NFS%d %s failed for %s\n",
1312					    mi->mi_vers, mi->mi_rfsnames[which],
1313					    bufp);
1314				}
1315			}
1316			kmem_free(bufp, MAXPATHLEN);
1317#else
1318			zprintf(zoneid,
1319			    "NFS %s failed for server %s: error %d (%s)\n",
1320			    mi->mi_rfsnames[which], svp->sv_hostname,
1321			    status, clnt_sperrno(status));
1322			if (nfs_has_ctty()) {
1323				if (!(mi->mi_flags & MI_NOPRINT)) {
1324					uprintf(
1325				"NFS %s failed for server %s: error %d (%s)\n",
1326					    mi->mi_rfsnames[which],
1327					    svp->sv_hostname, status,
1328					    clnt_sperrno(status));
1329				}
1330			}
1331#endif
1332			/*
1333			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1334			 * re_errno is set appropriately depending on
1335			 * the authentication error
1336			 */
1337			if (status == RPC_VERSMISMATCH ||
1338			    status == RPC_PROGVERSMISMATCH)
1339				rpcerr.re_errno = EIO;
1340		}
1341	} else {
1342		/*
1343		 * Test the value of mi_down and mi_printed without
1344		 * holding the mi_lock mutex.  If they are both zero,
1345		 * then it is okay to skip the down and printed
1346		 * processing.  This saves on a mutex_enter and
1347		 * mutex_exit pair for a normal, successful RPC.
1348		 * This was just complete overhead.
1349		 */
1350		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1351			mutex_enter(&mi->mi_lock);
1352			mi->mi_flags &= ~MI_DOWN;
1353			if (mi->mi_flags & MI_PRINTED) {
1354				mi->mi_flags &= ~MI_PRINTED;
1355				mutex_exit(&mi->mi_lock);
1356#ifdef DEBUG
1357			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358				zprintf(zoneid, "NFS%d server %s ok\n",
1359				    mi->mi_vers, svp->sv_hostname);
1360#else
1361			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362				zprintf(zoneid, "NFS server %s ok\n",
1363				    svp->sv_hostname);
1364#endif
1365			} else
1366				mutex_exit(&mi->mi_lock);
1367		}
1368
1369		if (*douprintf == 0) {
1370			if (!(mi->mi_flags & MI_NOPRINT))
1371#ifdef DEBUG
1372				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1373					uprintf("NFS%d server %s ok\n",
1374					    mi->mi_vers, svp->sv_hostname);
1375#else
1376			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377				uprintf("NFS server %s ok\n", svp->sv_hostname);
1378#endif
1379			*douprintf = 1;
1380		}
1381	}
1382
1383	clfree_impl(client, ch, nfscl);
1384	if (cred_cloned)
1385		crfree(cr);
1386
1387	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1388
1389	if (rpc_status != NULL)
1390		*rpc_status = rpcerr.re_status;
1391
1392	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1393	    rpcerr.re_errno);
1394
1395	return (rpcerr.re_errno);
1396}
1397
1398#ifdef DEBUG
1399static int acl2call_hits = 0;
1400static int acl2call_misses = 0;
1401#endif
1402
1403int
1404acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1405    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1406    enum nfsstat *statusp, int flags, failinfo_t *fi)
1407{
1408	int rpcerror;
1409
1410	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1411	    cr, douprintf, flags, fi);
1412	if (!rpcerror) {
1413		/*
1414		 * See comments with crnetadjust().
1415		 */
1416		if (*statusp == NFSERR_ACCES &&
1417		    (cr = crnetadjust(cr)) != NULL) {
1418#ifdef DEBUG
1419			acl2call_hits++;
1420#endif
1421			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1422			    resp, cr, douprintf, flags, fi);
1423			crfree(cr);
1424#ifdef DEBUG
1425			if (*statusp == NFSERR_ACCES)
1426				acl2call_misses++;
1427#endif
1428		}
1429	}
1430
1431	return (rpcerror);
1432}
1433
1434#ifdef DEBUG
1435static int acl3call_hits = 0;
1436static int acl3call_misses = 0;
1437#endif
1438
1439int
1440acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1441    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1442    nfsstat3 *statusp, int flags, failinfo_t *fi)
1443{
1444	int rpcerror;
1445	int user_informed;
1446
1447	user_informed = 0;
1448
1449	do {
1450		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1451		    cr, douprintf, flags, fi);
1452		if (!rpcerror) {
1453			cred_t *crr;
1454			if (*statusp == NFS3ERR_JUKEBOX) {
1455				if (!user_informed) {
1456					user_informed = 1;
1457					uprintf(
1458		"file temporarily unavailable on the server, retrying...\n");
1459				}
1460				delay(nfs3_jukebox_delay);
1461			}
1462			/*
1463			 * See crnetadjust() for comments.
1464			 */
1465			else if (*statusp == NFS3ERR_ACCES &&
1466			    (crr = crnetadjust(cr)) != NULL) {
1467#ifdef DEBUG
1468				acl3call_hits++;
1469#endif
1470				rpcerror = aclcall(mi, which, xdrargs, argsp,
1471				    xdrres, resp, crr, douprintf, flags, fi);
1472
1473				crfree(crr);
1474#ifdef DEBUG
1475				if (*statusp == NFS3ERR_ACCES)
1476					acl3call_misses++;
1477#endif
1478			}
1479		}
1480	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1481
1482	return (rpcerror);
1483}
1484
1485static int
1486aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1487    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1488    int flags, failinfo_t *fi)
1489{
1490	CLIENT *client;
1491	struct chtab *ch;
1492	cred_t *cr = icr;
1493	bool_t cred_cloned = FALSE;
1494	enum clnt_stat status;
1495	struct rpc_err rpcerr;
1496	struct timeval wait;
1497	int timeo;		/* in units of hz */
1498#if 0 /* notyet */
1499	int my_rsize, my_wsize;
1500#endif
1501	bool_t tryagain;
1502	k_sigset_t smask;
1503	servinfo_t *svp;
1504	struct nfs_clnt *nfscl;
1505	zoneid_t zoneid = getzoneid();
1506#ifdef DEBUG
1507	char *bufp;
1508#endif
1509
1510#if 0 /* notyet */
1511	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1512	    "rfscall_start:which %d mi %p", which, mi);
1513#endif
1514
1515	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1516	ASSERT(nfscl != NULL);
1517
1518	nfscl->nfscl_stat.calls.value.ui64++;
1519	mi->mi_aclreqs[which].value.ui64++;
1520
1521	rpcerr.re_status = RPC_SUCCESS;
1522
1523	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1524		rpcerr.re_status = RPC_FAILED;
1525		rpcerr.re_errno = EIO;
1526		return (rpcerr.re_errno);
1527	}
1528
1529#if 0 /* notyet */
1530	/*
1531	 * Remember the transfer sizes in case
1532	 * nfs_feedback changes them underneath us.
1533	 */
1534	my_rsize = mi->mi_curread;
1535	my_wsize = mi->mi_curwrite;
1536#endif
1537
1538	/*
1539	 * NFS client failover support
1540	 *
1541	 * If this rnode is not in sync with the current server (VALID_FH),
1542	 * we'd like to do a remap to get in sync.  We can be interrupted
1543	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1544	 * use the best info we have to try the RPC.  Part of that is
1545	 * unconditionally updating the filehandle copy kept for V3.
1546	 *
1547	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1548	 * rw_enter(); we're trying to keep the current server from being
1549	 * changed on us until we're done with the remapping and have a
1550	 * matching client handle.  We don't want to sending a filehandle
1551	 * to the wrong host.
1552	 */
1553failoverretry:
1554	if (FAILOVER_MOUNT(mi)) {
1555		mutex_enter(&mi->mi_lock);
1556		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1557			if (failover_wait(mi)) {
1558				mutex_exit(&mi->mi_lock);
1559				return (EINTR);
1560			}
1561		}
1562		INC_READERS(mi);
1563		mutex_exit(&mi->mi_lock);
1564		if (fi) {
1565			if (!VALID_FH(fi) &&
1566			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1567				int remaperr;
1568
1569				svp = mi->mi_curr_serv;
1570				remaperr = failover_remap(fi);
1571				if (remaperr != 0) {
1572#ifdef DEBUG
1573					if (remaperr != EINTR)
1574						nfs_cmn_err(remaperr, CE_WARN,
1575					    "aclcall couldn't failover: %m");
1576#endif
1577					mutex_enter(&mi->mi_lock);
1578					DEC_READERS(mi);
1579					mutex_exit(&mi->mi_lock);
1580
1581					/*
1582					 * If failover_remap returns ETIMEDOUT
1583					 * and the filesystem is hard mounted
1584					 * we have to retry the call with a new
1585					 * server.
1586					 */
1587					if ((mi->mi_flags & MI_HARD) &&
1588					    IS_RECOVERABLE_ERROR(remaperr)) {
1589						if (svp == mi->mi_curr_serv)
1590							failover_newserver(mi);
1591						rpcerr.re_status = RPC_SUCCESS;
1592						goto failoverretry;
1593					}
1594					return (remaperr);
1595				}
1596			}
1597			if (fi->fhp && fi->copyproc)
1598				(*fi->copyproc)(fi->fhp, fi->vp);
1599		}
1600	}
1601
1602	/* For TSOL, use a new cred which has net_mac_aware flag */
1603	if (!cred_cloned && is_system_labeled()) {
1604		cred_cloned = TRUE;
1605		cr = crdup(icr);
1606		(void) setpflags(NET_MAC_AWARE, 1, cr);
1607	}
1608
1609	/*
1610	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1611	 * are guaranteed to reprocess the retry as a new request.
1612	 */
1613	svp = mi->mi_curr_serv;
1614	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1615	if (FAILOVER_MOUNT(mi)) {
1616		mutex_enter(&mi->mi_lock);
1617		DEC_READERS(mi);
1618		mutex_exit(&mi->mi_lock);
1619
1620		if ((rpcerr.re_errno == ETIMEDOUT ||
1621		    rpcerr.re_errno == ECONNRESET) &&
1622		    failover_safe(fi)) {
1623			if (svp == mi->mi_curr_serv)
1624				failover_newserver(mi);
1625			goto failoverretry;
1626		}
1627	}
1628	if (rpcerr.re_errno != 0) {
1629		if (cred_cloned)
1630			crfree(cr);
1631		return (rpcerr.re_errno);
1632	}
1633
1634	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1635	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1636		timeo = (mi->mi_timeo * hz) / 10;
1637	} else {
1638		mutex_enter(&mi->mi_lock);
1639		timeo = CLNT_SETTIMERS(client,
1640		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1641		    &(mi->mi_timers[NFS_CALLTYPES]),
1642		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1643		    (void (*)()) 0, (caddr_t)mi, 0);
1644		mutex_exit(&mi->mi_lock);
1645	}
1646
1647	/*
1648	 * If hard mounted fs, retry call forever unless hard error occurs.
1649	 */
1650	do {
1651		tryagain = FALSE;
1652
1653		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1654			status = RPC_FAILED;
1655			rpcerr.re_status = RPC_FAILED;
1656			rpcerr.re_errno = EIO;
1657			break;
1658		}
1659
1660		TICK_TO_TIMEVAL(timeo, &wait);
1661
1662		/*
1663		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1664		 * and SIGTERM. (Preserving the existing masks).
1665		 * Mask out SIGINT if mount option nointr is specified.
1666		 */
1667		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1668		if (!(mi->mi_flags & MI_INT))
1669			client->cl_nosignal = TRUE;
1670
1671		/*
1672		 * If there is a current signal, then don't bother
1673		 * even trying to send out the request because we
1674		 * won't be able to block waiting for the response.
1675		 * Simply assume RPC_INTR and get on with it.
1676		 */
1677		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1678			status = RPC_INTR;
1679		else {
1680			status = CLNT_CALL(client, which, xdrargs, argsp,
1681			    xdrres, resp, wait);
1682		}
1683
1684		if (!(mi->mi_flags & MI_INT))
1685			client->cl_nosignal = FALSE;
1686		/*
1687		 * restore original signal mask
1688		 */
1689		sigunintr(&smask);
1690
1691		switch (status) {
1692		case RPC_SUCCESS:
1693#if 0 /* notyet */
1694			if ((mi->mi_flags & MI_DYNAMIC) &&
1695			    mi->mi_timer_type[which] != 0 &&
1696			    (mi->mi_curread != my_rsize ||
1697			    mi->mi_curwrite != my_wsize))
1698				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1699#endif
1700			break;
1701
1702		/*
1703		 * Unfortunately, there are servers in the world which
1704		 * are not coded correctly.  They are not prepared to
1705		 * handle RPC requests to the NFS port which are not
1706		 * NFS requests.  Thus, they may try to process the
1707		 * NFS_ACL request as if it were an NFS request.  This
1708		 * does not work.  Generally, an error will be generated
1709		 * on the client because it will not be able to decode
1710		 * the response from the server.  However, it seems
1711		 * possible that the server may not be able to decode
1712		 * the arguments.  Thus, the criteria for deciding
1713		 * whether the server supports NFS_ACL or not is whether
1714		 * the following RPC errors are returned from CLNT_CALL.
1715		 */
1716		case RPC_CANTDECODERES:
1717		case RPC_PROGUNAVAIL:
1718		case RPC_CANTDECODEARGS:
1719		case RPC_PROGVERSMISMATCH:
1720			mutex_enter(&mi->mi_lock);
1721			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1722			mutex_exit(&mi->mi_lock);
1723			break;
1724
1725		/*
1726		 * If the server supports NFS_ACL but not the new ops
1727		 * for extended attributes, make sure we don't retry.
1728		 */
1729		case RPC_PROCUNAVAIL:
1730			mutex_enter(&mi->mi_lock);
1731			mi->mi_flags &= ~MI_EXTATTR;
1732			mutex_exit(&mi->mi_lock);
1733			break;
1734
1735		case RPC_INTR:
1736			/*
1737			 * There is no way to recover from this error,
1738			 * even if mount option nointr is specified.
1739			 * SIGKILL, for example, cannot be blocked.
1740			 */
1741			rpcerr.re_status = RPC_INTR;
1742			rpcerr.re_errno = EINTR;
1743			break;
1744
1745		case RPC_UDERROR:
1746			/*
1747			 * If the NFS server is local (vold) and
1748			 * it goes away then we get RPC_UDERROR.
1749			 * This is a retryable error, so we would
1750			 * loop, so check to see if the specific
1751			 * error was ECONNRESET, indicating that
1752			 * target did not exist at all.  If so,
1753			 * return with RPC_PROGUNAVAIL and
1754			 * ECONNRESET to indicate why.
1755			 */
1756			CLNT_GETERR(client, &rpcerr);
1757			if (rpcerr.re_errno == ECONNRESET) {
1758				rpcerr.re_status = RPC_PROGUNAVAIL;
1759				rpcerr.re_errno = ECONNRESET;
1760				break;
1761			}
1762			/*FALLTHROUGH*/
1763
1764		default:		/* probably RPC_TIMEDOUT */
1765			if (IS_UNRECOVERABLE_RPC(status))
1766				break;
1767
1768			/*
1769			 * increment server not responding count
1770			 */
1771			mutex_enter(&mi->mi_lock);
1772			mi->mi_noresponse++;
1773			mutex_exit(&mi->mi_lock);
1774#ifdef DEBUG
1775			nfscl->nfscl_stat.noresponse.value.ui64++;
1776#endif
1777
1778			if (!(mi->mi_flags & MI_HARD)) {
1779				if (!(mi->mi_flags & MI_SEMISOFT) ||
1780				    (mi->mi_acl_ss_call_type[which] == 0))
1781					break;
1782			}
1783
1784			/*
1785			 * The call is in progress (over COTS).
1786			 * Try the CLNT_CALL again, but don't
1787			 * print a noisy error message.
1788			 */
1789			if (status == RPC_INPROGRESS) {
1790				tryagain = TRUE;
1791				break;
1792			}
1793
1794			if (flags & RFSCALL_SOFT)
1795				break;
1796
1797			/*
1798			 * On zone shutdown, just move on.
1799			 */
1800			if (zone_status_get(curproc->p_zone) >=
1801			    ZONE_IS_SHUTTING_DOWN) {
1802				rpcerr.re_status = RPC_FAILED;
1803				rpcerr.re_errno = EIO;
1804				break;
1805			}
1806
1807			/*
1808			 * NFS client failover support
1809			 *
1810			 * If the current server just failed us, we'll
1811			 * start the process of finding a new server.
1812			 * After that, we can just retry.
1813			 */
1814			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1815				if (svp == mi->mi_curr_serv)
1816					failover_newserver(mi);
1817				clfree_impl(client, ch, nfscl);
1818				goto failoverretry;
1819			}
1820
1821			tryagain = TRUE;
1822			timeo = backoff(timeo);
1823			mutex_enter(&mi->mi_lock);
1824			if (!(mi->mi_flags & MI_PRINTED)) {
1825				mi->mi_flags |= MI_PRINTED;
1826				mutex_exit(&mi->mi_lock);
1827#ifdef DEBUG
1828				zprintf(zoneid,
1829			"NFS_ACL%d server %s not responding still trying\n",
1830				    mi->mi_vers, svp->sv_hostname);
1831#else
1832				zprintf(zoneid,
1833			    "NFS server %s not responding still trying\n",
1834				    svp->sv_hostname);
1835#endif
1836			} else
1837				mutex_exit(&mi->mi_lock);
1838			if (*douprintf && nfs_has_ctty()) {
1839				*douprintf = 0;
1840				if (!(mi->mi_flags & MI_NOPRINT))
1841#ifdef DEBUG
1842					uprintf(
1843			"NFS_ACL%d server %s not responding still trying\n",
1844					    mi->mi_vers, svp->sv_hostname);
1845#else
1846					uprintf(
1847			    "NFS server %s not responding still trying\n",
1848					    svp->sv_hostname);
1849#endif
1850			}
1851
1852#if 0 /* notyet */
1853			/*
1854			 * If doing dynamic adjustment of transfer
1855			 * size and if it's a read or write call
1856			 * and if the transfer size changed while
1857			 * retransmitting or if the feedback routine
1858			 * changed the transfer size,
1859			 * then exit rfscall so that the transfer
1860			 * size can be adjusted at the vnops level.
1861			 */
1862			if ((mi->mi_flags & MI_DYNAMIC) &&
1863			    mi->mi_acl_timer_type[which] != 0 &&
1864			    (mi->mi_curread != my_rsize ||
1865			    mi->mi_curwrite != my_wsize ||
1866			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1867				/*
1868				 * On read or write calls, return
1869				 * back to the vnode ops level if
1870				 * the transfer size changed.
1871				 */
1872				clfree_impl(client, ch, nfscl);
1873				if (cred_cloned)
1874					crfree(cr);
1875				return (ENFS_TRYAGAIN);
1876			}
1877#endif
1878		}
1879	} while (tryagain);
1880
1881	if (status != RPC_SUCCESS) {
1882		/*
1883		 * Let soft mounts use the timed out message.
1884		 */
1885		if (status == RPC_INPROGRESS)
1886			status = RPC_TIMEDOUT;
1887		nfscl->nfscl_stat.badcalls.value.ui64++;
1888		if (status == RPC_CANTDECODERES ||
1889		    status == RPC_PROGUNAVAIL ||
1890		    status == RPC_PROCUNAVAIL ||
1891		    status == RPC_CANTDECODEARGS ||
1892		    status == RPC_PROGVERSMISMATCH)
1893			CLNT_GETERR(client, &rpcerr);
1894		else if (status != RPC_INTR) {
1895			mutex_enter(&mi->mi_lock);
1896			mi->mi_flags |= MI_DOWN;
1897			mutex_exit(&mi->mi_lock);
1898			CLNT_GETERR(client, &rpcerr);
1899#ifdef DEBUG
1900			bufp = clnt_sperror(client, svp->sv_hostname);
1901			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1902			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1903			if (nfs_has_ctty()) {
1904				if (!(mi->mi_flags & MI_NOPRINT)) {
1905					uprintf("NFS_ACL%d %s failed for %s\n",
1906					    mi->mi_vers, mi->mi_aclnames[which],
1907					    bufp);
1908				}
1909			}
1910			kmem_free(bufp, MAXPATHLEN);
1911#else
1912			zprintf(zoneid,
1913			    "NFS %s failed for server %s: error %d (%s)\n",
1914			    mi->mi_aclnames[which], svp->sv_hostname,
1915			    status, clnt_sperrno(status));
1916			if (nfs_has_ctty()) {
1917				if (!(mi->mi_flags & MI_NOPRINT))
1918					uprintf(
1919				"NFS %s failed for server %s: error %d (%s)\n",
1920					    mi->mi_aclnames[which],
1921					    svp->sv_hostname, status,
1922					    clnt_sperrno(status));
1923			}
1924#endif
1925			/*
			 * When CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error.
1929			 */
1930			if (status == RPC_VERSMISMATCH ||
1931			    status == RPC_PROGVERSMISMATCH)
1932				rpcerr.re_errno = EIO;
1933		}
1934	} else {
1935		/*
1936		 * Test the value of mi_down and mi_printed without
1937		 * holding the mi_lock mutex.  If they are both zero,
1938		 * then it is okay to skip the down and printed
1939		 * processing.  This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC,
		 * where the locking would be pure overhead.
1942		 */
1943		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1944			mutex_enter(&mi->mi_lock);
1945			mi->mi_flags &= ~MI_DOWN;
1946			if (mi->mi_flags & MI_PRINTED) {
1947				mi->mi_flags &= ~MI_PRINTED;
1948				mutex_exit(&mi->mi_lock);
1949#ifdef DEBUG
1950				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1951				    mi->mi_vers, svp->sv_hostname);
1952#else
1953				zprintf(zoneid, "NFS server %s ok\n",
1954				    svp->sv_hostname);
1955#endif
1956			} else
1957				mutex_exit(&mi->mi_lock);
1958		}
1959
1960		if (*douprintf == 0) {
1961			if (!(mi->mi_flags & MI_NOPRINT))
1962#ifdef DEBUG
1963				uprintf("NFS_ACL%d server %s ok\n",
1964				    mi->mi_vers, svp->sv_hostname);
1965#else
1966				uprintf("NFS server %s ok\n", svp->sv_hostname);
1967#endif
1968			*douprintf = 1;
1969		}
1970	}
1971
1972	clfree_impl(client, ch, nfscl);
1973	if (cred_cloned)
1974		crfree(cr);
1975
1976	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1977
1978#if 0 /* notyet */
1979	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1980	    rpcerr.re_errno);
1981#endif
1982
1983	return (rpcerr.re_errno);
1984}
1985
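/*
 * Convert a vattr into an NFS Version 2 sattr.  Fields that are not
 * being set are filled with the over-the-wire "don't change" value of
 * (uint32_t)-1 (or -1 seconds and microseconds for the times).
 */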
1986int
1987vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1988{
1989	uint_t mask = vap->va_mask;
1990
1991	if (!(mask & AT_MODE))
1992		sa->sa_mode = (uint32_t)-1;
1993	else
1994		sa->sa_mode = vap->va_mode;
1995	if (!(mask & AT_UID))
1996		sa->sa_uid = (uint32_t)-1;
1997	else
1998		sa->sa_uid = (uint32_t)vap->va_uid;
1999	if (!(mask & AT_GID))
2000		sa->sa_gid = (uint32_t)-1;
2001	else
2002		sa->sa_gid = (uint32_t)vap->va_gid;
2003	if (!(mask & AT_SIZE))
2004		sa->sa_size = (uint32_t)-1;
2005	else
2006		sa->sa_size = (uint32_t)vap->va_size;
2007	if (!(mask & AT_ATIME))
2008		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2009	else {
2010		/* check time validity */
2011		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2012			return (EOVERFLOW);
2013		}
2014		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2015		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2016	}
2017	if (!(mask & AT_MTIME))
2018		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2019	else {
2020		/* check time validity */
2021		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2022			return (EOVERFLOW);
2023		}
2024		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2025		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2026	}
2027	return (0);
2028}
2029
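/*
 * Convert a vattr into an NFS Version 3 sattr3.  Unlike Version 2, each
 * field carries an explicit set_it discriminator rather than a -1
 * sentinel, and times are expressed in seconds and nanoseconds.
 */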
2030int
2031vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2032{
2033	uint_t mask = vap->va_mask;
2034
2035	if (!(mask & AT_MODE))
2036		sa->mode.set_it = FALSE;
2037	else {
2038		sa->mode.set_it = TRUE;
2039		sa->mode.mode = (mode3)vap->va_mode;
2040	}
2041	if (!(mask & AT_UID))
2042		sa->uid.set_it = FALSE;
2043	else {
2044		sa->uid.set_it = TRUE;
2045		sa->uid.uid = (uid3)vap->va_uid;
2046	}
2047	if (!(mask & AT_GID))
2048		sa->gid.set_it = FALSE;
2049	else {
2050		sa->gid.set_it = TRUE;
2051		sa->gid.gid = (gid3)vap->va_gid;
2052	}
2053	if (!(mask & AT_SIZE))
2054		sa->size.set_it = FALSE;
2055	else {
2056		sa->size.set_it = TRUE;
2057		sa->size.size = (size3)vap->va_size;
2058	}
2059	if (!(mask & AT_ATIME))
2060		sa->atime.set_it = DONT_CHANGE;
2061	else {
2062		/* check time validity */
2063		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2064			return (EOVERFLOW);
2065		}
2066		sa->atime.set_it = SET_TO_CLIENT_TIME;
2067		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2068		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2069	}
2070	if (!(mask & AT_MTIME))
2071		sa->mtime.set_it = DONT_CHANGE;
2072	else {
2073		/* check time validity */
2074		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2075			return (EOVERFLOW);
2076		}
2077		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2078		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2079		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2080	}
2081	return (0);
2082}
2083
2084void
2085setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2086{
2087
2088	da->da_fhandle = VTOFH(dvp);
2089	da->da_name = nm;
2090	da->da_flags = 0;
2091}
2092
2093void
2094setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2095{
2096
2097	da->dirp = VTOFH3(dvp);
2098	da->name = nm;
2099}
2100
2101int
2102setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2103{
2104	int error;
2105	rnode_t *rp;
2106	struct vattr va;
2107
2108	va.va_mask = AT_MODE | AT_GID;
2109	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2110	if (error)
2111		return (error);
2112
2113	/*
2114	 * To determine the expected group-id of the created file:
2115	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2116	 *	GRPID option, and the directory's set-gid bit is clear,
2117	 *	then use the process's gid.
2118	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2119	 */
2120	rp = VTOR(dvp);
2121	mutex_enter(&rp->r_statelock);
2122	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2123		*gidp = crgetgid(cr);
2124	else
2125		*gidp = va.va_gid;
2126	mutex_exit(&rp->r_statelock);
2127	return (0);
2128}
2129
2130int
2131setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2132{
2133	int error;
2134	struct vattr va;
2135
2136	va.va_mask = AT_MODE;
2137	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2138	if (error)
2139		return (error);
2140
2141	/*
	 * Modify the expected mode (*omp) so that the set-gid bit matches
2143	 * that of the parent directory (dvp).
2144	 */
2145	if (va.va_mode & VSGID)
2146		*omp |= VSGID;
2147	else
2148		*omp &= ~VSGID;
2149	return (0);
2150}
2151
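/*
 * Set or clear VSWAPLIKE on a vnode.  A regular file with the sticky bit
 * set but no execute permission is treated like a swap file by the
 * client (see IS_SWAPVP), so flag it accordingly.
 */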
2152void
2153nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2154{
2155
2156	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2157		if (!(vp->v_flag & VSWAPLIKE)) {
2158			mutex_enter(&vp->v_lock);
2159			vp->v_flag |= VSWAPLIKE;
2160			mutex_exit(&vp->v_lock);
2161		}
2162	} else {
2163		if (vp->v_flag & VSWAPLIKE) {
2164			mutex_enter(&vp->v_lock);
2165			vp->v_flag &= ~VSWAPLIKE;
2166			mutex_exit(&vp->v_lock);
2167		}
2168	}
2169}
2170
2171/*
2172 * Free the resources associated with an rnode.
2173 */
2174static void
2175rinactive(rnode_t *rp, cred_t *cr)
2176{
2177	vnode_t *vp;
2178	cred_t *cred;
2179	char *contents;
2180	int size;
2181	vsecattr_t *vsp;
2182	int error;
2183	nfs3_pathconf_info *info;
2184
2185	/*
2186	 * Before freeing anything, wait until all asynchronous
2187	 * activity is done on this rnode.  This will allow all
2188	 * asynchronous read ahead and write behind i/o's to
2189	 * finish.
2190	 */
2191	mutex_enter(&rp->r_statelock);
2192	while (rp->r_count > 0)
2193		cv_wait(&rp->r_cv, &rp->r_statelock);
2194	mutex_exit(&rp->r_statelock);
2195
2196	/*
2197	 * Flush and invalidate all pages associated with the vnode.
2198	 */
2199	vp = RTOV(rp);
2200	if (vn_has_cached_data(vp)) {
2201		ASSERT(vp->v_type != VCHR);
2202		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2203			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2204			if (error && (error == ENOSPC || error == EDQUOT)) {
2205				mutex_enter(&rp->r_statelock);
2206				if (!rp->r_error)
2207					rp->r_error = error;
2208				mutex_exit(&rp->r_statelock);
2209			}
2210		}
2211		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2212	}
2213
2214	/*
2215	 * Free any held credentials and caches which may be associated
2216	 * with this rnode.
2217	 */
2218	mutex_enter(&rp->r_statelock);
2219	cred = rp->r_cred;
2220	rp->r_cred = NULL;
2221	contents = rp->r_symlink.contents;
2222	size = rp->r_symlink.size;
2223	rp->r_symlink.contents = NULL;
2224	vsp = rp->r_secattr;
2225	rp->r_secattr = NULL;
2226	info = rp->r_pathconf;
2227	rp->r_pathconf = NULL;
2228	mutex_exit(&rp->r_statelock);
2229
2230	/*
2231	 * Free the held credential.
2232	 */
2233	if (cred != NULL)
2234		crfree(cred);
2235
2236	/*
2237	 * Free the access cache entries.
2238	 */
2239	(void) nfs_access_purge_rp(rp);
2240
2241	/*
2242	 * Free the readdir cache entries.
2243	 */
2244	if (HAVE_RDDIR_CACHE(rp))
2245		nfs_purge_rddir_cache(vp);
2246
2247	/*
2248	 * Free the symbolic link cache.
2249	 */
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}
2254
2255	/*
2256	 * Free any cached ACL.
2257	 */
2258	if (vsp != NULL)
2259		nfs_acl_free(vsp);
2260
2261	/*
2262	 * Free any cached pathconf information.
2263	 */
2264	if (info != NULL)
2265		kmem_free(info, sizeof (*info));
2266}
2267
2268/*
2269 * Return a vnode for the given NFS Version 2 file handle.
2270 * If no rnode exists for this fhandle, create one and put it
2271 * into the hash queues.  If the rnode for this fhandle
2272 * already exists, return it.
2273 *
2274 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2275 */
2276vnode_t *
2277makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2278    hrtime_t t, cred_t *cr, char *dnm, char *nm)
2279{
2280	int newnode;
2281	int index;
2282	vnode_t *vp;
2283	nfs_fhandle nfh;
2284	vattr_t va;
2285
2286	nfh.fh_len = NFS_FHSIZE;
2287	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2288
2289	index = rtablehash(&nfh);
2290	rw_enter(&rtable[index].r_lock, RW_READER);
2291
2292	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2293	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2294
2295	if (attr != NULL) {
2296		if (!newnode) {
2297			rw_exit(&rtable[index].r_lock);
2298			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2299		} else {
2300			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2301				vp->v_type = VBAD;
2302			else
2303				vp->v_type = n2v_type(attr);
2304			/*
2305			 * A translation here seems to be necessary
2306			 * because this function can be called
2307			 * with `attr' that has come from the wire,
2308			 * and been operated on by vattr_to_nattr().
			 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2310			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2311			 * ->makenfsnode().
2312			 */
2313			if ((attr->na_rdev & 0xffff0000) == 0)
2314				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2315			else
2316				vp->v_rdev = expldev(n2v_rdev(attr));
2317			nfs_attrcache(vp, attr, t);
2318			rw_exit(&rtable[index].r_lock);
2319		}
2320	} else {
2321		if (newnode) {
2322			PURGE_ATTRCACHE(vp);
2323		}
2324		rw_exit(&rtable[index].r_lock);
2325	}
2326
2327	return (vp);
2328}
2329
2330/*
2331 * Return a vnode for the given NFS Version 3 file handle.
2332 * If no rnode exists for this fhandle, create one and put it
2333 * into the hash queues.  If the rnode for this fhandle
2334 * already exists, return it.
2335 *
2336 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2337 */
2338vnode_t *
2339makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2340    cred_t *cr, char *dnm, char *nm)
2341{
2342	int newnode;
2343	int index;
2344	vnode_t *vp;
2345
2346	index = rtablehash((nfs_fhandle *)fh);
2347	rw_enter(&rtable[index].r_lock, RW_READER);
2348
2349	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2350	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2351	    dnm, nm);
2352
2353	if (vap == NULL) {
2354		if (newnode) {
2355			PURGE_ATTRCACHE(vp);
2356		}
2357		rw_exit(&rtable[index].r_lock);
2358		return (vp);
2359	}
2360
2361	if (!newnode) {
2362		rw_exit(&rtable[index].r_lock);
2363		nfs_attr_cache(vp, vap, t, cr);
2364	} else {
2365		rnode_t *rp = VTOR(vp);
2366
2367		vp->v_type = vap->va_type;
2368		vp->v_rdev = vap->va_rdev;
2369
2370		mutex_enter(&rp->r_statelock);
2371		if (rp->r_mtime <= t)
2372			nfs_attrcache_va(vp, vap);
2373		mutex_exit(&rp->r_statelock);
2374		rw_exit(&rtable[index].r_lock);
2375	}
2376
2377	return (vp);
2378}
2379
2380vnode_t *
2381makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2382    cred_t *cr, char *dnm, char *nm)
2383{
2384	int newnode;
2385	int index;
2386	vnode_t *vp;
2387	vattr_t va;
2388
2389	index = rtablehash((nfs_fhandle *)fh);
2390	rw_enter(&rtable[index].r_lock, RW_READER);
2391
2392	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2393	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2394	    dnm, nm);
2395
2396	if (attr == NULL) {
2397		if (newnode) {
2398			PURGE_ATTRCACHE(vp);
2399		}
2400		rw_exit(&rtable[index].r_lock);
2401		return (vp);
2402	}
2403
2404	if (!newnode) {
2405		rw_exit(&rtable[index].r_lock);
2406		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2407	} else {
2408		if (attr->type < NF3REG || attr->type > NF3FIFO)
2409			vp->v_type = VBAD;
2410		else
2411			vp->v_type = nf3_to_vt[attr->type];
2412		vp->v_rdev = makedevice(attr->rdev.specdata1,
2413		    attr->rdev.specdata2);
2414		nfs3_attrcache(vp, attr, t);
2415		rw_exit(&rtable[index].r_lock);
2416	}
2417
2418	return (vp);
2419}
2420
2421/*
2422 * Read this comment before making changes to rtablehash()!
2423 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing millions of dollars!
2425 * Know what you are doing.
2426 *
2427 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2428 * algorithm is currently detailed here:
2429 *
2430 *   http://burtleburtle.net/bob/hash/doobs.html
2431 *
2432 * Of course, the above link may not be valid by the time you are reading
2433 * this, but suffice it to say that the one-at-a-time algorithm works well in
2434 * almost all cases.  If you are changing the algorithm be sure to verify that
2435 * the hash algorithm still provides even distribution in all cases and with
2436 * any server returning filehandles in whatever order (sequential or random).
2437 */
2438static int
2439rtablehash(nfs_fhandle *fh)
2440{
2441	ulong_t hash, len, i;
2442	char *key;
2443
2444	key = fh->fh_buf;
2445	len = (ulong_t)fh->fh_len;
2446	for (hash = 0, i = 0; i < len; i++) {
2447		hash += key[i];
2448		hash += (hash << 10);
2449		hash ^= (hash >> 6);
2450	}
2451	hash += (hash << 3);
2452	hash ^= (hash >> 11);
2453	hash += (hash << 15);
2454	return (hash & rtablemask);
2455}
2456
2457static vnode_t *
2458make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2459    struct vnodeops *vops,
2460    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2461    int (*compar)(const void *, const void *),
2462    int *newnode, cred_t *cr, char *dnm, char *nm)
2463{
2464	rnode_t *rp;
2465	rnode_t *trp;
2466	vnode_t *vp;
2467	mntinfo_t *mi;
2468
2469	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2470
2471	mi = VFTOMI(vfsp);
2472start:
2473	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2474		vp = RTOV(rp);
2475		nfs_set_vroot(vp);
2476		*newnode = 0;
2477		return (vp);
2478	}
2479	rw_exit(&rhtp->r_lock);
2480
2481	mutex_enter(&rpfreelist_lock);
2482	if (rpfreelist != NULL && rnew >= nrnode) {
2483		rp = rpfreelist;
2484		rp_rmfree(rp);
2485		mutex_exit(&rpfreelist_lock);
2486
2487		vp = RTOV(rp);
2488
2489		if (rp->r_flags & RHASHED) {
2490			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2491			mutex_enter(&vp->v_lock);
2492			if (vp->v_count > 1) {
2493				VN_RELE_LOCKED(vp);
2494				mutex_exit(&vp->v_lock);
2495				rw_exit(&rp->r_hashq->r_lock);
2496				rw_enter(&rhtp->r_lock, RW_READER);
2497				goto start;
2498			}
2499			mutex_exit(&vp->v_lock);
2500			rp_rmhash_locked(rp);
2501			rw_exit(&rp->r_hashq->r_lock);
2502		}
2503
2504		rinactive(rp, cr);
2505
2506		mutex_enter(&vp->v_lock);
2507		if (vp->v_count > 1) {
2508			VN_RELE_LOCKED(vp);
2509			mutex_exit(&vp->v_lock);
2510			rw_enter(&rhtp->r_lock, RW_READER);
2511			goto start;
2512		}
2513		mutex_exit(&vp->v_lock);
2514		vn_invalid(vp);
2515		/*
2516		 * destroy old locks before bzero'ing and
2517		 * recreating the locks below.
2518		 */
2519		nfs_rw_destroy(&rp->r_rwlock);
2520		nfs_rw_destroy(&rp->r_lkserlock);
2521		mutex_destroy(&rp->r_statelock);
2522		cv_destroy(&rp->r_cv);
2523		cv_destroy(&rp->r_commit.c_cv);
2524		nfs_free_r_path(rp);
2525		avl_destroy(&rp->r_dir);
2526		/*
		 * Make sure that if the rnode is recycled, the
		 * hold on the old VFS is released properly
		 * before reuse.
2530		 */
2531		VFS_RELE(vp->v_vfsp);
2532		vn_reinit(vp);
2533	} else {
2534		vnode_t *new_vp;
2535
2536		mutex_exit(&rpfreelist_lock);
2537
2538		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2539		new_vp = vn_alloc(KM_SLEEP);
2540
2541		atomic_inc_ulong((ulong_t *)&rnew);
2542#ifdef DEBUG
2543		clstat_debug.nrnode.value.ui64++;
2544#endif
2545		vp = new_vp;
2546	}
2547
2548	bzero(rp, sizeof (*rp));
2549	rp->r_vnode = vp;
2550	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2551	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2552	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2553	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2554	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2555	rp->r_fh.fh_len = fh->fh_len;
2556	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2557	rp->r_server = mi->mi_curr_serv;
2558	if (FAILOVER_MOUNT(mi)) {
2559		/*
		 * If this is a replicated mount, stash the pathnames.
2561		 */
2562		if (dnm != NULL && nm != NULL) {
2563			char *s, *p;
2564			uint_t len;
2565
2566			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2567			rp->r_path = kmem_alloc(len, KM_SLEEP);
2568#ifdef DEBUG
2569			clstat_debug.rpath.value.ui64 += len;
2570#endif
2571			s = rp->r_path;
2572			for (p = dnm; *p; p++)
2573				*s++ = *p;
2574			*s++ = '/';
2575			for (p = nm; *p; p++)
2576				*s++ = *p;
2577			*s = '\0';
2578		} else {
2579			/* special case for root */
2580			rp->r_path = kmem_alloc(2, KM_SLEEP);
2581#ifdef DEBUG
2582			clstat_debug.rpath.value.ui64 += 2;
2583#endif
2584			*rp->r_path = '.';
2585			*(rp->r_path + 1) = '\0';
2586		}
2587	}
2588	VFS_HOLD(vfsp);
2589	rp->r_putapage = putapage;
2590	rp->r_hashq = rhtp;
2591	rp->r_flags = RREADDIRPLUS;
2592	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2593	    offsetof(rddir_cache, tree));
2594	vn_setops(vp, vops);
2595	vp->v_data = (caddr_t)rp;
2596	vp->v_vfsp = vfsp;
2597	vp->v_type = VNON;
2598	vp->v_flag |= VMODSORT;
2599	nfs_set_vroot(vp);
2600
2601	/*
	 * There is a race if someone else allocates the rnode
	 * while no locks are held, so check the hash queue again
	 * and recover if one is found.
2605	 */
2606	rw_enter(&rhtp->r_lock, RW_WRITER);
2607	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2608		vp = RTOV(trp);
2609		nfs_set_vroot(vp);
2610		*newnode = 0;
2611		rw_exit(&rhtp->r_lock);
2612		rp_addfree(rp, cr);
2613		rw_enter(&rhtp->r_lock, RW_READER);
2614		return (vp);
2615	}
2616	rp_addhash(rp);
2617	*newnode = 1;
2618	return (vp);
2619}
2620
2621/*
2622 * Callback function to check if the page should be marked as
2623 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2624 */
2625int
2626nfs_setmod_check(page_t *pp)
2627{
2628	if (pp->p_fsdata != C_NOCOMMIT) {
2629		pp->p_fsdata = C_NOCOMMIT;
2630		return (1);
2631	}
2632	return (0);
2633}
2634
2635static void
2636nfs_set_vroot(vnode_t *vp)
2637{
2638	rnode_t *rp;
2639	nfs_fhandle *rootfh;
2640
2641	rp = VTOR(vp);
2642	rootfh = &rp->r_server->sv_fhandle;
2643	if (rootfh->fh_len == rp->r_fh.fh_len &&
2644	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2645		if (!(vp->v_flag & VROOT)) {
2646			mutex_enter(&vp->v_lock);
2647			vp->v_flag |= VROOT;
2648			mutex_exit(&vp->v_lock);
2649		}
2650	}
2651}
2652
2653static void
2654nfs_free_r_path(rnode_t *rp)
2655{
2656	char *path;
2657	size_t len;
2658
2659	path = rp->r_path;
2660	if (path) {
2661		rp->r_path = NULL;
2662		len = strlen(path) + 1;
2663		kmem_free(path, len);
2664#ifdef DEBUG
2665		clstat_debug.rpath.value.ui64 -= len;
2666#endif
2667	}
2668}
2669
2670/*
2671 * Put an rnode on the free list.
2672 *
2673 * Rnodes which were allocated above and beyond the normal limit
2674 * are immediately freed.
2675 */
2676void
2677rp_addfree(rnode_t *rp, cred_t *cr)
2678{
2679	vnode_t *vp;
2680	struct vfs *vfsp;
2681
2682	vp = RTOV(rp);
2683	ASSERT(vp->v_count >= 1);
2684	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2685
2686	/*
2687	 * If we have too many rnodes allocated and there are no
2688	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
2690	 * or if an i/o error occurred while writing to the file,
2691	 * then just free it instead of putting it on the rnode
2692	 * freelist.
2693	 */
2694	vfsp = vp->v_vfsp;
2695	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2696	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2697		if (rp->r_flags & RHASHED) {
2698			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2699			mutex_enter(&vp->v_lock);
2700			if (vp->v_count > 1) {
2701				VN_RELE_LOCKED(vp);
2702				mutex_exit(&vp->v_lock);
2703				rw_exit(&rp->r_hashq->r_lock);
2704				return;
2705			}
2706			mutex_exit(&vp->v_lock);
2707			rp_rmhash_locked(rp);
2708			rw_exit(&rp->r_hashq->r_lock);
2709		}
2710
2711		rinactive(rp, cr);
2712
2713		/*
2714		 * Recheck the vnode reference count.  We need to
2715		 * make sure that another reference has not been
2716		 * acquired while we were not holding v_lock.  The
2717		 * rnode is not in the rnode hash queues, so the
2718		 * only way for a reference to have been acquired
2719		 * is for a VOP_PUTPAGE because the rnode was marked
2720		 * with RDIRTY or for a modified page.  This
2721		 * reference may have been acquired before our call
2722		 * to rinactive.  The i/o may have been completed,
2723		 * thus allowing rinactive to complete, but the
2724		 * reference to the vnode may not have been released
2725		 * yet.  In any case, the rnode can not be destroyed
2726		 * until the other references to this vnode have been
2727		 * released.  The other references will take care of
2728		 * either destroying the rnode or placing it on the
2729		 * rnode freelist.  If there are no other references,
2730		 * then the rnode may be safely destroyed.
2731		 */
2732		mutex_enter(&vp->v_lock);
2733		if (vp->v_count > 1) {
2734			VN_RELE_LOCKED(vp);
2735			mutex_exit(&vp->v_lock);
2736			return;
2737		}
2738		mutex_exit(&vp->v_lock);
2739
2740		destroy_rnode(rp);
2741		return;
2742	}
2743
2744	/*
2745	 * Lock the hash queue and then recheck the reference count
2746	 * to ensure that no other threads have acquired a reference
2747	 * to indicate that the rnode should not be placed on the
2748	 * freelist.  If another reference has been acquired, then
2749	 * just release this one and let the other thread complete
2750	 * the processing of adding this rnode to the freelist.
2751	 */
2752	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2753
2754	mutex_enter(&vp->v_lock);
2755	if (vp->v_count > 1) {
2756		VN_RELE_LOCKED(vp);
2757		mutex_exit(&vp->v_lock);
2758		rw_exit(&rp->r_hashq->r_lock);
2759		return;
2760	}
2761	mutex_exit(&vp->v_lock);
2762
2763	/*
2764	 * If there is no cached data or metadata for this file, then
2765	 * put the rnode on the front of the freelist so that it will
2766	 * be reused before other rnodes which may have cached data or
2767	 * metadata associated with them.
2768	 */
2769	mutex_enter(&rpfreelist_lock);
2770	if (rpfreelist == NULL) {
2771		rp->r_freef = rp;
2772		rp->r_freeb = rp;
2773		rpfreelist = rp;
2774	} else {
2775		rp->r_freef = rpfreelist;
2776		rp->r_freeb = rpfreelist->r_freeb;
2777		rpfreelist->r_freeb->r_freef = rp;
2778		rpfreelist->r_freeb = rp;
2779		if (!vn_has_cached_data(vp) &&
2780		    !HAVE_RDDIR_CACHE(rp) &&
2781		    rp->r_symlink.contents == NULL &&
2782		    rp->r_secattr == NULL &&
2783		    rp->r_pathconf == NULL)
2784			rpfreelist = rp;
2785	}
2786	mutex_exit(&rpfreelist_lock);
2787
2788	rw_exit(&rp->r_hashq->r_lock);
2789}
2790
2791/*
2792 * Remove an rnode from the free list.
2793 *
2794 * The caller must be holding rpfreelist_lock and the rnode
2795 * must be on the freelist.
2796 */
2797static void
2798rp_rmfree(rnode_t *rp)
2799{
2800
2801	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2802	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2803
2804	if (rp == rpfreelist) {
2805		rpfreelist = rp->r_freef;
2806		if (rp == rpfreelist)
2807			rpfreelist = NULL;
2808	}
2809
2810	rp->r_freeb->r_freef = rp->r_freef;
2811	rp->r_freef->r_freeb = rp->r_freeb;
2812
2813	rp->r_freef = rp->r_freeb = NULL;
2814}
2815
2816/*
 * Put an rnode in the hash table.
2818 *
2819 * The caller must be holding the exclusive hash queue lock.
2820 */
2821static void
2822rp_addhash(rnode_t *rp)
2823{
2824	mntinfo_t *mi;
2825
2826	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2827	ASSERT(!(rp->r_flags & RHASHED));
2828
2829	rp->r_hashf = rp->r_hashq->r_hashf;
2830	rp->r_hashq->r_hashf = rp;
2831	rp->r_hashb = (rnode_t *)rp->r_hashq;
2832	rp->r_hashf->r_hashb = rp;
2833
2834	mutex_enter(&rp->r_statelock);
2835	rp->r_flags |= RHASHED;
2836	mutex_exit(&rp->r_statelock);
2837
2838	mi = VTOMI(RTOV(rp));
2839	mutex_enter(&mi->mi_rnodes_lock);
2840	list_insert_tail(&mi->mi_rnodes, rp);
2841	mutex_exit(&mi->mi_rnodes_lock);
2842}
2843
2844/*
 * Remove an rnode from the hash table.
2846 *
2847 * The caller must be holding the hash queue lock.
2848 */
2849static void
2850rp_rmhash_locked(rnode_t *rp)
2851{
2852	mntinfo_t *mi;
2853
2854	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2855	ASSERT(rp->r_flags & RHASHED);
2856
2857	rp->r_hashb->r_hashf = rp->r_hashf;
2858	rp->r_hashf->r_hashb = rp->r_hashb;
2859
2860	mutex_enter(&rp->r_statelock);
2861	rp->r_flags &= ~RHASHED;
2862	mutex_exit(&rp->r_statelock);
2863
2864	mi = VTOMI(RTOV(rp));
2865	mutex_enter(&mi->mi_rnodes_lock);
2866	if (list_link_active(&rp->r_mi_link))
2867		list_remove(&mi->mi_rnodes, rp);
2868	mutex_exit(&mi->mi_rnodes_lock);
2869}
2870
2871/*
 * Remove an rnode from the hash table.
2873 *
2874 * The caller must not be holding the hash queue lock.
2875 */
2876void
2877rp_rmhash(rnode_t *rp)
2878{
2879
2880	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2881	rp_rmhash_locked(rp);
2882	rw_exit(&rp->r_hashq->r_lock);
2883}
2884
2885/*
 * Look up an rnode by fhandle.
2887 *
2888 * The caller must be holding the hash queue lock, either shared or exclusive.
2889 */
2890static rnode_t *
2891rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2892{
2893	rnode_t *rp;
2894	vnode_t *vp;
2895
2896	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2897
2898	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2899		vp = RTOV(rp);
2900		if (vp->v_vfsp == vfsp &&
2901		    rp->r_fh.fh_len == fh->fh_len &&
2902		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2903			/*
2904			 * remove rnode from free list, if necessary.
2905			 */
2906			if (rp->r_freef != NULL) {
2907				mutex_enter(&rpfreelist_lock);
2908				/*
2909				 * If the rnode is on the freelist,
2910				 * then remove it and use that reference
				 * as the new reference.  Otherwise, we
2912				 * need to increment the reference count.
2913				 */
2914				if (rp->r_freef != NULL) {
2915					rp_rmfree(rp);
2916					mutex_exit(&rpfreelist_lock);
2917				} else {
2918					mutex_exit(&rpfreelist_lock);
2919					VN_HOLD(vp);
2920				}
2921			} else
2922				VN_HOLD(vp);
2923			return (rp);
2924		}
2925	}
2926	return (NULL);
2927}
2928
2929/*
2930 * Return 1 if there is an active vnode belonging to this vfs in the
2931 * rtable cache.
2932 *
2933 * Several of these checks are done without holding the usual
2934 * locks.  This is safe because destroy_rtable(), rp_addfree(),
2935 * etc. will redo the necessary checks before actually destroying
2936 * any rnodes.
2937 */
2938int
2939check_rtable(struct vfs *vfsp)
2940{
2941	rnode_t *rp;
2942	vnode_t *vp;
2943	mntinfo_t *mi;
2944
2945	ASSERT(vfsp != NULL);
2946	mi = VFTOMI(vfsp);
2947
2948	mutex_enter(&mi->mi_rnodes_lock);
2949	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
2950	    rp = list_next(&mi->mi_rnodes, rp)) {
2951		vp = RTOV(rp);
2952
2953		if (rp->r_freef == NULL ||
2954		    (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
2955		    rp->r_count > 0) {
2956			mutex_exit(&mi->mi_rnodes_lock);
2957			return (1);
2958		}
2959	}
2960	mutex_exit(&mi->mi_rnodes_lock);
2961
2962	return (0);
2963}
2964
2965/*
2966 * Destroy inactive vnodes from the hash queues which belong to this
2967 * vfs.  It is essential that we destroy all inactive vnodes during a
2968 * forced unmount as well as during a normal unmount.
2969 */
2970void
2971destroy_rtable(struct vfs *vfsp, cred_t *cr)
2972{
2973	rnode_t *rp;
2974	mntinfo_t *mi;
2975
2976	ASSERT(vfsp != NULL);
2977
2978	mi = VFTOMI(vfsp);
2979
2980	mutex_enter(&rpfreelist_lock);
2981	mutex_enter(&mi->mi_rnodes_lock);
2982	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
2983		/*
2984		 * If the rnode is no longer on the freelist it is not
2985		 * ours and it will be handled by some other thread, so
2986		 * skip it.
2987		 */
2988		if (rp->r_freef == NULL)
2989			continue;
2990		mutex_exit(&mi->mi_rnodes_lock);
2991
2992		rp_rmfree(rp);
2993		mutex_exit(&rpfreelist_lock);
2994
2995		rp_rmhash(rp);
2996
2997		/*
2998		 * This call to rp_addfree will end up destroying the
2999		 * rnode, but in a safe way with the appropriate set
3000		 * of checks done.
3001		 */
3002		rp_addfree(rp, cr);
3003
3004		mutex_enter(&rpfreelist_lock);
3005		mutex_enter(&mi->mi_rnodes_lock);
3006	}
3007	mutex_exit(&mi->mi_rnodes_lock);
3008	mutex_exit(&rpfreelist_lock);
3009}
3010
3011/*
3012 * This routine destroys all the resources associated with the rnode
3013 * and then the rnode itself.
3014 */
3015static void
3016destroy_rnode(rnode_t *rp)
3017{
3018	vnode_t *vp;
3019	vfs_t *vfsp;
3020
3021	vp = RTOV(rp);
3022	vfsp = vp->v_vfsp;
3023
3024	ASSERT(vp->v_count == 1);
3025	ASSERT(rp->r_count == 0);
3026	ASSERT(rp->r_lmpl == NULL);
3027	ASSERT(rp->r_mapcnt == 0);
3028	ASSERT(!(rp->r_flags & RHASHED));
3029	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3030	atomic_dec_ulong((ulong_t *)&rnew);
3031#ifdef DEBUG
3032	clstat_debug.nrnode.value.ui64--;
3033#endif
3034	nfs_rw_destroy(&rp->r_rwlock);
3035	nfs_rw_destroy(&rp->r_lkserlock);
3036	mutex_destroy(&rp->r_statelock);
3037	cv_destroy(&rp->r_cv);
3038	cv_destroy(&rp->r_commit.c_cv);
3039	if (rp->r_flags & RDELMAPLIST)
3040		list_destroy(&rp->r_indelmap);
3041	nfs_free_r_path(rp);
3042	avl_destroy(&rp->r_dir);
3043	vn_invalid(vp);
3044	vn_free(vp);
3045	kmem_cache_free(rnode_cache, rp);
3046	VFS_RELE(vfsp);
3047}
3048
3049/*
3050 * Flush all vnodes in this (or every) vfs.
3051 * Used by nfs_sync and by nfs_unmount.
3052 */
3053void
3054rflush(struct vfs *vfsp, cred_t *cr)
3055{
3056	int index;
3057	rnode_t *rp;
3058	vnode_t *vp, **vplist;
3059	long num, cnt;
3060
3061	/*
3062	 * Check to see whether there is anything to do.
3063	 */
3064	num = rnew;
3065	if (num == 0)
3066		return;
3067
3068	/*
3069	 * Allocate a slot for all currently active rnodes on the
3070	 * supposition that they all may need flushing.
3071	 */
3072	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3073	cnt = 0;
3074
3075	/*
	 * If the vfs is known we can take a fast path by iterating only the
	 * rnodes that belong to this vfs.  This is much faster than walking
	 * the entire rtable (below) when many of the rnodes do not belong
	 * to our vfs.
3080	 */
3081	if (vfsp != NULL) {
3082		mntinfo_t *mi = VFTOMI(vfsp);
3083
3084		mutex_enter(&mi->mi_rnodes_lock);
3085		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
3086		    rp = list_next(&mi->mi_rnodes, rp)) {
3087			vp = RTOV(rp);
3088			/*
3089			 * Don't bother sync'ing a vp if it
3090			 * is part of virtual swap device or
3091			 * if VFS is read-only
3092			 */
3093			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3094				continue;
3095			/*
3096			 * If the vnode has pages and is marked as either dirty
3097			 * or mmap'd, hold and add this vnode to the list of
3098			 * vnodes to flush.
3099			 */
3100			ASSERT(vp->v_vfsp == vfsp);
3101			if (vn_has_cached_data(vp) &&
3102			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3103				VN_HOLD(vp);
3104				vplist[cnt++] = vp;
3105				if (cnt == num) {
3106					/*
					 * The vplist is full because there are
3108					 * too many rnodes.  We are done for
3109					 * now.
3110					 */
3111					break;
3112				}
3113			}
3114		}
3115		mutex_exit(&mi->mi_rnodes_lock);
3116
3117		goto done;
3118	}
3119
3120	ASSERT(vfsp == NULL);
3121
3122	/*
3123	 * Walk the hash queues looking for rnodes with page
3124	 * lists associated with them.  Make a list of these
3125	 * files.
3126	 */
3127	for (index = 0; index < rtablesize; index++) {
3128		rw_enter(&rtable[index].r_lock, RW_READER);
3129		for (rp = rtable[index].r_hashf;
3130		    rp != (rnode_t *)(&rtable[index]);
3131		    rp = rp->r_hashf) {
3132			vp = RTOV(rp);
3133			/*
3134			 * Don't bother sync'ing a vp if it
3135			 * is part of virtual swap device or
3136			 * if VFS is read-only
3137			 */
3138			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3139				continue;
3140			/*
3141			 * If the vnode has pages and is marked as either dirty
3142			 * or mmap'd, hold and add this vnode to the list of
3143			 * vnodes to flush.
3144			 */
3145			if (vn_has_cached_data(vp) &&
3146			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3147				VN_HOLD(vp);
3148				vplist[cnt++] = vp;
3149				if (cnt == num) {
3150					rw_exit(&rtable[index].r_lock);
3151					/*
					 * The vplist is full because there are
3153					 * too many rnodes.  We are done for
3154					 * now.
3155					 */
3156					goto done;
3157				}
3158			}
3159		}
3160		rw_exit(&rtable[index].r_lock);
3161	}
3162
3163done:
3164
3165	/*
3166	 * Flush and release all of the files on the list.
3167	 */
3168	while (cnt-- > 0) {
3169		vp = vplist[cnt];
3170		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3171		VN_RELE(vp);
3172	}
3173
3174	/*
3175	 * Free the space allocated to hold the list.
3176	 */
3177	kmem_free(vplist, num * sizeof (*vplist));
3178}
3179
3180/*
3181 * This probably needs to be larger than or equal to
3182 * log2(sizeof (struct rnode)) due to the way that rnodes are
3183 * allocated.
3184 */
3185#define	ACACHE_SHIFT_BITS	9
3186
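/*
 * Hash an (rnode, cred) pair into the access cache.  The rnode address
 * is shifted right to discard the low-order bits common to all rnode
 * allocations and then combined with the credential's uid.
 */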
3187static int
3188acachehash(rnode_t *rp, cred_t *cr)
3189{
3190
3191	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3192	    acachemask);
3193}
3194
3195#ifdef DEBUG
3196static long nfs_access_cache_hits = 0;
3197static long nfs_access_cache_misses = 0;
3198#endif
3199
3200nfs_access_type_t
3201nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3202{
3203	vnode_t *vp;
3204	acache_t *ap;
3205	acache_hash_t *hp;
3206	nfs_access_type_t all;
3207
3208	vp = RTOV(rp);
3209	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3210		return (NFS_ACCESS_UNKNOWN);
3211
3212	if (rp->r_acache != NULL) {
3213		hp = &acache[acachehash(rp, cr)];
3214		rw_enter(&hp->lock, RW_READER);
3215		ap = hp->next;
3216		while (ap != (acache_t *)hp) {
3217			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3218				if ((ap->known & acc) == acc) {
3219#ifdef DEBUG
3220					nfs_access_cache_hits++;
3221#endif
3222					if ((ap->allowed & acc) == acc)
3223						all = NFS_ACCESS_ALLOWED;
3224					else
3225						all = NFS_ACCESS_DENIED;
3226				} else {
3227#ifdef DEBUG
3228					nfs_access_cache_misses++;
3229#endif
3230					all = NFS_ACCESS_UNKNOWN;
3231				}
3232				rw_exit(&hp->lock);
3233				return (all);
3234			}
3235			ap = ap->next;
3236		}
3237		rw_exit(&hp->lock);
3238	}
3239
3240#ifdef DEBUG
3241	nfs_access_cache_misses++;
3242#endif
3243	return (NFS_ACCESS_UNKNOWN);
3244}
3245
3246void
3247nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3248{
3249	acache_t *ap;
3250	acache_t *nap;
3251	acache_hash_t *hp;
3252
3253	hp = &acache[acachehash(rp, cr)];
3254
3255	/*
	 * Allocate now, assuming that an allocation will usually be
3257	 * required.  This allows the allocation to happen without
3258	 * holding the hash bucket locked.
3259	 */
3260	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3261	if (nap != NULL) {
3262		nap->known = acc;
3263		nap->allowed = resacc;
3264		nap->rnode = rp;
3265		crhold(cr);
3266		nap->cred = cr;
3267		nap->hashq = hp;
3268	}
3269
3270	rw_enter(&hp->lock, RW_WRITER);
3271
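	/*
	 * If this rnode already has cached access entries, look for one
	 * matching this credential and merge the new results into it; the
	 * preallocated entry is freed if it turns out not to be needed.
	 */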
3272	if (rp->r_acache != NULL) {
3273		ap = hp->next;
3274		while (ap != (acache_t *)hp) {
3275			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3276				ap->known |= acc;
3277				ap->allowed &= ~acc;
3278				ap->allowed |= resacc;
3279				rw_exit(&hp->lock);
3280				if (nap != NULL) {
3281					crfree(nap->cred);
3282					kmem_cache_free(acache_cache, nap);
3283				}
3284				return;
3285			}
3286			ap = ap->next;
3287		}
3288	}
3289
3290	if (nap != NULL) {
3291#ifdef DEBUG
3292		clstat_debug.access.value.ui64++;
3293#endif
3294		nap->next = hp->next;
3295		hp->next = nap;
3296		nap->next->prev = nap;
3297		nap->prev = (acache_t *)hp;
3298
3299		mutex_enter(&rp->r_statelock);
3300		nap->list = rp->r_acache;
3301		rp->r_acache = nap;
3302		mutex_exit(&rp->r_statelock);
3303	}
3304
3305	rw_exit(&hp->lock);
3306}
3307
3308int
3309nfs_access_purge_rp(rnode_t *rp)
3310{
3311	acache_t *ap;
3312	acache_t *tmpap;
3313	acache_t *rplist;
3314
3315	/*
3316	 * If there aren't any cached entries, then there is nothing
3317	 * to free.
3318	 */
3319	if (rp->r_acache == NULL)
3320		return (0);
3321
3322	mutex_enter(&rp->r_statelock);
3323	rplist = rp->r_acache;
3324	rp->r_acache = NULL;
3325	mutex_exit(&rp->r_statelock);
3326
3327	/*
	 * Loop through each entry in the list pointed to by the
3329	 * rnode.  Remove each of these entries from the hash
3330	 * queue that it is on and remove it from the list in
3331	 * the rnode.
3332	 */
3333	for (ap = rplist; ap != NULL; ap = tmpap) {
3334		rw_enter(&ap->hashq->lock, RW_WRITER);
3335		ap->prev->next = ap->next;
3336		ap->next->prev = ap->prev;
3337		rw_exit(&ap->hashq->lock);
3338
3339		tmpap = ap->list;
3340		crfree(ap->cred);
3341		kmem_cache_free(acache_cache, ap);
3342#ifdef DEBUG
3343		clstat_debug.access.value.ui64--;
3344#endif
3345	}
3346
3347	return (1);
3348}
3349
3350static const char prefix[] = ".nfs";
3351
3352static kmutex_t newnum_lock;
3353
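/*
 * Return the next value from a simple global counter, seeded from the
 * low bits of the current time on first use.  newname() uses it to
 * build ".nfs" temporary file names.
 */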
3354int
3355newnum(void)
3356{
3357	static uint_t newnum = 0;
3358	uint_t id;
3359
3360	mutex_enter(&newnum_lock);
3361	if (newnum == 0)
3362		newnum = gethrestime_sec() & 0xffff;
3363	id = newnum++;
3364	mutex_exit(&newnum_lock);
3365	return (id);
3366}
3367
3368char *
3369newname(void)
3370{
3371	char *news;
3372	char *s;
3373	const char *p;
3374	uint_t id;
3375
3376	id = newnum();
3377	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3378	s = news;
3379	p = prefix;
3380	while (*p != '\0')
3381		*s++ = *p++;
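	/* Append the id in hex, least significant nibble first. */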
3382	while (id != 0) {
3383		*s++ = "0123456789ABCDEF"[id & 0x0f];
3384		id >>= 4;
3385	}
3386	*s = '\0';
3387	return (news);
3388}
3389
3390/*
3391 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3392 * framework.
3393 */
3394static int
3395cl_snapshot(kstat_t *ksp, void *buf, int rw)
3396{
3397	ksp->ks_snaptime = gethrtime();
3398	if (rw == KSTAT_WRITE) {
3399		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3400#ifdef DEBUG
3401		/*
3402		 * Currently only the global zone can write to kstats, but we
3403		 * add the check just for paranoia.
3404		 */
3405		if (INGLOBALZONE(curproc))
3406			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3407			    sizeof (clstat_debug));
3408#endif
3409	} else {
3410		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3411#ifdef DEBUG
3412		/*
3413		 * If we're displaying the "global" debug kstat values, we
3414		 * display them as-is to all zones since in fact they apply to
3415		 * the system as a whole.
3416		 */
3417		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3418		    sizeof (clstat_debug));
3419#endif
3420	}
3421	return (0);
3422}
3423
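/*
 * ZSD create callback: allocate and initialize the per-zone NFS client
 * state, register the per-zone "nfs_client" kstat, and add the new
 * structure to the global list of per-zone client data.
 */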
3424static void *
3425clinit_zone(zoneid_t zoneid)
3426{
3427	kstat_t *nfs_client_kstat;
3428	struct nfs_clnt *nfscl;
3429	uint_t ndata;
3430
3431	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3432	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3433	nfscl->nfscl_chtable = NULL;
3434	nfscl->nfscl_zoneid = zoneid;
3435
3436	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3437	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3438#ifdef DEBUG
3439	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3440#endif
3441	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3442	    "misc", KSTAT_TYPE_NAMED, ndata,
3443	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3444		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3445		nfs_client_kstat->ks_snapshot = cl_snapshot;
3446		kstat_install(nfs_client_kstat);
3447	}
3448	mutex_enter(&nfs_clnt_list_lock);
3449	list_insert_head(&nfs_clnt_list, nfscl);
3450	mutex_exit(&nfs_clnt_list_lock);
3451	return (nfscl);
3452}
3453
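/*
 * ZSD destroy callback: remove the per-zone client data from the global
 * list, reclaim its client handles, delete its kstat, and free it.
 */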
3454/*ARGSUSED*/
3455static void
3456clfini_zone(zoneid_t zoneid, void *arg)
3457{
3458	struct nfs_clnt *nfscl = arg;
3459	chhead_t *chp, *next;
3460
3461	if (nfscl == NULL)
3462		return;
3463	mutex_enter(&nfs_clnt_list_lock);
3464	list_remove(&nfs_clnt_list, nfscl);
3465	mutex_exit(&nfs_clnt_list_lock);
3466	clreclaim_zone(nfscl, 0);
3467	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3468		ASSERT(chp->ch_list == NULL);
3469		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3470		next = chp->ch_next;
3471		kmem_free(chp, sizeof (*chp));
3472	}
3473	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3474	mutex_destroy(&nfscl->nfscl_chtable_lock);
3475	kmem_free(nfscl, sizeof (*nfscl));
3476}
3477
3478/*
3479 * Called by endpnt_destructor to make sure the client handles are
3480 * cleaned up before the RPC endpoints.  This becomes a no-op if
3481 * clfini_zone (above) is called first.  This function is needed
3482 * (rather than relying on clfini_zone to clean up) because the ZSD
3483 * callbacks have no ordering mechanism, so we have no way to ensure
3484 * that clfini_zone is called before endpnt_destructor.
3485 */
3486void
3487clcleanup_zone(zoneid_t zoneid)
3488{
3489	struct nfs_clnt *nfscl;
3490
3491	mutex_enter(&nfs_clnt_list_lock);
3492	nfscl = list_head(&nfs_clnt_list);
3493	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3494		if (nfscl->nfscl_zoneid == zoneid) {
3495			clreclaim_zone(nfscl, 0);
3496			break;
3497		}
3498	}
3499	mutex_exit(&nfs_clnt_list_lock);
3500}
3501
3502int
3503nfs_subrinit(void)
3504{
3505	int i;
3506	ulong_t nrnode_max;
3507
3508	/*
3509	 * Allocate and initialize the rnode hash queues
3510	 */
3511	if (nrnode <= 0)
3512		nrnode = ncsize;
3513	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3514	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3515		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3516		    "!setting nrnode to max value of %ld", nrnode_max);
3517		nrnode = nrnode_max;
3518	}
3519
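	/*
	 * Size the rnode hash table as a power of two so that rtablemask
	 * can be used to index it.
	 */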
3520	rtablesize = 1 << highbit(nrnode / hashlen);
3521	rtablemask = rtablesize - 1;
3522	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3523	for (i = 0; i < rtablesize; i++) {
3524		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3525		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3526		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3527	}
3528	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3529	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3530
3531	/*
3532	 * Allocate and initialize the access cache
3533	 */
3534
3535	/*
	 * The initial guess is one access cache entry per rnode, unless
	 * nacache is set to a non-zero value, in which case it is used
	 * as the guess for the number of access cache entries.
3539	 */
3540	if (nacache > 0)
3541		acachesize = 1 << highbit(nacache / hashlen);
3542	else
3543		acachesize = rtablesize;
3544	acachemask = acachesize - 1;
3545	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3546	for (i = 0; i < acachesize; i++) {
3547		acache[i].next = (acache_t *)&acache[i];
3548		acache[i].prev = (acache_t *)&acache[i];
3549		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3550	}
3551	acache_cache = kmem_cache_create("nfs_access_cache",
3552	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3553	/*
3554	 * Allocate and initialize the client handle cache
3555	 */
3556	chtab_cache = kmem_cache_create("client_handle_cache",
3557	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3558	/*
	 * Initialize the list of per-zone client handles (and associated
	 * data).  This needs to be done before we call zone_key_create().
3561	 */
3562	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3563	    offsetof(struct nfs_clnt, nfscl_node));
3564	/*
3565	 * Initialize the zone_key for per-zone client handle lists.
3566	 */
3567	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3568	/*
3569	 * Initialize the various mutexes and reader/writer locks
3570	 */
3571	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3572	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3573	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3574
3575	/*
3576	 * Assign unique major number for all nfs mounts
3577	 */
3578	if ((nfs_major = getudev()) == -1) {
3579		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3580		    "nfs: init: can't get unique device number");
3581		nfs_major = 0;
3582	}
3583	nfs_minor = 0;
3584
3585	if (nfs3_jukebox_delay == 0)
3586		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3587
3588	return (0);
3589}
3590
3591void
3592nfs_subrfini(void)
3593{
3594	int i;
3595
3596	/*
3597	 * Deallocate the rnode hash queues
3598	 */
3599	kmem_cache_destroy(rnode_cache);
3600
3601	for (i = 0; i < rtablesize; i++)
3602		rw_destroy(&rtable[i].r_lock);
3603	kmem_free(rtable, rtablesize * sizeof (*rtable));
3604
3605	/*
	 * Deallocate the access cache
3607	 */
3608	kmem_cache_destroy(acache_cache);
3609
3610	for (i = 0; i < acachesize; i++)
3611		rw_destroy(&acache[i].lock);
3612	kmem_free(acache, acachesize * sizeof (*acache));
3613
3614	/*
3615	 * Deallocate the client handle cache
3616	 */
3617	kmem_cache_destroy(chtab_cache);
3618
3619	/*
3620	 * Destroy the various mutexes and reader/writer locks
3621	 */
3622	mutex_destroy(&rpfreelist_lock);
3623	mutex_destroy(&newnum_lock);
3624	mutex_destroy(&nfs_minor_lock);
3625	(void) zone_key_delete(nfsclnt_zone_key);
3626}
3627
3628enum nfsstat
3629puterrno(int error)
3630{
3631
3632	switch (error) {
3633	case EOPNOTSUPP:
3634		return (NFSERR_OPNOTSUPP);
3635	case ENAMETOOLONG:
3636		return (NFSERR_NAMETOOLONG);
3637	case ENOTEMPTY:
3638		return (NFSERR_NOTEMPTY);
3639	case EDQUOT:
3640		return (NFSERR_DQUOT);
3641	case ESTALE:
3642		return (NFSERR_STALE);
3643	case EREMOTE:
3644		return (NFSERR_REMOTE);
3645	case ENOSYS:
3646		return (NFSERR_OPNOTSUPP);
3647	case EOVERFLOW:
3648		return (NFSERR_INVAL);
3649	default:
3650		return ((enum nfsstat)error);
3651	}
3652	/* NOTREACHED */
3653}
3654
3655int
3656geterrno(enum nfsstat status)
3657{
3658
3659	switch (status) {
3660	case NFSERR_OPNOTSUPP:
3661		return (EOPNOTSUPP);
3662	case NFSERR_NAMETOOLONG:
3663		return (ENAMETOOLONG);
3664	case NFSERR_NOTEMPTY:
3665		return (ENOTEMPTY);
3666	case NFSERR_DQUOT:
3667		return (EDQUOT);
3668	case NFSERR_STALE:
3669		return (ESTALE);
3670	case NFSERR_REMOTE:
3671		return (EREMOTE);
3672	case NFSERR_WFLUSH:
3673		return (EIO);
3674	default:
3675		return ((int)status);
3676	}
3677	/* NOTREACHED */
3678}
3679
3680enum nfsstat3
3681puterrno3(int error)
3682{
3683
3684#ifdef DEBUG
3685	switch (error) {
3686	case 0:
3687		return (NFS3_OK);
3688	case EPERM:
3689		return (NFS3ERR_PERM);
3690	case ENOENT:
3691		return (NFS3ERR_NOENT);
3692	case EIO:
3693		return (NFS3ERR_IO);
3694	case ENXIO:
3695		return (NFS3ERR_NXIO);
3696	case EACCES:
3697		return (NFS3ERR_ACCES);
3698	case EEXIST:
3699		return (NFS3ERR_EXIST);
3700	case EXDEV:
3701		return (NFS3ERR_XDEV);
3702	case ENODEV:
3703		return (NFS3ERR_NODEV);
3704	case ENOTDIR:
3705		return (NFS3ERR_NOTDIR);
3706	case EISDIR:
3707		return (NFS3ERR_ISDIR);
3708	case EINVAL:
3709		return (NFS3ERR_INVAL);
3710	case EFBIG:
3711		return (NFS3ERR_FBIG);
3712	case ENOSPC:
3713		return (NFS3ERR_NOSPC);
3714	case EROFS:
3715		return (NFS3ERR_ROFS);
3716	case EMLINK:
3717		return (NFS3ERR_MLINK);
3718	case ENAMETOOLONG:
3719		return (NFS3ERR_NAMETOOLONG);
3720	case ENOTEMPTY:
3721		return (NFS3ERR_NOTEMPTY);
3722	case EDQUOT:
3723		return (NFS3ERR_DQUOT);
3724	case ESTALE:
3725		return (NFS3ERR_STALE);
3726	case EREMOTE:
3727		return (NFS3ERR_REMOTE);
3728	case ENOSYS:
3729	case EOPNOTSUPP:
3730		return (NFS3ERR_NOTSUPP);
3731	case EOVERFLOW:
3732		return (NFS3ERR_INVAL);
3733	default:
3734		zcmn_err(getzoneid(), CE_WARN,
3735		    "puterrno3: got error %d", error);
3736		return ((enum nfsstat3)error);
3737	}
3738#else
3739	switch (error) {
3740	case ENAMETOOLONG:
3741		return (NFS3ERR_NAMETOOLONG);
3742	case ENOTEMPTY:
3743		return (NFS3ERR_NOTEMPTY);
3744	case EDQUOT:
3745		return (NFS3ERR_DQUOT);
3746	case ESTALE:
3747		return (NFS3ERR_STALE);
3748	case ENOSYS:
3749	case EOPNOTSUPP:
3750		return (NFS3ERR_NOTSUPP);
3751	case EREMOTE:
3752		return (NFS3ERR_REMOTE);
3753	case EOVERFLOW:
3754		return (NFS3ERR_INVAL);
3755	default:
3756		return ((enum nfsstat3)error);
3757	}
3758#endif
3759}
3760
3761int
3762geterrno3(enum nfsstat3 status)
3763{
3764
3765#ifdef DEBUG
3766	switch (status) {
3767	case NFS3_OK:
3768		return (0);
3769	case NFS3ERR_PERM:
3770		return (EPERM);
3771	case NFS3ERR_NOENT:
3772		return (ENOENT);
3773	case NFS3ERR_IO:
3774		return (EIO);
3775	case NFS3ERR_NXIO:
3776		return (ENXIO);
3777	case NFS3ERR_ACCES:
3778		return (EACCES);
3779	case NFS3ERR_EXIST:
3780		return (EEXIST);
3781	case NFS3ERR_XDEV:
3782		return (EXDEV);
3783	case NFS3ERR_NODEV:
3784		return (ENODEV);
3785	case NFS3ERR_NOTDIR:
3786		return (ENOTDIR);
3787	case NFS3ERR_ISDIR:
3788		return (EISDIR);
3789	case NFS3ERR_INVAL:
3790		return (EINVAL);
3791	case NFS3ERR_FBIG:
3792		return (EFBIG);
3793	case NFS3ERR_NOSPC:
3794		return (ENOSPC);
3795	case NFS3ERR_ROFS:
3796		return (EROFS);
3797	case NFS3ERR_MLINK:
3798		return (EMLINK);
3799	case NFS3ERR_NAMETOOLONG:
3800		return (ENAMETOOLONG);
3801	case NFS3ERR_NOTEMPTY:
3802		return (ENOTEMPTY);
3803	case NFS3ERR_DQUOT:
3804		return (EDQUOT);
3805	case NFS3ERR_STALE:
3806		return (ESTALE);
3807	case NFS3ERR_REMOTE:
3808		return (EREMOTE);
3809	case NFS3ERR_BADHANDLE:
3810		return (ESTALE);
3811	case NFS3ERR_NOT_SYNC:
3812		return (EINVAL);
3813	case NFS3ERR_BAD_COOKIE:
3814		return (ENOENT);
3815	case NFS3ERR_NOTSUPP:
3816		return (EOPNOTSUPP);
3817	case NFS3ERR_TOOSMALL:
3818		return (EINVAL);
3819	case NFS3ERR_SERVERFAULT:
3820		return (EIO);
3821	case NFS3ERR_BADTYPE:
3822		return (EINVAL);
3823	case NFS3ERR_JUKEBOX:
3824		return (ENXIO);
3825	default:
3826		zcmn_err(getzoneid(), CE_WARN,
3827		    "geterrno3: got status %d", status);
3828		return ((int)status);
3829	}
3830#else
3831	switch (status) {
3832	case NFS3ERR_NAMETOOLONG:
3833		return (ENAMETOOLONG);
3834	case NFS3ERR_NOTEMPTY:
3835		return (ENOTEMPTY);
3836	case NFS3ERR_DQUOT:
3837		return (EDQUOT);
3838	case NFS3ERR_STALE:
3839	case NFS3ERR_BADHANDLE:
3840		return (ESTALE);
3841	case NFS3ERR_NOTSUPP:
3842		return (EOPNOTSUPP);
3843	case NFS3ERR_REMOTE:
3844		return (EREMOTE);
3845	case NFS3ERR_NOT_SYNC:
3846	case NFS3ERR_TOOSMALL:
3847	case NFS3ERR_BADTYPE:
3848		return (EINVAL);
3849	case NFS3ERR_BAD_COOKIE:
3850		return (ENOENT);
3851	case NFS3ERR_SERVERFAULT:
3852		return (EIO);
3853	case NFS3ERR_JUKEBOX:
3854		return (ENXIO);
3855	default:
3856		return ((int)status);
3857	}
3858#endif
3859}
3860
3861rddir_cache *
3862rddir_cache_alloc(int flags)
3863{
3864	rddir_cache *rc;
3865
3866	rc = kmem_alloc(sizeof (*rc), flags);
3867	if (rc != NULL) {
3868		rc->entries = NULL;
3869		rc->flags = RDDIR;
3870		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3871		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3872		rc->count = 1;
3873#ifdef DEBUG
3874		atomic_inc_64(&clstat_debug.dirent.value.ui64);
3875#endif
3876	}
3877	return (rc);
3878}
3879
3880static void
3881rddir_cache_free(rddir_cache *rc)
3882{
3883
3884#ifdef DEBUG
3885	atomic_dec_64(&clstat_debug.dirent.value.ui64);
3886#endif
3887	if (rc->entries != NULL) {
3888#ifdef DEBUG
3889		rddir_cache_buf_free(rc->entries, rc->buflen);
3890#else
3891		kmem_free(rc->entries, rc->buflen);
3892#endif
3893	}
3894	cv_destroy(&rc->cv);
3895	mutex_destroy(&rc->lock);
3896	kmem_free(rc, sizeof (*rc));
3897}
3898
3899void
3900rddir_cache_hold(rddir_cache *rc)
3901{
3902
3903	mutex_enter(&rc->lock);
3904	rc->count++;
3905	mutex_exit(&rc->lock);
3906}
3907
3908void
3909rddir_cache_rele(rddir_cache *rc)
3910{
3911
3912	mutex_enter(&rc->lock);
3913	ASSERT(rc->count > 0);
3914	if (--rc->count == 0) {
3915		mutex_exit(&rc->lock);
3916		rddir_cache_free(rc);
3917	} else
3918		mutex_exit(&rc->lock);
3919}
3920
3921#ifdef DEBUG
3922char *
3923rddir_cache_buf_alloc(size_t size, int flags)
3924{
3925	char *rc;
3926
3927	rc = kmem_alloc(size, flags);
3928	if (rc != NULL)
3929		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3930	return (rc);
3931}
3932
3933void
3934rddir_cache_buf_free(void *addr, size_t size)
3935{
3936
3937	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3938	kmem_free(addr, size);
3939}
3940#endif
3941
3942static int
3943nfs_free_data_reclaim(rnode_t *rp)
3944{
3945	char *contents;
3946	int size;
3947	vsecattr_t *vsp;
3948	nfs3_pathconf_info *info;
3949	int freed;
3950	cred_t *cred;
3951
3952	/*
3953	 * Free any held credentials and caches which
3954	 * may be associated with this rnode.
3955	 */
3956	mutex_enter(&rp->r_statelock);
3957	cred = rp->r_cred;
3958	rp->r_cred = NULL;
3959	contents = rp->r_symlink.contents;
3960	size = rp->r_symlink.size;
3961	rp->r_symlink.contents = NULL;
3962	vsp = rp->r_secattr;
3963	rp->r_secattr = NULL;
3964	info = rp->r_pathconf;
3965	rp->r_pathconf = NULL;
3966	mutex_exit(&rp->r_statelock);
3967
3968	if (cred != NULL)
3969		crfree(cred);
3970
3971	/*
3972	 * Free the access cache entries.
3973	 */
3974	freed = nfs_access_purge_rp(rp);
3975
3976	if (!HAVE_RDDIR_CACHE(rp) &&
3977	    contents == NULL &&
3978	    vsp == NULL &&
3979	    info == NULL)
3980		return (freed);
3981
3982	/*
3983	 * Free the readdir cache entries
3984	 */
3985	if (HAVE_RDDIR_CACHE(rp))
3986		nfs_purge_rddir_cache(RTOV(rp));
3987
3988	/*
3989	 * Free the symbolic link cache.
3990	 */
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}
3995
3996	/*
3997	 * Free any cached ACL.
3998	 */
3999	if (vsp != NULL)
4000		nfs_acl_free(vsp);
4001
4002	/*
4003	 * Free any cached pathconf information.
4004	 */
4005	if (info != NULL)
4006		kmem_free(info, sizeof (*info));
4007
4008	return (1);
4009}
4010
4011static int
4012nfs_active_data_reclaim(rnode_t *rp)
4013{
4014	char *contents;
4015	int size;
4016	vsecattr_t *vsp;
4017	nfs3_pathconf_info *info;
4018	int freed;
4019
4020	/*
4021	 * Free any held credentials and caches which
4022	 * may be associated with this rnode.
4023	 */
4024	if (!mutex_tryenter(&rp->r_statelock))
4025		return (0);
4026	contents = rp->r_symlink.contents;
4027	size = rp->r_symlink.size;
4028	rp->r_symlink.contents = NULL;
4029	vsp = rp->r_secattr;
4030	rp->r_secattr = NULL;
4031	info = rp->r_pathconf;
4032	rp->r_pathconf = NULL;
4033	mutex_exit(&rp->r_statelock);
4034
4035	/*
4036	 * Free the access cache entries.
4037	 */
4038	freed = nfs_access_purge_rp(rp);
4039
4040	if (!HAVE_RDDIR_CACHE(rp) &&
4041	    contents == NULL &&
4042	    vsp == NULL &&
4043	    info == NULL)
4044		return (freed);
4045
4046	/*
4047	 * Free the readdir cache entries
4048	 */
4049	if (HAVE_RDDIR_CACHE(rp))
4050		nfs_purge_rddir_cache(RTOV(rp));
4051
4052	/*
4053	 * Free the symbolic link cache.
4054	 */
4055	if (contents != NULL) {
4056
4057		kmem_free((void *)contents, size);
4058	}
4059
4060	/*
4061	 * Free any cached ACL.
4062	 */
4063	if (vsp != NULL)
4064		nfs_acl_free(vsp);
4065
4066	/*
4067	 * Free any cached pathconf information.
4068	 */
4069	if (info != NULL)
4070		kmem_free(info, sizeof (*info));
4071
4072	return (1);
4073}
4074
4075static int
4076nfs_free_reclaim(void)
4077{
4078	int freed;
4079	rnode_t *rp;
4080
4081#ifdef DEBUG
4082	clstat_debug.f_reclaim.value.ui64++;
4083#endif
4084	freed = 0;
4085	mutex_enter(&rpfreelist_lock);
4086	rp = rpfreelist;
4087	if (rp != NULL) {
4088		do {
4089			if (nfs_free_data_reclaim(rp))
4090				freed = 1;
4091		} while ((rp = rp->r_freef) != rpfreelist);
4092	}
4093	mutex_exit(&rpfreelist_lock);
4094	return (freed);
4095}
4096
4097static int
4098nfs_active_reclaim(void)
4099{
4100	int freed;
4101	int index;
4102	rnode_t *rp;
4103
4104#ifdef DEBUG
4105	clstat_debug.a_reclaim.value.ui64++;
4106#endif
4107	freed = 0;
4108	for (index = 0; index < rtablesize; index++) {
4109		rw_enter(&rtable[index].r_lock, RW_READER);
4110		for (rp = rtable[index].r_hashf;
4111		    rp != (rnode_t *)(&rtable[index]);
4112		    rp = rp->r_hashf) {
4113			if (nfs_active_data_reclaim(rp))
4114				freed = 1;
4115		}
4116		rw_exit(&rtable[index].r_lock);
4117	}
4118	return (freed);
4119}
4120
4121static int
4122nfs_rnode_reclaim(void)
4123{
4124	int freed;
4125	rnode_t *rp;
4126	vnode_t *vp;
4127
4128#ifdef DEBUG
4129	clstat_debug.r_reclaim.value.ui64++;
4130#endif
4131	freed = 0;
4132	mutex_enter(&rpfreelist_lock);
4133	while ((rp = rpfreelist) != NULL) {
4134		rp_rmfree(rp);
4135		mutex_exit(&rpfreelist_lock);
4136		if (rp->r_flags & RHASHED) {
4137			vp = RTOV(rp);
4138			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4139			mutex_enter(&vp->v_lock);
4140			if (vp->v_count > 1) {
4141				VN_RELE_LOCKED(vp);
4142				mutex_exit(&vp->v_lock);
4143				rw_exit(&rp->r_hashq->r_lock);
4144				mutex_enter(&rpfreelist_lock);
4145				continue;
4146			}
4147			mutex_exit(&vp->v_lock);
4148			rp_rmhash_locked(rp);
4149			rw_exit(&rp->r_hashq->r_lock);
4150		}
4151		/*
4152		 * This call to rp_addfree will end up destroying the
4153		 * rnode, but in a safe way with the appropriate set
4154		 * of checks done.
4155		 */
4156		rp_addfree(rp, CRED());
4157		mutex_enter(&rpfreelist_lock);
4158	}
4159	mutex_exit(&rpfreelist_lock);
4160	return (freed);
4161}
4162
4163/*ARGSUSED*/
4164static void
4165nfs_reclaim(void *cdrarg)
4166{
4167
4168#ifdef DEBUG
4169	clstat_debug.reclaim.value.ui64++;
4170#endif
4171	if (nfs_free_reclaim())
4172		return;
4173
4174	if (nfs_active_reclaim())
4175		return;
4176
4177	(void) nfs_rnode_reclaim();
4178}
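
/*
 * nfs_reclaim() is intended to run as the reclaim callback of the
 * client's kmem cache(s), so that the allocator can ask the client to
 * shed cached rnode data when the system runs low on memory.  A rough
 * sketch of how such a callback is registered; the cache name and the
 * other arguments here are illustrative assumptions, not a quote of
 * the actual initialization code:
 *
 *	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
 *	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
 */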
4179
4180/*
4181 * NFS client failover support
4182 *
4183 * Routines to copy filehandles
4184 */
4185void
4186nfscopyfh(caddr_t fhp, vnode_t *vp)
4187{
4188	fhandle_t *dest = (fhandle_t *)fhp;
4189
4190	if (dest != NULL)
4191		*dest = *VTOFH(vp);
4192}
4193
4194void
4195nfs3copyfh(caddr_t fhp, vnode_t *vp)
4196{
4197	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4198
4199	if (dest != NULL)
4200		*dest = *VTOFH3(vp);
4201}
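
/*
 * These copy routines are meant to be plugged into a failinfo_t as the
 * copyproc, so that after failover_remap() has patched the rnode's
 * filehandle the copy embedded in the caller's RPC arguments can be
 * refreshed as well (see the call through fi->copyproc at the end of
 * failover_remap()).  A rough sketch of how a failover-capable NFSv3
 * call might fill in the structure; the argument type and the
 * lookup/xattr routine names below are assumptions for illustration:
 *
 *	GETATTR3args args;
 *	failinfo_t fi;
 *
 *	args.object = *VTOFH3(vp);
 *	fi.vp = vp;
 *	fi.fhp = (caddr_t)&args.object;
 *	fi.copyproc = nfs3copyfh;
 *	fi.lookupproc = nfs3lookup;
 *	fi.xattrdirproc = acl_getxattrdir3;
 */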
4202
4203/*
4204 * NFS client failover support
4205 *
4206 * failover_safe() will test various conditions to ensure that
4207 * failover is permitted for this vnode.  It will be denied
4208 * if:
4209 *	1) the operation in progress does not support failover (NULL fi)
4210 *	2) there are no available replicas (NULL mi_servers->sv_next)
4211 *	3) the rnode has no partial path to remap with (NULL r_path)
4212 */
4213static int
4214failover_safe(failinfo_t *fi)
4215{
4216
4217	/*
4218	 * Does this op permit failover?
4219	 */
4220	if (fi == NULL || fi->vp == NULL)
4221		return (0);
4222
4223	/*
4224	 * Are there any alternates to failover to?
4225	 */
4226	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4227		return (0);
4228
4229	/*
4230	 * Disable check; we've forced local locking
4231	 *
4232	 * if (flk_has_remote_locks(fi->vp))
4233	 *	return (0);
4234	 */
4235
4236	/*
4237	 * If we have no partial path, we can't do anything
4238	 */
4239	if (VTOR(fi->vp)->r_path == NULL)
4240		return (0);
4241
4242	return (1);
4243}
4244
4245#include <sys/thread.h>
4246
4247/*
4248 * NFS client failover support
4249 *
4250 * failover_newserver() will start a search for a new server,
4251 * preferably by starting an async thread to do the work.  If
4252 * someone is already doing this (recognizable by MI_BINDINPROG
4253 * being set), it will simply return and the calling thread
4254 * will queue on the mi_failover_cv condition variable.
4255 */
4256static void
4257failover_newserver(mntinfo_t *mi)
4258{
4259	/*
4260	 * Check if someone else is doing this already
4261	 */
4262	mutex_enter(&mi->mi_lock);
4263	if (mi->mi_flags & MI_BINDINPROG) {
4264		mutex_exit(&mi->mi_lock);
4265		return;
4266	}
4267	mi->mi_flags |= MI_BINDINPROG;
4268
4269	/*
4270	 * Need to hold the vfs struct so that it can't be released
4271	 * while the failover thread is selecting a new server.
4272	 */
4273	VFS_HOLD(mi->mi_vfsp);
4274
4275	/*
4276	 * Start a thread to do the real searching.
4277	 */
4278	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4279
4280	mutex_exit(&mi->mi_lock);
4281}
4282
4283/*
4284 * NFS client failover support
4285 *
4286 * failover_thread() will find a new server to replace the one
4287 * currently in use, wake up other threads waiting on this mount
4288 * point, and die.  It will start at the head of the server list
4289 * and poll servers until it finds one with an NFS server which is
4290 * registered and responds to a NULL procedure ping.
4291 *
4292 * XXX failover_thread is unsafe within the scope of the
4293 * present model defined for cpr to suspend the system.
4294 * Specifically, over-the-wire calls made by the thread
4295 * are unsafe. The thread needs to be reevaluated in case of
4296 * future updates to the cpr suspend model.
4297 */
4298static void
4299failover_thread(mntinfo_t *mi)
4300{
4301	servinfo_t *svp = NULL;
4302	CLIENT *cl;
4303	enum clnt_stat status;
4304	struct timeval tv;
4305	int error;
4306	int oncethru = 0;
4307	callb_cpr_t cprinfo;
4308	rnode_t *rp;
4309	int index;
4310	char *srvnames;
4311	size_t srvnames_len;
4312	struct nfs_clnt *nfscl = NULL;
4313	zoneid_t zoneid = getzoneid();
4314
4315#ifdef DEBUG
4316	/*
4317	 * This is currently only needed to access counters which exist on
4318	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4319	 * on non-DEBUG kernels.
4320	 */
4321	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4322	ASSERT(nfscl != NULL);
4323#endif
4324
4325	/*
4326	 * It's safe to piggyback on the mi_lock since failover_newserver()
4327	 * code guarantees that there will be only one failover thread
4328	 * per mntinfo at any one time.
4329	 */
4330	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4331	    "failover_thread");
4332
4333	mutex_enter(&mi->mi_lock);
4334	while (mi->mi_readers) {
4335		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4336		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4337		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4338	}
4339	mutex_exit(&mi->mi_lock);
4340
4341	tv.tv_sec = 2;
4342	tv.tv_usec = 0;
4343
4344	/*
4345	 * Ping the null NFS procedure of every server in
4346	 * the list until one responds.  We always start
4347	 * at the head of the list and always skip the one
4348	 * that is current, since it's caused us a problem.
4349	 */
4350	while (svp == NULL) {
4351		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4352			if (!oncethru && svp == mi->mi_curr_serv)
4353				continue;
4354
4355			/*
4356			 * If the file system was forcibly umounted
4357			 * while trying to do a failover, then just
4358			 * give up on the failover.  It won't matter
4359			 * what the server is.
4360			 */
4361			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4362				svp = NULL;
4363				goto done;
4364			}
4365
4366			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4367			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4368			if (error)
4369				continue;
4370
4371			if (!(mi->mi_flags & MI_INT))
4372				cl->cl_nosignal = TRUE;
4373			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4374			    xdr_void, NULL, tv);
4375			if (!(mi->mi_flags & MI_INT))
4376				cl->cl_nosignal = FALSE;
4377			AUTH_DESTROY(cl->cl_auth);
4378			CLNT_DESTROY(cl);
4379			if (status == RPC_SUCCESS) {
4380				if (svp == mi->mi_curr_serv) {
4381#ifdef DEBUG
4382					zcmn_err(zoneid, CE_NOTE,
4383			"NFS%d: failing over: selecting original server %s",
4384					    mi->mi_vers, svp->sv_hostname);
4385#else
4386					zcmn_err(zoneid, CE_NOTE,
4387			"NFS: failing over: selecting original server %s",
4388					    svp->sv_hostname);
4389#endif
4390				} else {
4391#ifdef DEBUG
4392					zcmn_err(zoneid, CE_NOTE,
4393				    "NFS%d: failing over from %s to %s",
4394					    mi->mi_vers,
4395					    mi->mi_curr_serv->sv_hostname,
4396					    svp->sv_hostname);
4397#else
4398					zcmn_err(zoneid, CE_NOTE,
4399				    "NFS: failing over from %s to %s",
4400					    mi->mi_curr_serv->sv_hostname,
4401					    svp->sv_hostname);
4402#endif
4403				}
4404				break;
4405			}
4406		}
4407
4408		if (svp == NULL) {
4409			if (!oncethru) {
4410				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4411#ifdef DEBUG
4412				zprintf(zoneid,
4413				    "NFS%d servers %s not responding "
4414				    "still trying\n", mi->mi_vers, srvnames);
4415#else
4416				zprintf(zoneid, "NFS servers %s not responding "
4417				    "still trying\n", srvnames);
4418#endif
4419				oncethru = 1;
4420			}
4421			mutex_enter(&mi->mi_lock);
4422			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4423			mutex_exit(&mi->mi_lock);
4424			delay(hz);
4425			mutex_enter(&mi->mi_lock);
4426			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4427			mutex_exit(&mi->mi_lock);
4428		}
4429	}
4430
4431	if (oncethru) {
4432#ifdef DEBUG
4433		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4434#else
4435		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4436#endif
4437	}
4438
4439	if (svp != mi->mi_curr_serv) {
4440		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4441		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4442		rw_enter(&rtable[index].r_lock, RW_WRITER);
4443		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4444		    mi->mi_vfsp);
4445		if (rp != NULL) {
4446			if (rp->r_flags & RHASHED)
4447				rp_rmhash_locked(rp);
4448			rw_exit(&rtable[index].r_lock);
4449			rp->r_server = svp;
4450			rp->r_fh = svp->sv_fhandle;
4451			(void) nfs_free_data_reclaim(rp);
4452			index = rtablehash(&rp->r_fh);
4453			rp->r_hashq = &rtable[index];
4454			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4455			vn_exists(RTOV(rp));
4456			rp_addhash(rp);
4457			rw_exit(&rp->r_hashq->r_lock);
4458			VN_RELE(RTOV(rp));
4459		} else
4460			rw_exit(&rtable[index].r_lock);
4461	}
4462
4463done:
4464	if (oncethru)
4465		kmem_free(srvnames, srvnames_len);
4466	mutex_enter(&mi->mi_lock);
4467	mi->mi_flags &= ~MI_BINDINPROG;
4468	if (svp != NULL) {
4469		mi->mi_curr_serv = svp;
4470		mi->mi_failover++;
4471#ifdef DEBUG
4472		nfscl->nfscl_stat.failover.value.ui64++;
4473#endif
4474	}
4475	cv_broadcast(&mi->mi_failover_cv);
4476	CALLB_CPR_EXIT(&cprinfo);
4477	VFS_RELE(mi->mi_vfsp);
4478	zthread_exit();
4479	/* NOTREACHED */
4480}
4481
4482/*
4483 * NFS client failover support
4484 *
4485 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4486 * is cleared, meaning that failover is complete.  Called with
4487 * mi_lock mutex held.
4488 */
4489static int
4490failover_wait(mntinfo_t *mi)
4491{
4492	k_sigset_t smask;
4493
4494	/*
4495	 * If someone else is hunting for a living server,
4496	 * sleep until it's done.  After our sleep, we may
4497	 * be bound to the right server and get off cheaply.
4498	 */
4499	while (mi->mi_flags & MI_BINDINPROG) {
4500		/*
4501		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4502		 * and SIGTERM, preserving the existing signal mask.
4503		 * SIGINT is masked out too if the nointr mount option is set.
4504		 */
4505		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4506		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4507			/*
4508			 * restore original signal mask
4509			 */
4510			sigunintr(&smask);
4511			return (EINTR);
4512		}
4513		/*
4514		 * restore original signal mask
4515		 */
4516		sigunintr(&smask);
4517	}
4518	return (0);
4519}
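
/*
 * Taken together, failover_safe(), failover_newserver() and
 * failover_wait() give RPC callers a simple pattern for riding out a
 * server switch.  A simplified sketch of that pattern (real callers
 * differ in their error handling); note that failover_newserver()
 * takes mi_lock itself and so must be called without it held:
 *
 *	mutex_enter(&mi->mi_lock);
 *	if (failover_safe(fi)) {
 *		if (failover_wait(mi) != 0) {
 *			mutex_exit(&mi->mi_lock);
 *			return (EINTR);
 *		}
 *	}
 *	mutex_exit(&mi->mi_lock);
 *
 *	... issue the RPC to mi->mi_curr_serv; if that server has
 *	stopped responding, call failover_newserver(mi) and retry
 *	from the top ...
 */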
4520
4521/*
4522 * NFS client failover support
4523 *
4524 * failover_remap() will do a partial pathname lookup and find the
4525 * desired vnode on the current server.  The interim vnode will be
4526 * discarded after we pilfer the new filehandle.
4527 *
4528 * Side effects:
4529 * - This routine will also update the filehandle in the args structure
4530 *    pointed to by the fi->fhp pointer if it is non-NULL.
4531 */
4532
4533static int
4534failover_remap(failinfo_t *fi)
4535{
4536	vnode_t *vp, *nvp, *rootvp;
4537	rnode_t *rp, *nrp;
4538	mntinfo_t *mi;
4539	int error;
4540#ifdef DEBUG
4541	struct nfs_clnt *nfscl;
4542
4543	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4544	ASSERT(nfscl != NULL);
4545#endif
4546	/*
4547	 * Sanity check
4548	 */
4549	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4550		return (EINVAL);
4551	vp = fi->vp;
4552	rp = VTOR(vp);
4553	mi = VTOMI(vp);
4554
4555	if (!(vp->v_flag & VROOT)) {
4556		/*
4557		 * Starting from the root vnode, use the path stored in
4558		 * the rnode to find the filehandle on the new server.
4559		 */
4560		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4561		if (error)
4562			return (error);
4563
4564		error = failover_lookup(rp->r_path, rootvp,
4565		    fi->lookupproc, fi->xattrdirproc, &nvp);
4566
4567		VN_RELE(rootvp);
4568
4569		if (error)
4570			return (error);
4571
4572		/*
4573		 * If we found the same rnode, we're done now
4574		 */
4575		if (nvp == vp) {
4576			/*
4577			 * The failed server and the new server may be the
4578			 * same physical machine or may share the same disk
4579			 * subsystem.  In that case the filehandle for a given
4580			 * file path does not change, so the lookup of the
4581			 * same filehandle will always locate the same rnode
4582			 * as the existing one.  All we may need to do is
4583			 * update r_server with the current servinfo.
4584			 */
4585			if (!VALID_FH(fi)) {
4586				rp->r_server = mi->mi_curr_serv;
4587			}
4588			VN_RELE(nvp);
4589			return (0);
4590		}
4591
4592		/*
4593		 * Try to make it so that no one else will find this
4594		 * vnode because it is just a temporary to hold the
4595		 * new file handle until that file handle can be
4596		 * copied to the original vnode/rnode.
4597		 */
4598		nrp = VTOR(nvp);
4599		mutex_enter(&mi->mi_remap_lock);
4600		/*
4601		 * Another thread could have raced in and already done
4602		 * the remap for this particular rnode.  Check whether
4603		 * rp->r_server already matches mi->mi_curr_serv and
4604		 * return if it does.
4605		 */
4606		if (VALID_FH(fi)) {
4607			mutex_exit(&mi->mi_remap_lock);
4608			VN_RELE(nvp);
4609			return (0);
4610		}
4611
4612		if (nrp->r_flags & RHASHED)
4613			rp_rmhash(nrp);
4614
4615		/*
4616		 * As a heuristic check on the validity of the new
4617		 * file, check that its size and type match what we
4618		 * remember from the old version.
4619		 */
4620		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4621			mutex_exit(&mi->mi_remap_lock);
4622			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4623			    "NFS replicas %s and %s: file %s not same.",
4624			    rp->r_server->sv_hostname,
4625			    nrp->r_server->sv_hostname, rp->r_path);
4626			VN_RELE(nvp);
4627			return (EINVAL);
4628		}
4629
4630		/*
4631		 * Snarf the filehandle from the new rnode, then
4632		 * release it, updating the hash queues for the
4633		 * rnode as we go.
4634		 */
4635		if (rp->r_flags & RHASHED)
4636			rp_rmhash(rp);
4637		rp->r_server = mi->mi_curr_serv;
4638		rp->r_fh = nrp->r_fh;
4639		rp->r_hashq = nrp->r_hashq;
4640		/*
4641		 * Copy the attributes from the new rnode to the old
4642		 * rnode.  This will help to reduce unnecessary page
4643		 * cache flushes.
4644		 */
4645		rp->r_attr = nrp->r_attr;
4646		rp->r_attrtime = nrp->r_attrtime;
4647		rp->r_mtime = nrp->r_mtime;
4648		(void) nfs_free_data_reclaim(rp);
4649		nfs_setswaplike(vp, &rp->r_attr);
4650		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4651		rp_addhash(rp);
4652		rw_exit(&rp->r_hashq->r_lock);
4653		mutex_exit(&mi->mi_remap_lock);
4654		VN_RELE(nvp);
4655	}
4656
4657	/*
4658	 * Update successful failover remap count
4659	 */
4660	mutex_enter(&mi->mi_lock);
4661	mi->mi_remap++;
4662	mutex_exit(&mi->mi_lock);
4663#ifdef DEBUG
4664	nfscl->nfscl_stat.remap.value.ui64++;
4665#endif
4666
4667	/*
4668	 * If we have a copied filehandle to update, do it now.
4669	 */
4670	if (fi->fhp != NULL && fi->copyproc != NULL)
4671		(*fi->copyproc)(fi->fhp, vp);
4672
4673	return (0);
4674}
4675
4676/*
4677 * NFS client failover support
4678 *
4679 * We want a simple pathname lookup routine to parse the pieces
4680 * of path in rp->r_path.  We know that the path was created
4681 * as rnodes were made, so we know we have only to deal with
4682 * paths that look like:
4683 *	dir1/dir2/dir3/file
4684 * Any evidence of anything like "..", symlinks, or ENOTDIR
4685 * is a hard error, because it means something in this filesystem
4686 * is different from the one we came from, or has changed under
4687 * us in some way.  If this is true, we want the failure.
4688 *
4689 * Extended attributes: if the filesystem is mounted with extended
4690 * attributes enabled (-o xattr), the attribute directory will be
4691 * represented in the r_path as the magic name XATTR_RPATH. So if
4692 * we see that name in the pathname, it must be because this node
4693 * is an extended attribute.  Therefore, look it up that way.
4694 */
4695static int
4696failover_lookup(char *path, vnode_t *root,
4697    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4698    vnode_t *, cred_t *, int),
4699    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4700    vnode_t **new)
4701{
4702	vnode_t *dvp, *nvp;
4703	int error = EINVAL;
4704	char *s, *p, *tmppath;
4705	size_t len;
4706	mntinfo_t *mi;
4707	bool_t xattr;
4708
4709	/* Make local copy of path */
4710	len = strlen(path) + 1;
4711	tmppath = kmem_alloc(len, KM_SLEEP);
4712	(void) strcpy(tmppath, path);
4713	s = tmppath;
4714
4715	dvp = root;
4716	VN_HOLD(dvp);
4717	mi = VTOMI(root);
4718	xattr = mi->mi_flags & MI_EXTATTR;
4719
4720	do {
4721		p = strchr(s, '/');
4722		if (p != NULL)
4723			*p = '\0';
4724		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4725			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4726			    RFSCALL_SOFT);
4727		} else {
4728			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4729			    CRED(), RFSCALL_SOFT);
4730		}
4731		if (p != NULL)
4732			*p++ = '/';
4733		if (error) {
4734			VN_RELE(dvp);
4735			kmem_free(tmppath, len);
4736			return (error);
4737		}
4738		s = p;
4739		VN_RELE(dvp);
4740		dvp = nvp;
4741	} while (p != NULL);
4742
4743	if (nvp != NULL && new != NULL)
4744		*new = nvp;
4745	kmem_free(tmppath, len);
4746	return (0);
4747}
4748
4749/*
4750 * NFS client failover support
4751 *
4752 * sv_free() frees the malloc'd portion of a "servinfo_t".
4753 */
4754void
4755sv_free(servinfo_t *svp)
4756{
4757	servinfo_t *next;
4758	struct knetconfig *knconf;
4759
4760	while (svp != NULL) {
4761		next = svp->sv_next;
4762		if (svp->sv_secdata)
4763			sec_clnt_freeinfo(svp->sv_secdata);
4764		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4765			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4766		knconf = svp->sv_knconf;
4767		if (knconf != NULL) {
4768			if (knconf->knc_protofmly != NULL)
4769				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4770			if (knconf->knc_proto != NULL)
4771				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4772			kmem_free(knconf, sizeof (*knconf));
4773		}
4774		knconf = svp->sv_origknconf;
4775		if (knconf != NULL) {
4776			if (knconf->knc_protofmly != NULL)
4777				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4778			if (knconf->knc_proto != NULL)
4779				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4780			kmem_free(knconf, sizeof (*knconf));
4781		}
4782		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4783			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4784		mutex_destroy(&svp->sv_lock);
4785		kmem_free(svp, sizeof (*svp));
4786		svp = next;
4787	}
4788}
4789
4790/*
4791 * Can only return non-zero if intr != 0.
4792 */
4793int
4794nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4795{
4796
4797	mutex_enter(&l->lock);
4798
4799	/*
4800	 * If this is a nested enter, then allow it.  There
4801	 * must be as many exits as enters through.
4802	 */
4803	if (l->owner == curthread) {
4804		/* lock is held for writing by current thread */
4805		ASSERT(rw == RW_READER || rw == RW_WRITER);
4806		l->count--;
4807	} else if (rw == RW_READER) {
4808		/*
4809		 * While there is a writer active or writers waiting,
4810		 * then wait for them to finish up and move on.  Then,
4811		 * increment the count to indicate that a reader is
4812		 * active.
4813		 */
4814		while (l->count < 0 || l->waiters > 0) {
4815			if (intr) {
4816				klwp_t *lwp = ttolwp(curthread);
4817
4818				if (lwp != NULL)
4819					lwp->lwp_nostop++;
4820				if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4821					if (lwp != NULL)
4822						lwp->lwp_nostop--;
4823					mutex_exit(&l->lock);
4824					return (EINTR);
4825				}
4826				if (lwp != NULL)
4827					lwp->lwp_nostop--;
4828			} else
4829				cv_wait(&l->cv_rd, &l->lock);
4830		}
4831		ASSERT(l->count < INT_MAX);
4832#ifdef	DEBUG
4833		if ((l->count % 10000) == 9999)
4834			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4835			    "rwlock @ %p\n", l->count, (void *)l);
4836#endif
4837		l->count++;
4838	} else {
4839		ASSERT(rw == RW_WRITER);
4840		/*
4841		 * While there are readers active or a writer
4842		 * active, then wait for all of the readers
4843		 * to finish or for the writer to finish.
4844		 * Then, set the owner field to curthread and
4845		 * decrement count to indicate that a writer
4846		 * is active.
4847		 */
4848		while (l->count != 0) {
4849			l->waiters++;
4850			if (intr) {
4851				klwp_t *lwp = ttolwp(curthread);
4852
4853				if (lwp != NULL)
4854					lwp->lwp_nostop++;
4855				if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4856					if (lwp != NULL)
4857						lwp->lwp_nostop--;
4858					l->waiters--;
4859					/*
4860					 * If there are readers active and no
4861					 * writers waiting then wake up all of
4862					 * the waiting readers (if any).
4863					 */
4864					if (l->count > 0 && l->waiters == 0)
4865						cv_broadcast(&l->cv_rd);
4866					mutex_exit(&l->lock);
4867					return (EINTR);
4868				}
4869				if (lwp != NULL)
4870					lwp->lwp_nostop--;
4871			} else
4872				cv_wait(&l->cv, &l->lock);
4873			l->waiters--;
4874		}
4875		ASSERT(l->owner == NULL);
4876		l->owner = curthread;
4877		l->count--;
4878	}
4879
4880	mutex_exit(&l->lock);
4881
4882	return (0);
4883}
4884
4885/*
4886 * If the lock is available, obtain it and return non-zero.  If there is
4887 * already a conflicting lock, return 0 immediately.
4888 */
4889
4890int
4891nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4892{
4893	mutex_enter(&l->lock);
4894
4895	/*
4896	 * If this is a nested enter, then allow it.  There
4897	 * must be as many exits as enters through.
4898	 */
4899	if (l->owner == curthread) {
4900		/* lock is held for writing by current thread */
4901		ASSERT(rw == RW_READER || rw == RW_WRITER);
4902		l->count--;
4903	} else if (rw == RW_READER) {
4904		/*
4905		 * If there is a writer active or writers waiting, deny the
4906		 * lock.  Otherwise, bump the count of readers.
4907		 */
4908		if (l->count < 0 || l->waiters > 0) {
4909			mutex_exit(&l->lock);
4910			return (0);
4911		}
4912		l->count++;
4913	} else {
4914		ASSERT(rw == RW_WRITER);
4915		/*
4916		 * If there are readers active or a writer active, deny the
4917		 * lock.  Otherwise, set the owner field to curthread and
4918		 * decrement count to indicate that a writer is active.
4919		 */
4920		if (l->count != 0) {
4921			mutex_exit(&l->lock);
4922			return (0);
4923		}
4924		ASSERT(l->owner == NULL);
4925		l->owner = curthread;
4926		l->count--;
4927	}
4928
4929	mutex_exit(&l->lock);
4930
4931	return (1);
4932}
4933
4934void
4935nfs_rw_exit(nfs_rwlock_t *l)
4936{
4937
4938	mutex_enter(&l->lock);
4939
4940	if (l->owner != NULL) {
4941		ASSERT(l->owner == curthread);
4942
4943		/*
4944		 * To release a writer lock increment count to indicate that
4945		 * there is one less writer active.  If this was the last of
4946		 * possibly nested writer locks, then clear the owner field as
4947		 * well to indicate that there is no writer active.
4948		 */
4949		ASSERT(l->count < 0);
4950		l->count++;
4951		if (l->count == 0) {
4952			l->owner = NULL;
4953
4954			/*
4955			 * If there are no writers waiting then wakeup all of
4956			 * the waiting readers (if any).
4957			 */
4958			if (l->waiters == 0)
4959				cv_broadcast(&l->cv_rd);
4960		}
4961	} else {
4962		/*
4963		 * To release a reader lock just decrement count to indicate
4964		 * that there is one less reader active.
4965		 */
4966		ASSERT(l->count > 0);
4967		l->count--;
4968	}
4969
4970	/*
4971	 * If there is neither a reader active nor a writer active and
4972	 * there is a writer waiting, we need to wake it up.
4973	 */
4974	if (l->count == 0 && l->waiters > 0)
4975		cv_signal(&l->cv);
4976	mutex_exit(&l->lock);
4977}
4978
4979int
4980nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4981{
4982
4983	if (rw == RW_READER)
4984		return (l->count > 0);
4985	ASSERT(rw == RW_WRITER);
4986	return (l->count < 0);
4987}
4988
4989/* ARGSUSED */
4990void
4991nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4992{
4993
4994	l->count = 0;
4995	l->waiters = 0;
4996	l->owner = NULL;
4997	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4998	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4999	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
5000}
5001
5002void
5003nfs_rw_destroy(nfs_rwlock_t *l)
5004{
5005
5006	mutex_destroy(&l->lock);
5007	cv_destroy(&l->cv);
5008	cv_destroy(&l->cv_rd);
5009}
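
/*
 * Usage sketch for the nfs_rwlock_t primitives above, assuming a lock
 * embedded in some client structure; the r_rwlock field name and the
 * intr flag are assumptions for illustration:
 *
 *	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
 *
 *	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, intr))
 *		return (EINTR);		-- only possible when intr != 0
 *	... read-side work ...
 *	nfs_rw_exit(&rp->r_rwlock);
 *
 *	nfs_rw_destroy(&rp->r_rwlock);
 */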
5010
5011int
5012nfs3_rddir_compar(const void *x, const void *y)
5013{
5014	rddir_cache *a = (rddir_cache *)x;
5015	rddir_cache *b = (rddir_cache *)y;
5016
5017	if (a->nfs3_cookie == b->nfs3_cookie) {
5018		if (a->buflen == b->buflen)
5019			return (0);
5020		if (a->buflen < b->buflen)
5021			return (-1);
5022		return (1);
5023	}
5024
5025	if (a->nfs3_cookie < b->nfs3_cookie)
5026		return (-1);
5027
5028	return (1);
5029}
5030
5031int
5032nfs_rddir_compar(const void *x, const void *y)
5033{
5034	rddir_cache *a = (rddir_cache *)x;
5035	rddir_cache *b = (rddir_cache *)y;
5036
5037	if (a->nfs_cookie == b->nfs_cookie) {
5038		if (a->buflen == b->buflen)
5039			return (0);
5040		if (a->buflen < b->buflen)
5041			return (-1);
5042		return (1);
5043	}
5044
5045	if (a->nfs_cookie < b->nfs_cookie)
5046		return (-1);
5047
5048	return (1);
5049}
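
/*
 * These comparators are meant to be handed to an AVL tree that keeps a
 * directory's readdir cache entries ordered by cookie and buffer
 * length.  A sketch of how such a tree could be set up; the r_dir
 * anchor in the rnode and the tree linkage field inside rddir_cache
 * are assumptions here:
 *
 *	avl_create(&rp->r_dir, nfs_rddir_compar, sizeof (rddir_cache),
 *	    offsetof(rddir_cache, tree));
 */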
5050
5051static char *
5052nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5053{
5054	servinfo_t *s;
5055	char *srvnames;
5056	char *namep;
5057	size_t length;
5058
5059	/*
5060	 * Calculate the length of the string required to hold all
5061	 * of the server names plus either a comma or a null
5062	 * character following each individual one.
5063	 */
5064	length = 0;
5065	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5066		length += s->sv_hostnamelen;
5067
5068	srvnames = kmem_alloc(length, KM_SLEEP);
5069
5070	namep = srvnames;
5071	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5072		(void) strcpy(namep, s->sv_hostname);
5073		namep += s->sv_hostnamelen - 1;
5074		*namep++ = ',';
5075	}
5076	*--namep = '\0';
5077
5078	*len = length;
5079
5080	return (srvnames);
5081}
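
/*
 * For example, with two servers named "alpha" and "beta",
 * sv_hostnamelen counts the terminating NUL of each name, so the
 * length computed above is 6 + 5 = 11 bytes.  The loop builds
 * "alpha,beta," and the final "*--namep = '\0'" turns the trailing
 * comma into the terminator, leaving "alpha,beta" in exactly the 11
 * bytes reported through *len.
 */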
5082
5083/*
5084 * These two functions are temporary and designed for the upgrade-workaround
5085 * only.  They cannot be used for general zone-crossing NFS client support, and
5086 * will be removed shortly.
5087 *
5088 * When the workaround is enabled, all NFS traffic is forced into the global
5089 * zone.  These functions are called when the code needs to refer to the state
5090 * of the underlying network connection.  They're not called when the function
5091 * needs to refer to the state of the process that invoked the system call.
5092 * (E.g., when checking whether the zone is shutting down during the mount()
5093 * call.)
5094 */
5095
5096struct zone *
5097nfs_zone(void)
5098{
5099	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5100}
5101
5102zoneid_t
5103nfs_zoneid(void)
5104{
5105	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5106}
5107
5108/*
5109 * nfs_mount_label_policy:
5110 *	Determine whether the mount is allowed according to MAC check,
5111 *	by comparing (where appropriate) label of the remote server
5112 *	against the label of the zone being mounted into.
5113 *
5114 *	Returns:
5115 *		 0 :	access allowed
5116 *		-1 :	read-only access allowed (i.e., read-down)
5117 *		>0 :	error code, such as EACCES
5118 */
5119int
5120nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5121    struct knetconfig *knconf, cred_t *cr)
5122{
5123	int		addr_type;
5124	void		*ipaddr;
5125	bslabel_t	*server_sl, *mntlabel;
5126	zone_t		*mntzone = NULL;
5127	ts_label_t	*zlabel;
5128	tsol_tpc_t	*tp;
5129	ts_label_t	*tsl = NULL;
5130	int		retv;
5131
5132	/*
5133	 * Get the zone's label.  Each zone on a labeled system has a label.
5134	 */
5135	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5136	zlabel = mntzone->zone_slabel;
5137	ASSERT(zlabel != NULL);
5138	label_hold(zlabel);
5139
5140	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5141		addr_type = IPV4_VERSION;
5142		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5143	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5144		addr_type = IPV6_VERSION;
5145		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5146	} else {
5147		retv = 0;
5148		goto out;
5149	}
5150
5151	retv = EACCES;				/* assume the worst */
5152
5153	/*
5154	 * Next, get the assigned label of the remote server.
5155	 */
5156	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5157	if (tp == NULL)
5158		goto out;			/* error getting host entry */
5159
5160	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5161		goto rel_tpc;			/* invalid domain */
5162	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5163	    (tp->tpc_tp.host_type != UNLABELED))
5164		goto rel_tpc;			/* invalid hosttype */
5165
5166	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5167		tsl = getflabel_cipso(vfsp);
5168		if (tsl == NULL)
5169			goto rel_tpc;		/* error getting server lbl */
5170
5171		server_sl = label2bslabel(tsl);
5172	} else {	/* UNLABELED */
5173		server_sl = &tp->tpc_tp.tp_def_label;
5174	}
5175
5176	mntlabel = label2bslabel(zlabel);
5177
5178	/*
5179	 * Now compare labels to complete the MAC check.  If the labels
5180	 * are equal or if the requestor is in the global zone and has
5181	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5182	 * mounts into the global zone itself; restrict these to
5183	 * read-only.)
5184	 *
5185	 * If the requestor is in some other zone, but their label
5186	 * dominates the server, then allow read-down.
5187	 *
5188	 * Otherwise, access is denied.
5189	 */
5190	if (blequal(mntlabel, server_sl) ||
5191	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5192	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5193		if ((mntzone == global_zone) ||
5194		    !blequal(mntlabel, server_sl))
5195			retv = -1;		/* read-only */
5196		else
5197			retv = 0;		/* access OK */
5198	} else if (bldominates(mntlabel, server_sl)) {
5199		retv = -1;			/* read-only */
5200	} else {
5201		retv = EACCES;
5202	}
5203
5204	if (tsl != NULL)
5205		label_rele(tsl);
5206
5207rel_tpc:
5208	TPC_RELE(tp);
5209out:
5210	if (mntzone)
5211		zone_rele(mntzone);
5212	label_rele(zlabel);
5213	return (retv);
5214}
5215
5216boolean_t
5217nfs_has_ctty(void)
5218{
5219	boolean_t rv;
5220	mutex_enter(&curproc->p_splock);
5221	rv = (curproc->p_sessp->s_vp != NULL);
5222	mutex_exit(&curproc->p_splock);
5223	return (rv);
5224}
5225
5226/*
5227 * Scan the xattr directory to see if it has any generic user attributes.
5228 */
5229int
5230do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5231{
5232	struct uio uio;
5233	struct iovec iov;
5234	char *dbuf;
5235	struct dirent64 *dp;
5236	size_t dlen = 8 * 1024;
5237	size_t dbuflen;
5238	int eof = 0;
5239	int error;
5240
5241	*valp = 0;
5242	dbuf = kmem_alloc(dlen, KM_SLEEP);
5243	uio.uio_iov = &iov;
5244	uio.uio_iovcnt = 1;
5245	uio.uio_segflg = UIO_SYSSPACE;
5246	uio.uio_fmode = 0;
5247	uio.uio_extflg = UIO_COPY_CACHED;
5248	uio.uio_loffset = 0;
5249	uio.uio_resid = dlen;
5250	iov.iov_base = dbuf;
5251	iov.iov_len = dlen;
5252	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5253	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5254	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5255
5256	dbuflen = dlen - uio.uio_resid;
5257
5258	if (error || dbuflen == 0) {
5259		kmem_free(dbuf, dlen);
5260		return (error);
5261	}
5262
5263	dp = (dirent64_t *)dbuf;
5264
5265	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5266		if (strcmp(dp->d_name, ".") == 0 ||
5267		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5268		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5269		    VIEW_READONLY) == 0) {
5270			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5271			continue;
5272		}
5273
5274		*valp = 1;
5275		break;
5276	}
5277	kmem_free(dbuf, dlen);
5278	return (0);
5279}
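
/*
 * A sketch of the intended use: a caller that needs to answer a
 * "does this file have any real extended attributes?" query hands
 * this routine the file's already-opened extended attribute directory
 * vnode.  The xattr_dvp name below is an assumption for illustration:
 *
 *	ulong_t has_xattrs;
 *
 *	error = do_xattr_exists_check(xattr_dvp, &has_xattrs, cr);
 *	if (error == 0 && has_xattrs != 0)
 *		... report that named attributes exist ...
 */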
5280