xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision 7c478bd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All Rights Reserved
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/proc.h>
39 #include <sys/user.h>
40 #include <sys/time.h>
41 #include <sys/buf.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/socket.h>
45 #include <sys/uio.h>
46 #include <sys/tiuser.h>
47 #include <sys/swap.h>
48 #include <sys/errno.h>
49 #include <sys/debug.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vtrace.h>
54 #include <sys/session.h>
55 #include <sys/dnlc.h>
56 #include <sys/bitmap.h>
57 #include <sys/acl.h>
58 #include <sys/ddi.h>
59 #include <sys/pathname.h>
60 #include <sys/flock.h>
61 #include <sys/dirent.h>
63 #include <sys/callb.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/rpcsec_gss.h>
69 #include <rpc/clnt.h>
70 
71 #include <nfs/nfs.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/nfs_acl.h>
74 
75 #include <nfs/nfs4.h>
76 #include <nfs/rnode4.h>
77 #include <nfs/nfs4_clnt.h>
78 
79 /*
80  * The hash queues for access to active and cached rnodes
81  * are organized as doubly linked lists.  A reader/writer lock
82  * for each hash bucket is used to control access and to synchronize
83  * lookups, additions, and deletions from the hash queue.
84  *
85  * The rnode freelist is organized as a doubly linked list with
86  * a head pointer.  Additions and deletions are synchronized via
87  * a single mutex.
88  *
89  * In order to add an rnode to the free list, it must be hashed into
90  * a hash queue and the exclusive lock to the hash queue must be
91  * held.  If an rnode is not hashed into a hash queue, then it is
92  * destroyed because it holds no information about the file that is
93  * worth reusing.  The exclusive lock to the hash queue must be
94  * held in order to prevent a lookup in the hash queue from finding
95  * the rnode and using it and assuming that the rnode is not on the
96  * freelist.  The lookup in the hash queue will have the hash queue
97  * locked, either exclusive or shared.
98  *
99  * The vnode reference count for each rnode is not allowed to drop
100  * below 1.  This prevents external entities, such as the VM
101  * subsystem, from acquiring references to vnodes already on the
102  * freelist and then trying to place them back on the freelist
103  * when their reference is released.  This means that when an
104  * rnode is looked up in the hash queues, either the rnode is
105  * removed from the freelist and that reference is transferred to
106  * the new reference, or the vnode reference count must be
107  * incremented accordingly.  The mutex for the freelist must be held
108  * in order to accurately test whether the rnode is on the freelist.
109  * The hash queue lock might be held shared and it is possible that
110  * two different threads may race to remove the rnode from the
111  * freelist.  This race can be resolved by holding the mutex for the
112  * freelist.  Please note that the mutex for the freelist does not
113  * need to be held if the rnode is not on the freelist.  It cannot be
114  * placed on the freelist due to the requirement that the thread
115  * putting the rnode on the freelist must hold the exclusive lock
116  * to the hash queue and the thread doing the lookup in the hash
117  * queue is holding either a shared or exclusive lock to the hash
118  * queue.
119  *
120  * The lock ordering is:
121  *
122  *	hash bucket lock -> vnode lock
123  *	hash bucket lock -> freelist lock -> mi_fileid_lock -> r_statelock
124  */
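/*
 * Illustrative sketch (not part of the original file): the order in
 * which a thread would take the locks named above if it needed to
 * hold several of them at once.  The helper below is hypothetical;
 * only the lock names are real.
 */
#ifdef notdef
static void
example_lock_order(rnode4_t *rp)
{
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));

	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);	/* hash bucket lock */
	mutex_enter(&rp4freelist_lock);			/* then freelist lock */
	mutex_enter(&mi->mi_fileid_lock);		/* then mi_fileid_lock */
	mutex_enter(&rp->r_statelock);			/* then r_statelock */

	/* ... manipulate the freelist and fileid map here ... */

	mutex_exit(&rp->r_statelock);
	mutex_exit(&mi->mi_fileid_lock);
	mutex_exit(&rp4freelist_lock);
	rw_exit(&rp->r_hashq->r_lock);
}
#endif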
125 r4hashq_t *rtable4;
126 
127 static kmutex_t rp4freelist_lock;
128 static rnode4_t *rp4freelist = NULL;
129 static long rnode4_new = 0;
130 int rtable4size;
131 static int rtable4mask;
132 static struct kmem_cache *rnode4_cache;
133 static int rnode4_hashlen = 4;
134 
135 static void	r4inactive(rnode4_t *, cred_t *);
136 static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
137 		    struct vnodeops *,
138 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
139 		    cred_t *),
140 		    int *, cred_t *);
141 static vnode_t	*nfs4fidcollide(rnode4_t *, mntinfo4_t *);
142 static void	rp4_rmfree(rnode4_t *);
143 int		nfs4_free_data_reclaim(rnode4_t *);
144 static int	nfs4_active_data_reclaim(rnode4_t *);
145 static int	nfs4_free_reclaim(void);
146 static int	nfs4_active_reclaim(void);
147 static int	nfs4_rnode_reclaim(void);
148 static void	nfs4_reclaim(void *);
149 static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
150 static void	uninit_rnode4(rnode4_t *);
151 static void	destroy_rnode4(rnode4_t *);
152 static int	rp4_fileid_cmp(const void *, const void *);
153 
154 
155 #ifdef DEBUG
156 static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
157 static int nfs4_rnode_debug = 0;
158 /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
159 static int nfs4_rnode_nofreelist = 0;
160 /* give messages on colliding shared filehandles */
161 static int nfs4_fidcollide_debug = 0;
162 static void	r4_dup_check(rnode4_t *, vfs_t *);
163 #endif
164 
165 /*
166  * Free the resources associated with an rnode.
167  */
168 static void
169 r4inactive(rnode4_t *rp, cred_t *cr)
170 {
171 	vnode_t *vp;
172 	char *contents;
173 	int size;
174 	vsecattr_t *vsp;
175 	vnode_t *xattr;
176 	int error;
177 
178 	/*
179 	 * Before freeing anything, wait until all asynchronous
180 	 * activity is done on this rnode.  This will allow all
181 	 * asynchronous read ahead and write behind i/o's to
182 	 * finish.
183 	 */
184 	mutex_enter(&rp->r_statelock);
185 	while (rp->r_count > 0)
186 		cv_wait(&rp->r_cv, &rp->r_statelock);
187 	mutex_exit(&rp->r_statelock);
188 
189 	/*
190 	 * Flush and invalidate all pages associated with the vnode.
191 	 */
192 	vp = RTOV4(rp);
193 	if (nfs4_has_pages(vp)) {
194 		ASSERT(vp->v_type != VCHR);
195 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
196 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
197 			if (error && (error == ENOSPC || error == EDQUOT)) {
198 				mutex_enter(&rp->r_statelock);
199 				if (!rp->r_error)
200 					rp->r_error = error;
201 				mutex_exit(&rp->r_statelock);
202 			}
203 		}
204 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
205 	}
206 
207 	/*
208 	 * Free any held caches which may be
209 	 * associated with this rnode.
210 	 */
211 	mutex_enter(&rp->r_statelock);
212 	contents = rp->r_symlink.contents;
213 	size = rp->r_symlink.size;
214 	rp->r_symlink.contents = NULL;
215 	vsp = rp->r_secattr;
216 	rp->r_secattr = NULL;
217 	xattr = rp->r_xattr_dir;
218 	rp->r_xattr_dir = NULL;
219 	mutex_exit(&rp->r_statelock);
220 
221 	/*
222 	 * Free the access cache entries.
223 	 */
224 	(void) nfs4_access_purge_rp(rp);
225 
226 	/*
227 	 * Free the readdir cache entries.
228 	 */
229 	nfs4_purge_rddir_cache(vp);
230 
231 	/*
232 	 * Free the symbolic link cache.
233 	 */
234 	if (contents != NULL) {
236 		kmem_free((void *)contents, size);
237 	}
238 
239 	/*
240 	 * Free any cached ACL.
241 	 */
242 	if (vsp != NULL)
243 		nfs4_acl_free_cache(vsp);
244 
245 	/*
246 	 * Release the cached xattr_dir
247 	 */
248 	if (xattr != NULL)
249 		VN_RELE(xattr);
250 }
251 
252 /*
253  * We have seen a case where the fh passed in is for "." which
254  * should be a VROOT node; however, the fh is different from the
255  * root fh stored in the mntinfo4_t.  The invalid fh might come
256  * from a misbehaved server and would panic the client system at
257  * a later time.  To avoid the panic, we drop the bad fh, use
258  * the root fh from the mntinfo4_t, and print an error message
259  * to draw attention to the problem.
260  */
261 nfs4_sharedfh_t *
262 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
263     int *wasbad)
264 {
265 	char *s;
266 
267 	*wasbad = 0;
268 	s = fn_name(nm);
269 	ASSERT(strcmp(s, "..") != 0);
270 
271 	if ((s[0] == '.' && s[1] == '\0') && fh &&
272 					!SFH4_SAME(mi->mi_rootfh, fh)) {
273 #ifdef DEBUG
274 		nfs4_fhandle_t fhandle;
275 
276 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
277 		    "Server %s returns a different "
278 		    "root filehandle for the path %s:",
279 		    mi->mi_curr_serv->sv_hostname,
280 		    mi->mi_curr_serv->sv_path);
281 
282 		/* print the bad fh */
283 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
284 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
285 			fhandle.fh_len);
286 		nfs4_printfhandle(&fhandle);
287 
288 		/* print mi_rootfh */
289 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
290 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
291 			fhandle.fh_len);
292 		nfs4_printfhandle(&fhandle);
293 #endif
294 		/* use mi_rootfh instead; fh will be rele'd by the caller */
295 		fh = mi->mi_rootfh;
296 		*wasbad = 1;
297 	}
298 
299 	kmem_free(s, MAXNAMELEN);
300 	return (fh);
301 }
302 
303 /*
304  * If we have volatile filehandles that may be expired while
305  * a file is held open, we need to check to see if this new
306  * rnode has the same fileid as an existing rnode.  If so,
307  * then we drop this rnode and start again with the other
308  * filehandle.
309  */
310 vnode_t *
311 shfh_collide_check(vnode_t *vp, vnode_t **badvp, mntinfo4_t *mi,
312 			nfs4_ga_res_t *garp)
313 {
314 	*badvp = NULL;
315 
316 	if (((mi->mi_fh_expire_type &
317 	    (FH4_VOLATILE_ANY |
318 		FH4_VOL_MIGRATION |
319 		FH4_VOL_RENAME)) != 0) &&
320 	    ((mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) == 0)) {
321 		rnode4_t *rp = VTOR4(vp);
322 		vnode_t *tmpvp;
323 
324 		if (! (rp->r_attr.va_mask & AT_NODEID)) {
325 			/*
326 			 * if the rnode doesn't have its nodeid cached,
327 			 * try to get it from the garp.
328 			 */
329 			if (garp != NULL) {
330 				rp->r_attr.va_nodeid = garp->n4g_va.va_nodeid;
331 				rp->r_attr.va_mask |= AT_NODEID;
332 			}
333 		}
334 		if (rp->r_attr.va_mask & AT_NODEID) {
335 			mutex_enter(&mi->mi_fileid_lock);
336 			tmpvp = nfs4fidcollide(rp, mi);
337 			mutex_exit(&mi->mi_fileid_lock);
338 			if (tmpvp != NULL) {
339 				/*
340 				 * We got a collision.
341 				 * badvp needs to be released, but not until
342 				 * after we drop the hash bucket lock.
343 				 * tmpvp is returned held.
344 				 */
345 				*badvp = vp;
346 				vp = tmpvp;
347 			}
348 		} else if (! (vp->v_flag & VROOT)) {
349 			/*
350 			 * Don't issue a warning for the root, because when
351 			 * we're creating the rootvp at mount time, we never
352 			 * have the fileid.
353 			 */
354 			NFS4_DEBUG(nfs4_fidcollide_debug,
355 			    (CE_NOTE, "rp %p: "
356 				"cannot get fileid for duplicate check",
357 				(void *) rp));
358 		}
359 	}
360 
361 	return (vp);
362 }
363 
364 void
365 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
366     hrtime_t t, cred_t *cr, int index)
367 {
368 	vattr_t *attr;
369 	/*
370 	 * We must not add to the attrcache if the time values overflowed,
371 	 * but there is no need to check for that here: either attr is
372 	 * NULL or its time values were processed by nfs4_time_ntov(),
373 	 * which checks for time overflows.
374 	 */
375 	attr = garp ? &garp->n4g_va : NULL;
376 
377 	if (attr) {
378 		if (!newnode) {
379 			rw_exit(&rtable4[index].r_lock);
380 #ifdef DEBUG
381 			if (vp->v_type != attr->va_type &&
382 			    vp->v_type != VNON && attr->va_type != VNON) {
383 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
384 					"makenfs4node: type (%d) doesn't "
385 					"match type of found node at %p (%d)",
386 					attr->va_type, (void *)vp, vp->v_type);
387 			}
388 #endif
389 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
390 		} else {
391 			rnode4_t *rp = VTOR4(vp);
392 
393 			vp->v_type = attr->va_type;
394 			vp->v_rdev = attr->va_rdev;
395 
396 			/*
397 			 * Turn this object into a "stub" object if we
398 			 * crossed an underlying server fs boundary.  To
399 			 * make this check, during mount we save the
400 			 * fsid of the server object being mounted.
401 			 * Here we compare this object's server fsid
402 			 * with the fsid we saved at mount.  If they
403 			 * are different, we crossed a server fs boundary.
404 			 *
405 			 * The stub flag is set (or not) at rnode
406 			 * creation time and it never changes for life
407 			 * of rnode.
408 			 *
409 			 * We don't bother taking r_statelock
410 			 * to set the R4SRVSTUB flag because this is a new
411 			 * rnode and we're holding rtable lock.  No other
412 			 * thread could have obtained access to this
413 			 * rnode.
414 			 */
415 			if (garp->n4g_fsid_valid) {
416 				rp->r_srv_fsid = garp->n4g_fsid;
417 
418 				if (vp->v_type == VDIR) {
419 					servinfo4_t *svp = rp->r_server;
420 
421 					(void) nfs_rw_enter_sig(&svp->sv_lock,
422 								RW_READER, 0);
423 					if (!FATTR4_FSID_EQ(&garp->n4g_fsid,
424 							    &svp->sv_fsid)) {
425 						rp->r_flags |= R4SRVSTUB;
426 					}
427 					nfs_rw_exit(&svp->sv_lock);
428 				}
429 			}
430 
431 			/* Cannot cache partial attrs */
432 			if (attr->va_mask == AT_ALL)
433 				nfs4_attrcache_noinval(vp, garp, t);
434 			else
435 				PURGE_ATTRCACHE4(vp);
436 
437 			rw_exit(&rtable4[index].r_lock);
438 		}
439 	} else {
440 		if (newnode) {
441 			PURGE_ATTRCACHE4(vp);
442 		}
443 		rw_exit(&rtable4[index].r_lock);
444 	}
445 }
446 
447 /*
448  * Find or create an rnode based primarily on filehandle.  To be
449  * used when dvp (vnode for parent directory) is not available;
450  * otherwise, makenfs4node() should be used.
451  *
452  * The nfs4_fname_t argument *npp is consumed and nulled out.
453  */
454 
455 vnode_t *
456 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
457 	nfs4_fname_t **npp, nfs4_ga_res_t *garp,
458 	mntinfo4_t *mi, cred_t *cr, hrtime_t t)
459 {
460 	vfs_t *vfsp = mi->mi_vfsp;
461 	int newnode = 0;
462 	vnode_t *vp;
463 	rnode4_t *rp;
464 	svnode_t *svp;
465 	nfs4_fname_t *name;
466 	int index;
467 
468 	ASSERT(npp && *npp);
469 	name = *npp;
470 	*npp = NULL;
471 
472 	index = rtable4hash(sfh);
473 	rw_enter(&rtable4[index].r_lock, RW_READER);
474 
475 	rp = r4find(&rtable4[index], sfh, vfsp);
476 	if (rp != NULL) {
477 		rw_exit(&rtable4[index].r_lock);
478 		vp = RTOV4(rp);
479 		fn_rele(&name);
480 		return (vp);
481 	}
482 
483 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
484 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
485 	if (newnode) {
486 		svp = vtosv(vp);
487 		svp->sv_forw = svp->sv_back = svp;
488 		svp->sv_name = name;
489 		if (psfh != NULL)
490 			sfh4_hold(psfh);
491 		svp->sv_dfh = psfh;
492 	} else {
493 		fn_rele(&name);
494 	}
495 
496 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
497 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
498 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
499 
500 	return (vp);
501 }
502 
503 /*
504  * Find or create a vnode for the given filehandle, filesystem, parent, and
505  * name.  The reference to nm is consumed, so the caller must first do an
506  * fn_hold() if it wants to continue using nm after this call.
507  */
508 vnode_t *
509 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
510 	hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
511 {
512 	vnode_t *vp;
513 	vnode_t *badvp = NULL;
514 	int newnode;
515 	int index;
516 	mntinfo4_t *mi = VFTOMI4(vfsp);
517 	int had_badfh = 0;
518 	rnode4_t *rp;
519 
520 	ASSERT(dvp != NULL);
521 
522 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
523 
524 	index = rtable4hash(fh);
525 	rw_enter(&rtable4[index].r_lock, RW_READER);
526 
527 	/*
528 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
529 	 */
530 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
531 	    nfs4_putapage, &newnode, cr);
532 
533 	/*
534 	 * Check for shared filehandle collisions.  This only applies
535 	 * to servers with volatile filehandles.
536 	 */
537 	vp = shfh_collide_check(vp, &badvp, mi, garp);
538 	rp = VTOR4(vp);
539 	/* If we had a shfh collision... */
540 	if (badvp != NULL) {
541 		int newindex;
542 		nfs4_fname_t *tname = nm;
543 		fn_hold(tname);
544 
545 		/*
546 		 * We must activate the shadow vnode, even though the
547 		 * rnode will be short-lived.  This is because other
548 		 * things, especially code in the inactive path,
549 		 * assume that sv_dfh and sv_name are non-NULL.
550 		 */
551 		sv_activate(&badvp, dvp, &tname, newnode);
552 
553 		/*
554 		 * Since the vnode we're replacing badvp with already
555 		 * exists, it's not a newnode.
556 		 */
557 		newnode = 0;
558 
559 		/* check to see if we need a different hashq lock */
560 		newindex = rtable4hash(rp->r_fh);
561 		if (newindex != index) {
562 			rw_exit(&rtable4[index].r_lock);
563 			rw_enter(&rtable4[newindex].r_lock, RW_READER);
564 			index = newindex;
565 		}
566 	}
567 
568 	sv_activate(&vp, dvp, &nm, newnode);
569 	if (dvp->v_flag & V_XATTRDIR) {
570 		mutex_enter(&rp->r_statelock);
571 		rp->r_flags |= R4ISXATTR;
572 		mutex_exit(&rp->r_statelock);
573 	}
574 
575 	/* If we got a bad file handle, do not cache the attributes. */
576 	if (had_badfh) {
577 		rw_exit(&rtable4[index].r_lock);
578 		if (badvp != NULL) {
579 			VN_RELE(badvp);
580 		}
581 		return (vp);
582 	}
583 
584 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
585 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
586 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
587 
588 	/*
589 	 * If a shared filehandle collision occurred, release the newly
590 	 * created rnode (in favor of the extant one).
591 	 */
592 	if (badvp != NULL) {
593 		VN_RELE(badvp);
594 	}
595 
596 	return (vp);
597 }
598 
599 /*
600  * Detect if there are any extant rnodes with the same fileid.  If
601  * not, store this rnode in the table.
602  *
603  * Only call this if r_attr.va_nodeid is set with the correct fileid.
604  *
605  * Returns NULL if no collision; otherwise, returns the extant vnode that
606  * has the same fileid as the one passed in.  The vnode is returned
607  * held.
608  */
609 
610 vnode_t *
611 nfs4fidcollide(rnode4_t *rp, mntinfo4_t *mi)
612 {
613 	avl_index_t where;
614 	rnode4_t *conflict;
615 	vnode_t *rvp;
616 
617 	ASSERT(RW_LOCK_HELD(&rp->r_hashq->r_lock));
618 	ASSERT(MUTEX_HELD(&mi->mi_fileid_lock));
619 	ASSERT(rp->r_attr.va_mask & AT_NODEID);
620 
621 	conflict = avl_find(&mi->mi_fileid_map, rp, &where);
622 
623 	if (conflict == rp)
624 		return (NULL);
625 
626 	if (conflict == NULL) {
627 		avl_insert(&mi->mi_fileid_map,
628 		    rp, where);
629 		mutex_enter(&rp->r_statelock);
630 		rp->r_flags |= R4FILEIDMAP;
631 		mutex_exit(&rp->r_statelock);
632 		return (NULL);
633 	}
634 
635 	NFS4_DEBUG(nfs4_fidcollide_debug, (CE_NOTE,
636 	    "nfs4fidcollide: fileid %lld remapping to rnode %p",
637 	    rp->r_attr.va_nodeid, (void *) conflict));
638 	rvp = RTOV4(conflict);
639 	VN_HOLD(rvp);
640 	return (rvp);
641 }
642 
643 /*
644  * Hash on address of filehandle object.
645  * XXX totally untuned.
646  */
647 
648 int
649 rtable4hash(nfs4_sharedfh_t *fh)
650 {
651 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
652 }
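/*
 * Illustrative sketch (not part of the original file): how callers
 * pair rtable4hash() with the per-bucket lock.  This mirrors
 * r4find_unlocked() below; the helper itself is hypothetical.
 */
#ifdef notdef
static rnode4_t *
example_fh_lookup(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	int index = rtable4hash(fh);
	rnode4_t *rp;

	rw_enter(&rtable4[index].r_lock, RW_READER);
	rp = r4find(&rtable4[index], fh, vfsp);	/* returned held, or NULL */
	rw_exit(&rtable4[index].r_lock);

	return (rp);
}
#endif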
653 
654 /*
655  * Find or create the vnode for the given filehandle and filesystem.
656  * *newnode is set to zero if the vnode already existed; non-zero if it had
657  * to be created.
658  *
659  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
660  */
661 
662 static vnode_t *
663 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
664     struct vnodeops *vops,
665     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
666     int *newnode, cred_t *cr)
667 {
668 	rnode4_t *rp;
669 	rnode4_t *trp;
670 	vnode_t *vp;
671 	mntinfo4_t *mi;
672 
673 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
674 
675 	mi = VFTOMI4(vfsp);
676 
677 start:
678 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
679 		vp = RTOV4(rp);
680 		*newnode = 0;
681 		return (vp);
682 	}
683 	rw_exit(&rhtp->r_lock);
684 
685 	mutex_enter(&rp4freelist_lock);
686 
687 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
688 		rp = rp4freelist;
689 		rp4_rmfree(rp);
690 		mutex_exit(&rp4freelist_lock);
691 
692 		vp = RTOV4(rp);
693 
694 		if (rp->r_flags & R4FILEIDMAP)
695 			rp4_fileid_map_remove(rp);
696 		if (rp->r_flags & R4HASHED) {
697 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
698 			mutex_enter(&vp->v_lock);
699 			if (vp->v_count > 1) {
700 				vp->v_count--;
701 				mutex_exit(&vp->v_lock);
702 				rw_exit(&rp->r_hashq->r_lock);
703 				rw_enter(&rhtp->r_lock, RW_READER);
704 				goto start;
705 			}
706 			mutex_exit(&vp->v_lock);
707 			rp4_rmhash_locked(rp);
708 			rw_exit(&rp->r_hashq->r_lock);
709 		}
710 
711 		r4inactive(rp, cr);
712 
713 		mutex_enter(&vp->v_lock);
714 		if (vp->v_count > 1) {
715 			vp->v_count--;
716 			mutex_exit(&vp->v_lock);
717 			rw_enter(&rhtp->r_lock, RW_READER);
718 			goto start;
719 		}
720 		mutex_exit(&vp->v_lock);
721 		vn_invalid(vp);
722 
723 		/*
724 		 * destroy old locks before bzero'ing and
725 		 * recreating the locks below.
726 		 */
727 		uninit_rnode4(rp);
728 
729 		/*
730 		 * Make sure that if rnode is recycled then
731 		 * VFS count is decremented properly before
732 		 * reuse.
733 		 */
734 		VFS_RELE(vp->v_vfsp);
735 		vn_reinit(vp);
736 	} else {
737 		vnode_t *new_vp;
738 
739 		mutex_exit(&rp4freelist_lock);
740 
741 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
742 		new_vp = vn_alloc(KM_SLEEP);
743 
744 		atomic_add_long((ulong_t *)&rnode4_new, 1);
745 #ifdef DEBUG
746 		clstat4_debug.nrnode.value.ui64++;
747 #endif
748 		vp = new_vp;
749 	}
750 
751 	bzero(rp, sizeof (*rp));
752 	rp->r_vnode = vp;
753 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
754 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
755 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
756 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
757 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
758 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
759 	rp->created_v4 = 0;
760 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
761 	    offsetof(nfs4_open_stream_t, os_node));
762 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
763 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
764 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
765 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
766 	rp->r_flags = R4READDIRWATTR;
767 	rp->r_fh = fh;
768 	rp->r_hashq = rhtp;
769 	sfh4_hold(rp->r_fh);
770 	rp->r_server = mi->mi_curr_serv;
771 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
772 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
773 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
774 
775 	rddir4_cache_create(rp);
776 	rp->r_putapage = putapage;
777 	vn_setops(vp, vops);
778 	vp->v_data = (caddr_t)rp;
779 	vp->v_vfsp = vfsp;
780 	VFS_HOLD(vfsp);
781 	vp->v_type = VNON;
782 	if (isrootfh(fh, rp))
783 		vp->v_flag = VROOT;
784 	vn_exists(vp);
785 
786 	/*
787 	 * There is a race condition if someone else
788 	 * allocates the rnode while no locks are held, so we
789 	 * check again and recover if one is found.
790 	 */
791 	rw_enter(&rhtp->r_lock, RW_WRITER);
792 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
793 		vp = RTOV4(trp);
794 		*newnode = 0;
795 		rw_exit(&rhtp->r_lock);
796 		rp4_addfree(rp, cr);
797 		rw_enter(&rhtp->r_lock, RW_READER);
798 		return (vp);
799 	}
800 	rp4_addhash(rp);
801 	*newnode = 1;
802 	return (vp);
803 }
804 
805 static void
806 uninit_rnode4(rnode4_t *rp)
807 {
808 	vnode_t *vp = RTOV4(rp);
809 
810 	ASSERT(rp != NULL);
811 	ASSERT(vp != NULL);
812 	ASSERT(vp->v_count == 1);
813 	ASSERT(rp->r_count == 0);
814 	ASSERT(rp->r_mapcnt == 0);
815 	if (rp->r_flags & R4LODANGLERS) {
816 		nfs4_flush_lock_owners(rp);
817 	}
818 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
819 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
820 	ASSERT(!(rp->r_flags & R4HASHED));
821 	ASSERT(!(rp->r_flags & R4FILEIDMAP));
822 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
823 	nfs4_clear_open_streams(rp);
824 	list_destroy(&rp->r_open_streams);
825 
826 	/*
827 	 * Destroy the rddir cache first since we need to grab the r_statelock.
828 	 */
829 	mutex_enter(&rp->r_statelock);
830 	rddir4_cache_destroy(rp);
831 	mutex_exit(&rp->r_statelock);
832 	sv_uninit(&rp->r_svnode);
833 	sfh4_rele(&rp->r_fh);
834 	nfs_rw_destroy(&rp->r_rwlock);
835 	nfs_rw_destroy(&rp->r_lkserlock);
836 	mutex_destroy(&rp->r_statelock);
837 	mutex_destroy(&rp->r_statev4_lock);
838 	mutex_destroy(&rp->r_os_lock);
839 	cv_destroy(&rp->r_cv);
840 	cv_destroy(&rp->r_commit.c_cv);
841 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
842 	if (rp->r_flags & R4DELMAPLIST)
843 		list_destroy(&rp->r_indelmap);
844 }
845 
846 /*
847  * Put an rnode on the free list.
848  *
849  * Rnodes which were allocated above and beyond the normal limit
850  * are immediately freed.
851  */
852 void
853 rp4_addfree(rnode4_t *rp, cred_t *cr)
854 {
855 	vnode_t *vp;
856 	vnode_t *xattr;
857 	struct vfs *vfsp;
858 
859 	vp = RTOV4(rp);
860 	ASSERT(vp->v_count >= 1);
861 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
862 
863 	/*
864 	 * If we have too many rnodes allocated and there are no
865 	 * references to this rnode, or if the rnode is no longer
866 	 * accessible by it does not reside in the hash queues,
867 	 * or if an i/o error occurred while writing to the file,
868 	 * then just free it instead of putting it on the rnode
869 	 * freelist.
870 	 */
871 	vfsp = vp->v_vfsp;
872 	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
873 #ifdef DEBUG
874 	    (nfs4_rnode_nofreelist != 0) ||
875 #endif
876 	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
877 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
878 		if (rp->r_flags & R4FILEIDMAP)
879 			rp4_fileid_map_remove(rp);
880 		if (rp->r_flags & R4HASHED) {
881 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
882 			mutex_enter(&vp->v_lock);
883 			if (vp->v_count > 1) {
884 				vp->v_count--;
885 				mutex_exit(&vp->v_lock);
886 				rw_exit(&rp->r_hashq->r_lock);
887 				return;
888 			}
889 			mutex_exit(&vp->v_lock);
890 			rp4_rmhash_locked(rp);
891 			rw_exit(&rp->r_hashq->r_lock);
892 		}
893 
894 		/*
895 		 * Make sure we don't have a delegation on this rnode
896 		 * before destroying it.
897 		 */
898 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
899 			(void) nfs4delegreturn(rp,
900 				NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
901 		}
902 
903 		r4inactive(rp, cr);
904 
905 		/*
906 		 * Recheck the vnode reference count.  We need to
907 		 * make sure that another reference has not been
908 		 * acquired while we were not holding v_lock.  The
909 		 * rnode is not in the rnode hash queues; one
910 		 * way for a reference to have been acquired
911 		 * is for a VOP_PUTPAGE because the rnode was marked
912 		 * with R4DIRTY or for a modified page.  This
913 		 * reference may have been acquired before our call
914 		 * to r4inactive.  The i/o may have been completed,
915 		 * thus allowing r4inactive to complete, but the
916 		 * reference to the vnode may not have been released
917 		 * yet.  In any case, the rnode cannot be destroyed
918 		 * until the other references to this vnode have been
919 		 * released.  The other references will take care of
920 		 * either destroying the rnode or placing it on the
921 		 * rnode freelist.  If there are no other references,
922 		 * then the rnode may be safely destroyed.
923 		 *
924 		 * Another way for a reference to be acquired
925 		 * is through the mi_fileid_map, which is used for
926 		 * detecting and correcting shared fh collisions.
927 		 * A race between this thread and the one using
928 		 * mi_fileid_map would have blocked us, above, when
929 		 * we called rp4_fileid_map_remove() and needed the
930 		 * mi_fileid_lock mutex.  By the time the other thread
931 		 * released that mutex, it would have done a VN_HOLD(),
932 		 * which we check for here.
933 		 */
934 		mutex_enter(&vp->v_lock);
935 		if (vp->v_count > 1) {
936 			vp->v_count--;
937 			mutex_exit(&vp->v_lock);
938 			return;
939 		}
940 		mutex_exit(&vp->v_lock);
941 
942 		destroy_rnode4(rp);
943 		return;
944 	}
945 
946 	/*
947 	 * Lock the hash queue and then recheck the reference count
948 	 * to ensure that no other threads have acquired a reference
949 	 * to indicate that the rnode should not be placed on the
950 	 * freelist.  If another reference has been acquired, then
951 	 * just release this one and let the other thread complete
952 	 * the processing of adding this rnode to the freelist.
953 	 */
954 again:
955 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
956 
957 	mutex_enter(&vp->v_lock);
958 	if (vp->v_count > 1) {
959 		vp->v_count--;
960 		mutex_exit(&vp->v_lock);
961 		rw_exit(&rp->r_hashq->r_lock);
962 		return;
963 	}
964 	mutex_exit(&vp->v_lock);
965 
966 	/*
967 	 * Make sure we don't put an rnode with a delegation
968 	 * on the free list.
969 	 */
970 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
971 		rw_exit(&rp->r_hashq->r_lock);
972 		(void) nfs4delegreturn(rp,
973 			NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
974 		goto again;
975 	}
976 
977 	/*
978 	 * Now that we have the hash queue lock and we know there
979 	 * are no more references on the vnode, check to make
980 	 * sure there aren't any open streams still on the rnode.
981 	 * If so, drop the hash queue lock, remove the open streams,
982 	 * and recheck the v_count.
983 	 */
984 	mutex_enter(&rp->r_os_lock);
985 	if (list_head(&rp->r_open_streams) != NULL) {
986 		mutex_exit(&rp->r_os_lock);
987 		rw_exit(&rp->r_hashq->r_lock);
988 		if (curproc->p_zone != VTOMI4(vp)->mi_zone)
989 			nfs4_clear_open_streams(rp);
990 		else
991 			(void) nfs4close_all(vp, cr);
992 		goto again;
993 	}
994 	mutex_exit(&rp->r_os_lock);
995 
996 	/*
997 	 * Before we put it on the freelist, make sure there is no
998 	 * active xattr directory cached.  Entries on the freelist
999 	 * are not r4inactive'd while something still holds them
1000 	 * active, so nothing on the freelist may hold another
1001 	 * rnode active.
1002 	 */
1003 	xattr = rp->r_xattr_dir;
1004 	rp->r_xattr_dir = NULL;
1005 
1006 	/*
1007 	 * If there is no cached data or metadata for this file, then
1008 	 * put the rnode on the front of the freelist so that it will
1009 	 * be reused before other rnodes which may have cached data or
1010 	 * metadata associated with them.
1011 	 */
1012 	mutex_enter(&rp4freelist_lock);
1013 	if (rp4freelist == NULL) {
1014 		rp->r_freef = rp;
1015 		rp->r_freeb = rp;
1016 		rp4freelist = rp;
1017 	} else {
1018 		rp->r_freef = rp4freelist;
1019 		rp->r_freeb = rp4freelist->r_freeb;
1020 		rp4freelist->r_freeb->r_freef = rp;
1021 		rp4freelist->r_freeb = rp;
1022 		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
1023 				rp->r_symlink.contents == NULL &&
1024 				rp->r_secattr == NULL)
1025 			rp4freelist = rp;
1026 	}
1027 	mutex_exit(&rp4freelist_lock);
1028 
1029 	rw_exit(&rp->r_hashq->r_lock);
1030 
1031 	if (xattr)
1032 		VN_RELE(xattr);
1033 }
1034 
1035 /*
1036  * Remove an rnode from the free list.
1037  *
1038  * The caller must be holding rp4freelist_lock and the rnode
1039  * must be on the freelist.
1040  */
1041 static void
1042 rp4_rmfree(rnode4_t *rp)
1043 {
1044 
1045 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
1046 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
1047 
1048 	if (rp == rp4freelist) {
1049 		rp4freelist = rp->r_freef;
1050 		if (rp == rp4freelist)
1051 			rp4freelist = NULL;
1052 	}
1053 	rp->r_freeb->r_freef = rp->r_freef;
1054 	rp->r_freef->r_freeb = rp->r_freeb;
1055 
1056 	rp->r_freef = rp->r_freeb = NULL;
1057 }
1058 
1059 /*
1060  * Put an rnode into the hash table.
1061  *
1062  * The caller must be holding the exclusive hash queue lock.
1063  */
1064 void
1065 rp4_addhash(rnode4_t *rp)
1066 {
1067 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1068 	ASSERT(!(rp->r_flags & R4HASHED));
1069 
1070 #ifdef DEBUG
1071 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
1072 #endif
1073 
1074 	rp->r_hashf = rp->r_hashq->r_hashf;
1075 	rp->r_hashq->r_hashf = rp;
1076 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
1077 	rp->r_hashf->r_hashb = rp;
1078 
1079 	mutex_enter(&rp->r_statelock);
1080 	rp->r_flags |= R4HASHED;
1081 	mutex_exit(&rp->r_statelock);
1082 }
1083 
1084 /*
1085  * Remove an rnode from the hash table.
1086  *
1087  * The caller must be holding the hash queue lock.
1088  */
1089 void
1090 rp4_rmhash_locked(rnode4_t *rp)
1091 {
1092 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1093 	ASSERT(rp->r_flags & R4HASHED);
1094 
1095 	rp->r_hashb->r_hashf = rp->r_hashf;
1096 	rp->r_hashf->r_hashb = rp->r_hashb;
1097 
1098 	mutex_enter(&rp->r_statelock);
1099 	rp->r_flags &= ~R4HASHED;
1100 	mutex_exit(&rp->r_statelock);
1101 }
1102 
1103 /*
1104  * Remove an rnode from the hash table.
1105  *
1106  * The caller must not be holding the hash queue lock.
1107  */
1108 void
1109 rp4_rmhash(rnode4_t *rp)
1110 {
1111 
1112 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1113 	rp4_rmhash_locked(rp);
1114 	rw_exit(&rp->r_hashq->r_lock);
1115 }
1116 
1117 /*
1118  * fileid map routines
1119  */
1120 
1121 void
1122 rp4_fileid_map_init(avl_tree_t *map)
1123 {
1124 	avl_create(map, rp4_fileid_cmp, sizeof (rnode4_t),
1125 	    offsetof(rnode4_t, r_fileid_map));
1126 }
1127 
1128 int
1129 rp4_fileid_cmp(const void *p1, const void *p2)
1130 {
1131 	const rnode4_t *rp1 = (const rnode4_t *) p1;
1132 	const rnode4_t *rp2 = (const rnode4_t *) p2;
1133 
1134 	if (rp1->r_attr.va_nodeid < rp2->r_attr.va_nodeid)
1135 		return (-1);
1136 	if (rp1->r_attr.va_nodeid > rp2->r_attr.va_nodeid)
1137 		return (1);
1138 	return (0);
1139 }
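/*
 * Illustrative sketch (not part of the original file): because
 * rp4_fileid_cmp() consults only r_attr.va_nodeid, the fileid map
 * could in principle be probed with a template rnode carrying just
 * the fileid of interest.  The helper is hypothetical; the code in
 * this file always passes a fully constructed rnode (see
 * nfs4fidcollide()).
 */
#ifdef notdef
static rnode4_t *
example_fileid_lookup(mntinfo4_t *mi, u_longlong_t fileid)
{
	rnode4_t key;
	avl_index_t where;

	ASSERT(MUTEX_HELD(&mi->mi_fileid_lock));
	key.r_attr.va_nodeid = fileid;

	return (avl_find(&mi->mi_fileid_map, &key, &where));
}
#endif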
1140 
1141 void
1142 rp4_fileid_map_remove(rnode4_t *rp)
1143 {
1144 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1145 	ASSERT(rp->r_flags & R4FILEIDMAP);
1146 
1147 	mutex_enter(&mi->mi_fileid_lock);
1148 	mutex_enter(&rp->r_statelock);
1149 	avl_remove(&mi->mi_fileid_map, rp);
1150 	rp->r_flags &= ~R4FILEIDMAP;
1151 	mutex_exit(&rp->r_statelock);
1152 	mutex_exit(&mi->mi_fileid_lock);
1153 }
1154 
1155 void
1156 destroy_fileid_map(struct vfs *vfsp)
1157 {
1158 	mntinfo4_t *mi;
1159 	rnode4_t *rp;
1160 	void *cookie = NULL;
1161 
1162 	if (vfsp == NULL)
1163 		return;
1164 
1165 	mi = VFTOMI4(vfsp);
1166 
1167 	/*
1168 	 * We cannot assert that any locks (e.g. hash bucket, free list) are
1169 	 * held.
1170 	 */
1171 
1172 	mutex_enter(&mi->mi_fileid_lock);
1173 	while ((rp = avl_destroy_nodes(&mi->mi_fileid_map, &cookie)) != NULL) {
1174 		mutex_enter(&rp->r_statelock);
1175 		rp->r_flags &= ~R4FILEIDMAP;
1176 		mutex_exit(&rp->r_statelock);
1177 	}
1178 	mutex_exit(&mi->mi_fileid_lock);
1179 }
1180 
1181 /*
1182  * Look up an rnode by fhandle.  Ignores rnodes that have failed recovery.
1183  * Returns NULL if no match.  If an rnode is returned, the reference count
1184  * on the master vnode is incremented.
1185  *
1186  * The caller must be holding the hash queue lock, either shared or exclusive.
1187  */
1188 rnode4_t *
1189 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1190 {
1191 	rnode4_t *rp;
1192 	vnode_t *vp;
1193 
1194 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1195 
1196 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1197 		vp = RTOV4(rp);
1198 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1199 
1200 			mutex_enter(&rp->r_statelock);
1201 			if (rp->r_flags & R4RECOVERR) {
1202 				mutex_exit(&rp->r_statelock);
1203 				continue;
1204 			}
1205 			mutex_exit(&rp->r_statelock);
1206 #ifdef DEBUG
1207 			r4_dup_check(rp, vfsp);
1208 #endif
1209 			if (rp->r_freef != NULL) {
1210 				mutex_enter(&rp4freelist_lock);
1211 				/*
1212 				 * If the rnode is on the freelist,
1213 				 * then remove it and use that reference
1214 				 * as the new reference.  Otherwise,
1215 				 * need to increment the reference count.
1216 				 */
1217 				if (rp->r_freef != NULL) {
1218 					rp4_rmfree(rp);
1219 					mutex_exit(&rp4freelist_lock);
1220 				} else {
1221 					mutex_exit(&rp4freelist_lock);
1222 					VN_HOLD(vp);
1223 				}
1224 			} else
1225 				VN_HOLD(vp);
1226 
1227 			/*
1228 			 * If this is the root vnode, set v_flag to say so.
1229 			 */
1230 			if (isrootfh(fh, rp)) {
1231 				if (!(vp->v_flag & VROOT)) {
1232 					mutex_enter(&vp->v_lock);
1233 					vp->v_flag |= VROOT;
1234 					mutex_exit(&vp->v_lock);
1235 				}
1236 			}
1237 			return (rp);
1238 		}
1239 	}
1240 	return (NULL);
1241 }
1242 
1243 /*
1244  * Look up an rnode by fhandle.  Just a wrapper for r4find()
1245  * for callers that do not already hold the lock
1246  * on the hash bucket.
1247  */
1248 rnode4_t *
1249 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1250 {
1251 	rnode4_t *rp;
1252 	int index;
1253 
1254 	index = rtable4hash(fh);
1255 	rw_enter(&rtable4[index].r_lock, RW_READER);
1256 	rp = r4find(&rtable4[index], fh, vfsp);
1257 	rw_exit(&rtable4[index].r_lock);
1258 
1259 	return (rp);
1260 }
1261 
1262 /*
1263  * Return 1 if there is an active vnode belonging to this vfs in the
1264  * rtable4 cache.
1265  *
1266  * Several of these checks are done without holding the usual
1267  * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
1268  * etc. will redo the necessary checks before actually destroying
1269  * any rnodes.
1270  */
1271 int
1272 check_rtable4(struct vfs *vfsp)
1273 {
1274 	rnode4_t *rp;
1275 	vnode_t *vp;
1276 	char *busy = NULL;
1277 	int index;
1278 
1279 	for (index = 0; index < rtable4size; index++) {
1280 		rw_enter(&rtable4[index].r_lock, RW_READER);
1281 
1282 		for (rp = rtable4[index].r_hashf;
1283 		    rp != (rnode4_t *)(&rtable4[index]);
1284 		    rp = rp->r_hashf) {
1285 
1286 			vp = RTOV4(rp);
1287 			if (vp->v_vfsp == vfsp) {
1288 				if (rp->r_freef == NULL) {
1289 					busy = "not on free list";
1290 				} else if (nfs4_has_pages(vp) &&
1291 					    (rp->r_flags & R4DIRTY)) {
1292 					busy = "dirty pages";
1293 				} else if (rp->r_count > 0) {
1294 					busy = "r_count > 0";
1295 				}
1296 
1297 				if (busy != NULL) {
1298 #ifdef DEBUG
1299 					char *path;
1300 
1301 					path = fn_path(rp->r_svnode.sv_name);
1302 					NFS4_DEBUG(nfs4_rnode_debug,
1303 					    (CE_NOTE, "check_rtable4: %s %s",
1304 					    path, busy));
1305 					kmem_free(path, strlen(path)+1);
1306 #endif
1307 					rw_exit(&rtable4[index].r_lock);
1308 					return (1);
1309 				}
1310 			}
1311 		}
1312 		rw_exit(&rtable4[index].r_lock);
1313 	}
1314 	return (0);
1315 }
1316 
1317 /*
1318  * Destroy inactive vnodes from the hash queues which
1319  * belong to this vfs.  All of the vnodes should be inactive.
1320  * It is essential that we destroy all rnodes in the forced
1321  * unmount case as well as in the normal unmount case.
1322  */
1323 
1324 void
1325 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1326 {
1327 	int index;
1328 	vnode_t *vp;
1329 	rnode4_t *rp, *r_hashf, *rlist;
1330 
1331 	rlist = NULL;
1332 
1333 	for (index = 0; index < rtable4size; index++) {
1334 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1335 		for (rp = rtable4[index].r_hashf;
1336 		    rp != (rnode4_t *)(&rtable4[index]);
1337 		    rp = r_hashf) {
1338 			/* save the hash pointer before destroying */
1339 			r_hashf = rp->r_hashf;
1340 
1341 			vp = RTOV4(rp);
1342 			if (vp->v_vfsp == vfsp) {
1343 				mutex_enter(&rp4freelist_lock);
1344 				if (rp->r_freef != NULL) {
1345 					rp4_rmfree(rp);
1346 					mutex_exit(&rp4freelist_lock);
1347 					rp4_rmhash_locked(rp);
1348 					rp->r_hashf = rlist;
1349 					rlist = rp;
1350 				} else
1351 					mutex_exit(&rp4freelist_lock);
1352 			}
1353 		}
1354 		rw_exit(&rtable4[index].r_lock);
1355 	}
1356 
1357 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1358 		r_hashf = rp->r_hashf;
1359 		/*
1360 		 * This call to rp4_addfree will end up destroying the
1361 		 * rnode, but in a safe way with the appropriate set
1362 		 * of checks done.
1363 		 */
1364 		rp4_addfree(rp, cr);
1365 	}
1366 }
1367 
1368 /*
1369  * This routine destroys all the resources of an rnode
1370  * and finally the rnode itself.
1371  */
1372 static void
1373 destroy_rnode4(rnode4_t *rp)
1374 {
1375 	vnode_t *vp;
1376 	vfs_t *vfsp;
1377 
1378 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1379 
1380 	vp = RTOV4(rp);
1381 	vfsp = vp->v_vfsp;
1382 
1383 	uninit_rnode4(rp);
1384 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1385 #ifdef DEBUG
1386 	clstat4_debug.nrnode.value.ui64--;
1387 #endif
1388 	kmem_cache_free(rnode4_cache, rp);
1389 	vn_invalid(vp);
1390 	vn_free(vp);
1391 	VFS_RELE(vfsp);
1392 }
1393 
1394 /*
1395  * Invalidate the attributes on all rnodes forcing the next getattr
1396  * to go over the wire.  Used to flush stale uid and gid mappings.
1397  * May be done on a per-vfsp basis, or on all rnodes (vfsp == NULL).
1398  */
1399 void
1400 nfs4_rnode_invalidate(struct vfs *vfsp)
1401 {
1402 	int index;
1403 	rnode4_t *rp;
1404 	vnode_t *vp;
1405 
1406 	/*
1407 	 * Walk the hash queues looking for rnodes.
1408 	 */
1409 	for (index = 0; index < rtable4size; index++) {
1410 		rw_enter(&rtable4[index].r_lock, RW_READER);
1411 		for (rp = rtable4[index].r_hashf;
1412 		    rp != (rnode4_t *)(&rtable4[index]);
1413 		    rp = rp->r_hashf) {
1414 			vp = RTOV4(rp);
1415 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1416 				continue;
1417 
1418 			if (!mutex_tryenter(&rp->r_statelock))
1419 				continue;
1420 
1421 			/*
1422 			 * Expire the attributes by resetting the change
1423 			 * and attr timeout.
1424 			 */
1425 			rp->r_change = 0;
1426 			PURGE_ATTRCACHE4_LOCKED(rp);
1427 			mutex_exit(&rp->r_statelock);
1428 		}
1429 		rw_exit(&rtable4[index].r_lock);
1430 	}
1431 }
1432 
1433 /*
1434  * Flush all vnodes in this (or every) vfs.
1435  * Used by nfs_sync and by nfs_unmount.
1436  */
1437 void
1438 r4flush(struct vfs *vfsp, cred_t *cr)
1439 {
1440 	int index;
1441 	rnode4_t *rp;
1442 	vnode_t *vp, **vplist;
1443 	long num, cnt;
1444 
1445 	/*
1446 	 * Check to see whether there is anything to do.
1447 	 */
1448 	num = rnode4_new;
1449 	if (num == 0)
1450 		return;
1451 
1452 	/*
1453 	 * Allocate a slot for all currently active rnodes on the
1454 	 * supposition that they all may need flushing.
1455 	 */
1456 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1457 	cnt = 0;
1458 
1459 	/*
1460 	 * Walk the hash queues looking for rnodes with page
1461 	 * lists associated with them.  Make a list of these
1462 	 * files.
1463 	 */
1464 	for (index = 0; index < rtable4size; index++) {
1465 		rw_enter(&rtable4[index].r_lock, RW_READER);
1466 		for (rp = rtable4[index].r_hashf;
1467 		    rp != (rnode4_t *)(&rtable4[index]);
1468 		    rp = rp->r_hashf) {
1469 			vp = RTOV4(rp);
1470 			/*
1471 			 * Don't bother sync'ing a vp if it
1472 			 * is part of a virtual swap device or
1473 			 * if the VFS is read-only.
1474 			 */
1475 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1476 				continue;
1477 			/*
1478 			 * If flushing all mounted file systems or
1479 			 * the vnode belongs to this vfs, has pages
1480 			 * and is marked as either dirty or mmap'd,
1481 			 * hold and add this vnode to the list of
1482 			 * vnodes to flush.
1483 			 */
1484 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1485 			    nfs4_has_pages(vp) &&
1486 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1487 				VN_HOLD(vp);
1488 				vplist[cnt++] = vp;
1489 				if (cnt == num) {
1490 					rw_exit(&rtable4[index].r_lock);
1491 					goto toomany;
1492 				}
1493 			}
1494 		}
1495 		rw_exit(&rtable4[index].r_lock);
1496 	}
1497 toomany:
1498 
1499 	/*
1500 	 * Flush and release all of the files on the list.
1501 	 */
1502 	while (cnt-- > 0) {
1503 		vp = vplist[cnt];
1504 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
1505 		VN_RELE(vp);
1506 	}
1507 
1508 	/*
1509 	 * Free the space allocated to hold the list.
1510 	 */
1511 	kmem_free(vplist, num * sizeof (*vplist));
1512 }
1513 
1514 int
1515 nfs4_free_data_reclaim(rnode4_t *rp)
1516 {
1517 	char *contents;
1518 	vnode_t *xattr;
1519 	int size;
1520 	vsecattr_t *vsp;
1521 	int freed;
1522 	bool_t rdc = FALSE;
1523 
1524 	/*
1525 	 * Free any held caches which may
1526 	 * be associated with this rnode.
1527 	 */
1528 	mutex_enter(&rp->r_statelock);
1529 	if (rp->r_dir != NULL)
1530 		rdc = TRUE;
1531 	contents = rp->r_symlink.contents;
1532 	size = rp->r_symlink.size;
1533 	rp->r_symlink.contents = NULL;
1534 	vsp = rp->r_secattr;
1535 	rp->r_secattr = NULL;
1536 	xattr = rp->r_xattr_dir;
1537 	rp->r_xattr_dir = NULL;
1538 	mutex_exit(&rp->r_statelock);
1539 
1540 	/*
1541 	 * Free the access cache entries.
1542 	 */
1543 	freed = nfs4_access_purge_rp(rp);
1544 
1545 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1546 		return (freed);
1547 
1548 	/*
1549 	 * Free the readdir cache entries, incompletely if we can't block.
1550 	 */
1551 	nfs4_purge_rddir_cache(RTOV4(rp));
1552 
1553 	/*
1554 	 * Free the symbolic link cache.
1555 	 */
1556 	if (contents != NULL) {
1558 		kmem_free((void *)contents, size);
1559 	}
1560 
1561 	/*
1562 	 * Free any cached ACL.
1563 	 */
1564 	if (vsp != NULL)
1565 		nfs4_acl_free_cache(vsp);
1566 
1567 	/*
1568 	 * Release the xattr directory vnode
1569 	 */
1570 	if (xattr != NULL)
1571 		VN_RELE(xattr);
1572 
1573 	return (1);
1574 }
1575 
1576 static int
1577 nfs4_active_data_reclaim(rnode4_t *rp)
1578 {
1579 	char *contents;
1580 	vnode_t *xattr;
1581 	int size;
1582 	vsecattr_t *vsp;
1583 	int freed;
1584 	bool_t rdc = FALSE;
1585 
1586 	/*
1587 	 * Free any held credentials and caches which
1588 	 * may be associated with this rnode.
1589 	 */
1590 	if (!mutex_tryenter(&rp->r_statelock))
1591 		return (0);
1592 	contents = rp->r_symlink.contents;
1593 	size = rp->r_symlink.size;
1594 	rp->r_symlink.contents = NULL;
1595 	vsp = rp->r_secattr;
1596 	rp->r_secattr = NULL;
1597 	if (rp->r_dir != NULL)
1598 		rdc = TRUE;
1599 	xattr = rp->r_xattr_dir;
1600 	rp->r_xattr_dir = NULL;
1601 	mutex_exit(&rp->r_statelock);
1602 
1603 	/*
1604 	 * Free the access cache entries.
1605 	 */
1606 	freed = nfs4_access_purge_rp(rp);
1607 
1608 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1609 		return (freed);
1610 
1611 	/*
1612 	 * Free the symbolic link cache.
1613 	 */
1614 	if (contents != NULL) {
1616 		kmem_free((void *)contents, size);
1617 	}
1618 
1619 	/*
1620 	 * Free any cached ACL.
1621 	 */
1622 	if (vsp != NULL)
1623 		nfs4_acl_free_cache(vsp);
1624 
1625 	nfs4_purge_rddir_cache(RTOV4(rp));
1626 
1627 	/*
1628 	 * Release the xattr directory vnode
1629 	 */
1630 	if (xattr != NULL)
1631 		VN_RELE(xattr);
1632 
1633 	return (1);
1634 }
1635 
1636 static int
1637 nfs4_free_reclaim(void)
1638 {
1639 	int freed;
1640 	rnode4_t *rp;
1641 
1642 #ifdef DEBUG
1643 	clstat4_debug.f_reclaim.value.ui64++;
1644 #endif
1645 	freed = 0;
1646 	mutex_enter(&rp4freelist_lock);
1647 	rp = rp4freelist;
1648 	if (rp != NULL) {
1649 		do {
1650 			if (nfs4_free_data_reclaim(rp))
1651 				freed = 1;
1652 		} while ((rp = rp->r_freef) != rp4freelist);
1653 	}
1654 	mutex_exit(&rp4freelist_lock);
1655 	return (freed);
1656 }
1657 
1658 static int
1659 nfs4_active_reclaim(void)
1660 {
1661 	int freed;
1662 	int index;
1663 	rnode4_t *rp;
1664 
1665 #ifdef DEBUG
1666 	clstat4_debug.a_reclaim.value.ui64++;
1667 #endif
1668 	freed = 0;
1669 	for (index = 0; index < rtable4size; index++) {
1670 		rw_enter(&rtable4[index].r_lock, RW_READER);
1671 		for (rp = rtable4[index].r_hashf;
1672 		    rp != (rnode4_t *)(&rtable4[index]);
1673 		    rp = rp->r_hashf) {
1674 			if (nfs4_active_data_reclaim(rp))
1675 				freed = 1;
1676 		}
1677 		rw_exit(&rtable4[index].r_lock);
1678 	}
1679 	return (freed);
1680 }
1681 
1682 static int
1683 nfs4_rnode_reclaim(void)
1684 {
1685 	int freed;
1686 	rnode4_t *rp;
1687 	vnode_t *vp;
1688 
1689 #ifdef DEBUG
1690 	clstat4_debug.r_reclaim.value.ui64++;
1691 #endif
1692 	freed = 0;
1693 	mutex_enter(&rp4freelist_lock);
1694 	while ((rp = rp4freelist) != NULL) {
1695 		rp4_rmfree(rp);
1696 		mutex_exit(&rp4freelist_lock);
1697 		if (rp->r_flags & R4FILEIDMAP)
1698 			rp4_fileid_map_remove(rp);
1699 		if (rp->r_flags & R4HASHED) {
1700 			vp = RTOV4(rp);
1701 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1702 			mutex_enter(&vp->v_lock);
1703 			if (vp->v_count > 1) {
1704 				vp->v_count--;
1705 				mutex_exit(&vp->v_lock);
1706 				rw_exit(&rp->r_hashq->r_lock);
1707 				mutex_enter(&rp4freelist_lock);
1708 				continue;
1709 			}
1710 			mutex_exit(&vp->v_lock);
1711 			rp4_rmhash_locked(rp);
1712 			rw_exit(&rp->r_hashq->r_lock);
1713 		}
1714 		/*
1715 		 * This call to rp_addfree will end up destroying the
1716 		 * rnode, but in a safe way with the appropriate set
1717 		 * of checks done.
1718 		 */
1719 		rp4_addfree(rp, CRED());
1720 		mutex_enter(&rp4freelist_lock);
1721 	}
1722 	mutex_exit(&rp4freelist_lock);
1723 	return (freed);
1724 }
1725 
1726 /*ARGSUSED*/
1727 static void
1728 nfs4_reclaim(void *cdrarg)
1729 {
1730 
1731 #ifdef DEBUG
1732 	clstat4_debug.reclaim.value.ui64++;
1733 #endif
1734 	if (nfs4_free_reclaim())
1735 		return;
1736 
1737 	if (nfs4_active_reclaim())
1738 		return;
1739 
1740 	(void) nfs4_rnode_reclaim();
1741 }
1742 
1743 /*
1744  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1745  * clientid can change if the caller drops mi_recovlock.
1746  */
1747 
1748 clientid4
1749 mi2clientid(mntinfo4_t *mi)
1750 {
1751 	nfs4_server_t	*sp;
1752 	clientid4	clientid = 0;
1753 
1754 	/* this locks down sp if it is found */
1755 	sp = find_nfs4_server(mi);
1756 	if (sp != NULL) {
1757 		clientid = sp->clientid;
1758 		mutex_exit(&sp->s_lock);
1759 		nfs4_server_rele(sp);
1760 	}
1761 	return (clientid);
1762 }
1763 
1764 /*
1765  * Return the current lease time for the server associated with the given
1766  * file.  Note that the lease time could change immediately after this
1767  * call.
1768  */
1769 
1770 time_t
1771 r2lease_time(rnode4_t *rp)
1772 {
1773 	nfs4_server_t	*sp;
1774 	time_t		lease_time;
1775 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1776 
1777 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1778 
1779 	/* this locks down sp if it is found */
1780 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1781 
1782 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1783 		if (sp != NULL) {
1784 			mutex_exit(&sp->s_lock);
1785 			nfs4_server_rele(sp);
1786 		}
1787 		nfs_rw_exit(&mi->mi_recovlock);
1788 		return (1);		/* 1 second */
1789 	}
1790 
1791 	ASSERT(sp != NULL);
1792 
1793 	lease_time = sp->s_lease_time;
1794 
1795 	mutex_exit(&sp->s_lock);
1796 	nfs4_server_rele(sp);
1797 	nfs_rw_exit(&mi->mi_recovlock);
1798 
1799 	return (lease_time);
1800 }
1801 
1802 /*
1803  * Return a list with information about all the known open instances for
1804  * a filesystem. The caller must call r4releopenlist() when done with the
1805  * list.
1806  *
1807  * It is safe to look at os_valid and os_pending_close across dropping
1808  * the 'os_sync_lock' to count up the number of open streams and then
1809  * allocate memory for the osp list because:
1810  *	-Looking at os_pending_close is safe since this routine is
1811  *	only called via recovery, and os_pending_close can only be set via
1812  *	a non-recovery operation (which are all blocked when recovery
1813  *	is active).
1814  *
1815  *	-Examining os_valid is safe since non-recovery operations, which
1816  *	could potentially switch os_valid to 0, are blocked (via
1817  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1818  *	(which means we are the only recovery thread potentially acting
1819  *	on this open stream).
1820  */
1821 
1822 nfs4_opinst_t *
1823 r4mkopenlist(mntinfo4_t *mi)
1824 {
1825 	nfs4_opinst_t *reopenlist, *rep;
1826 	rnode4_t *rp;
1827 	vnode_t *vp;
1828 	vfs_t *vfsp = mi->mi_vfsp;
1829 	int numosp;
1830 	nfs4_open_stream_t *osp;
1831 	int index;
1832 	open_delegation_type4 dtype;
1833 	int hold_vnode;
1834 
1835 	reopenlist = NULL;
1836 
1837 	for (index = 0; index < rtable4size; index++) {
1838 		rw_enter(&rtable4[index].r_lock, RW_READER);
1839 		for (rp = rtable4[index].r_hashf;
1840 		    rp != (rnode4_t *)(&rtable4[index]);
1841 		    rp = rp->r_hashf) {
1842 
1843 			vp = RTOV4(rp);
1844 			if (vp->v_vfsp != vfsp)
1845 				continue;
1846 			hold_vnode = 0;
1847 
1848 			mutex_enter(&rp->r_os_lock);
1849 
1850 			/* Count the number of valid open_streams of the file */
1851 			numosp = 0;
1852 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1853 			    osp = list_next(&rp->r_open_streams, osp)) {
1854 				mutex_enter(&osp->os_sync_lock);
1855 				if (osp->os_valid && !osp->os_pending_close)
1856 					numosp++;
1857 				mutex_exit(&osp->os_sync_lock);
1858 			}
1859 
1860 			/* Fill in the valid open streams per vp */
1861 			if (numosp > 0) {
1862 				int j;
1863 
1864 				hold_vnode = 1;
1865 
1866 				/*
1867 				 * Add a new open instance to the list
1868 				 */
1869 				rep = kmem_zalloc(sizeof (*reopenlist),
1870 					KM_SLEEP);
1871 				rep->re_next = reopenlist;
1872 				reopenlist = rep;
1873 
1874 				rep->re_vp = vp;
1875 				rep->re_osp = kmem_zalloc(
1876 					numosp * sizeof (*(rep->re_osp)),
1877 					KM_SLEEP);
1878 				rep->re_numosp = numosp;
1879 
1880 				j = 0;
1881 				for (osp = list_head(&rp->r_open_streams);
1882 				    osp != NULL;
1883 				    osp = list_next(&rp->r_open_streams, osp)) {
1884 
1885 					mutex_enter(&osp->os_sync_lock);
1886 					if (osp->os_valid &&
1887 					    !osp->os_pending_close) {
1888 						osp->os_ref_count++;
1889 						rep->re_osp[j] = osp;
1890 						j++;
1891 					}
1892 					mutex_exit(&osp->os_sync_lock);
1893 				}
1894 				/*
1895 				 * Assume the valid osp(s) stayed valid between
1896 				 * the time we computed numosp and filled in j.
1897 				 */
1898 				ASSERT(j == numosp);
1899 			}
1900 
1901 			mutex_exit(&rp->r_os_lock);
1902 			/* do this here to keep v_lock > r_os_lock */
1903 			if (hold_vnode)
1904 				VN_HOLD(vp);
1905 			mutex_enter(&rp->r_statev4_lock);
1906 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1907 				/*
1908 				 * If this rnode holds a delegation but
1909 				 * there are no valid open streams, then
1910 				 * just discard the delegation without
1911 				 * doing a delegreturn.
1912 				 */
1913 				if (numosp > 0)
1914 					rp->r_deleg_needs_recovery =
1915 							rp->r_deleg_type;
1916 			}
1917 			/* Save the delegation type for use outside the lock */
1918 			dtype = rp->r_deleg_type;
1919 			mutex_exit(&rp->r_statev4_lock);
1920 
1921 			/*
1922 			 * If we have a delegation then get rid of it.
1923 			 * We've set rp->r_deleg_needs_recovery so we have
1924 			 * enough information to recover.
1925 			 */
1926 			if (dtype != OPEN_DELEGATE_NONE) {
1927 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1928 			}
1929 		}
1930 		rw_exit(&rtable4[index].r_lock);
1931 	}
1932 	return (reopenlist);
1933 }
1934 
1935 /*
1936  * Release the list of open instance references.
1937  */
1938 
1939 void
1940 r4releopenlist(nfs4_opinst_t *reopenp)
1941 {
1942 	nfs4_opinst_t *rep, *next;
1943 	int i;
1944 
1945 	for (rep = reopenp; rep; rep = next) {
1946 		next = rep->re_next;
1947 
1948 		for (i = 0; i < rep->re_numosp; i++)
1949 		    open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1950 
1951 		VN_RELE(rep->re_vp);
1952 		kmem_free(rep->re_osp,
1953 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1954 
1955 		kmem_free(rep, sizeof (*rep));
1956 	}
1957 }
1958 
1959 int
1960 nfs4_rnode_init(void)
1961 {
1962 	ulong_t nrnode4_max;
1963 	int i;
1964 
1965 	/*
1966 	 * Compute the size of the rnode4 hash table
1967 	 */
1968 	if (nrnode <= 0)
1969 		nrnode = ncsize;
1970 	nrnode4_max =
1971 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1972 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1973 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1974 		    "setting nrnode to max value of %ld", nrnode4_max);
1975 		nrnode = nrnode4_max;
1976 	}
1977 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1978 	rtable4mask = rtable4size - 1;
1979 
1980 	/*
1981 	 * Allocate and initialize the hash buckets
1982 	 */
1983 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1984 	for (i = 0; i < rtable4size; i++) {
1985 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1986 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1987 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1988 	}
1989 
1990 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1991 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1992 
1993 	return (0);
1994 }
1995 
1996 int
1997 nfs4_rnode_fini(void)
1998 {
1999 	int i;
2000 
2001 	/*
2002 	 * Deallocate the rnode hash queues
2003 	 */
2004 	kmem_cache_destroy(rnode4_cache);
2005 
2006 	for (i = 0; i < rtable4size; i++)
2007 		rw_destroy(&rtable4[i].r_lock);
2008 
2009 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
2010 
2011 	return (0);
2012 }
2013 
2014 /*
2015  * Return non-zero if the given filehandle is the root filehandle of
2016  * the filesystem to which the given rnode belongs.
2017  */
2018 
2019 static int
2020 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
2021 {
2022 	int isroot;
2023 
2024 	isroot = 0;
2025 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
2026 		isroot = 1;
2027 
2028 	return (isroot);
2029 }
2030 
2031 #ifdef DEBUG
2032 
2033 /*
2034  * Look in the rnode table for other rnodes that have the same filehandle.
2035  * Assume the lock is held for the hash chain of checkrp.
2036  */
2037 
2038 static void
2039 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
2040 {
2041 	rnode4_t *rp;
2042 	vnode_t *tvp;
2043 	nfs4_fhandle_t fh, fh2;
2044 	int index;
2045 
2046 	if (!r4_check_for_dups)
2047 		return;
2048 
2049 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
2050 
2051 	sfh4_copyval(checkrp->r_fh, &fh);
2052 
2053 	for (index = 0; index < rtable4size; index++) {
2054 
2055 		if (&rtable4[index] != checkrp->r_hashq)
2056 			rw_enter(&rtable4[index].r_lock, RW_READER);
2057 
2058 		for (rp = rtable4[index].r_hashf;
2059 		    rp != (rnode4_t *)(&rtable4[index]);
2060 		    rp = rp->r_hashf) {
2061 
2062 			if (rp == checkrp)
2063 				continue;
2064 
2065 			tvp = RTOV4(rp);
2066 			if (tvp->v_vfsp != vfsp)
2067 				continue;
2068 
2069 			sfh4_copyval(rp->r_fh, &fh2);
2070 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2071 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
2072 				    "(%p, %p)", (void *)checkrp, (void *)rp);
2073 			}
2074 		}
2075 
2076 		if (&rtable4[index] != checkrp->r_hashq)
2077 			rw_exit(&rtable4[index].r_lock);
2078 	}
2079 }
2080 
2081 #endif /* DEBUG */
2082