xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision f8bbc571)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/proc.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/socket.h>
43 #include <sys/uio.h>
44 #include <sys/tiuser.h>
45 #include <sys/swap.h>
46 #include <sys/errno.h>
47 #include <sys/debug.h>
48 #include <sys/kmem.h>
49 #include <sys/kstat.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vtrace.h>
52 #include <sys/session.h>
53 #include <sys/dnlc.h>
54 #include <sys/bitmap.h>
55 #include <sys/acl.h>
56 #include <sys/ddi.h>
57 #include <sys/pathname.h>
58 #include <sys/flock.h>
59 #include <sys/dirent.h>
60 #include <sys/flock.h>
61 #include <sys/callb.h>
62 #include <sys/sdt.h>
63 
64 #include <vm/pvn.h>
65 
66 #include <rpc/types.h>
67 #include <rpc/xdr.h>
68 #include <rpc/auth.h>
69 #include <rpc/rpcsec_gss.h>
70 #include <rpc/clnt.h>
71 
72 #include <nfs/nfs.h>
73 #include <nfs/nfs_clnt.h>
74 #include <nfs/nfs_acl.h>
75 
76 #include <nfs/nfs4.h>
77 #include <nfs/rnode4.h>
78 #include <nfs/nfs4_clnt.h>
79 
80 /*
81  * The hash queues for the access to active and cached rnodes
82  * are organized as doubly linked lists.  A reader/writer lock
83  * for each hash bucket is used to control access and to synchronize
84  * lookups, additions, and deletions from the hash queue.
85  *
86  * The rnode freelist is organized as a doubly linked list with
87  * a head pointer.  Additions and deletions are synchronized via
88  * a single mutex.
89  *
90  * In order to add an rnode to the free list, it must be hashed into
91  * a hash queue and the exclusive lock to the hash queue be held.
92  * If an rnode is not hashed into a hash queue, then it is destroyed
93  * because it represents no valuable information that can be reused
94  * about the file.  The exclusive lock to the hash queue must be
95  * held in order to prevent a lookup in the hash queue from finding
96  * the rnode and using it and assuming that the rnode is not on the
97  * freelist.  The lookup in the hash queue will have the hash queue
98  * locked, either exclusive or shared.
99  *
100  * The vnode reference count for each rnode is not allowed to drop
101  * below 1.  This prevents external entities, such as the VM
102  * subsystem, from acquiring references to vnodes already on the
103  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
105  * rnode is looked up in the hash queues, then either the rnode
106  * is removed from the freelist and that reference is transferred to
107  * the new reference or the vnode reference count must be incremented
108  * accordingly.  The mutex for the freelist must be held in order to
109  * accurately test to see if the rnode is on the freelist or not.
110  * The hash queue lock might be held shared and it is possible that
111  * two different threads may race to remove the rnode from the
112  * freelist.  This race can be resolved by holding the mutex for the
113  * freelist.  Please note that the mutex for the freelist does not
114  * need to be held if the rnode is not on the freelist.  It can not be
115  * placed on the freelist due to the requirement that the thread
116  * putting the rnode on the freelist must hold the exclusive lock
117  * to the hash queue and the thread doing the lookup in the hash
118  * queue is holding either a shared or exclusive lock to the hash
119  * queue.
120  *
121  * The lock ordering is:
122  *
123  *	hash bucket lock -> vnode lock
124  *	hash bucket lock -> freelist lock -> r_statelock
125  */
/* Array of rnode hash buckets; a bucket is selected with rtable4hash(). */
r4hashq_t *rtable4;

/* Protects rp4freelist and the r_freef/r_freeb links of rnodes on it. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/*
 * Incremented each time make_rnode4() allocates a fresh rnode; compared
 * against nrnode to decide between recycling and allocating.
 */
static long rnode4_new = 0;
/* Number of buckets in rtable4 (bound used when walking the table). */
int rtable4size;
/* Mask applied by rtable4hash() to turn a hash value into a bucket index. */
static int rtable4mask;
/* kmem cache from which rnode4_t structures are allocated. */
static struct kmem_cache *rnode4_cache;
/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * hash chain length tunable used when sizing rtable4 -- confirm.
 */
static int rnode4_hashlen = 4;

/* Forward declarations for routines private to (or defined later in) this file. */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif
162 
163 /*
164  * If the vnode has pages, run the list and check for any that are
165  * still dangling.  We call this routine before putting an rnode on
166  * the free list.
167  */
168 static int
169 nfs4_dross_pages(vnode_t *vp)
170 {
171 	page_t *pp;
172 	kmutex_t *vphm;
173 
174 	vphm = page_vnode_mutex(vp);
175 	mutex_enter(vphm);
176 	if ((pp = vp->v_pages) != NULL) {
177 		do {
178 			if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
179 			    pp->p_fsdata != C_NOCOMMIT) {
180 				mutex_exit(vphm);
181 				return (1);
182 			}
183 		} while ((pp = pp->p_vpnext) != vp->v_pages);
184 	}
185 	mutex_exit(vphm);
186 
187 	return (0);
188 }
189 
190 /*
191  * Flush any pages left on this rnode.
192  */
193 static void
194 r4flushpages(rnode4_t *rp, cred_t *cr)
195 {
196 	vnode_t *vp;
197 	int error;
198 
199 	/*
200 	 * Before freeing anything, wait until all asynchronous
201 	 * activity is done on this rnode.  This will allow all
202 	 * asynchronous read ahead and write behind i/o's to
203 	 * finish.
204 	 */
205 	mutex_enter(&rp->r_statelock);
206 	while (rp->r_count > 0)
207 		cv_wait(&rp->r_cv, &rp->r_statelock);
208 	mutex_exit(&rp->r_statelock);
209 
210 	/*
211 	 * Flush and invalidate all pages associated with the vnode.
212 	 */
213 	vp = RTOV4(rp);
214 	if (nfs4_has_pages(vp)) {
215 		ASSERT(vp->v_type != VCHR);
216 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
217 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
218 			if (error && (error == ENOSPC || error == EDQUOT)) {
219 				mutex_enter(&rp->r_statelock);
220 				if (!rp->r_error)
221 					rp->r_error = error;
222 				mutex_exit(&rp->r_statelock);
223 			}
224 		}
225 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
226 	}
227 }
228 
229 /*
230  * Free the resources associated with an rnode.
231  */
232 static void
233 r4inactive(rnode4_t *rp, cred_t *cr)
234 {
235 	vnode_t *vp;
236 	char *contents;
237 	int size;
238 	vsecattr_t *vsp;
239 	vnode_t *xattr;
240 
241 	r4flushpages(rp, cr);
242 
243 	vp = RTOV4(rp);
244 
245 	/*
246 	 * Free any held caches which may be
247 	 * associated with this rnode.
248 	 */
249 	mutex_enter(&rp->r_statelock);
250 	contents = rp->r_symlink.contents;
251 	size = rp->r_symlink.size;
252 	rp->r_symlink.contents = NULL;
253 	vsp = rp->r_secattr;
254 	rp->r_secattr = NULL;
255 	xattr = rp->r_xattr_dir;
256 	rp->r_xattr_dir = NULL;
257 	mutex_exit(&rp->r_statelock);
258 
259 	/*
260 	 * Free the access cache entries.
261 	 */
262 	(void) nfs4_access_purge_rp(rp);
263 
264 	/*
265 	 * Free the readdir cache entries.
266 	 */
267 	nfs4_purge_rddir_cache(vp);
268 
269 	/*
270 	 * Free the symbolic link cache.
271 	 */
272 	if (contents != NULL) {
273 
274 		kmem_free((void *)contents, size);
275 	}
276 
277 	/*
278 	 * Free any cached ACL.
279 	 */
280 	if (vsp != NULL)
281 		nfs4_acl_free_cache(vsp);
282 
283 	/*
284 	 * Release the cached xattr_dir
285 	 */
286 	if (xattr != NULL)
287 		VN_RELE(xattr);
288 }
289 
290 /*
291  * We have seen a case that the fh passed in is for "." which
292  * should be a VROOT node, however, the fh is different from the
293  * root fh stored in the mntinfo4_t. The invalid fh might be
294  * from a misbehaved server and will panic the client system at
295  * a later time. To avoid the panic, we drop the bad fh, use
296  * the root fh from mntinfo4_t, and print an error message
297  * for attention.
298  */
299 nfs4_sharedfh_t *
300 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
301     int *wasbad)
302 {
303 	char *s;
304 
305 	*wasbad = 0;
306 	s = fn_name(nm);
307 	ASSERT(strcmp(s, "..") != 0);
308 
309 	if ((s[0] == '.' && s[1] == '\0') && fh &&
310 	    !SFH4_SAME(mi->mi_rootfh, fh)) {
311 #ifdef DEBUG
312 		nfs4_fhandle_t fhandle;
313 
314 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
315 		    "Server %s returns a different "
316 		    "root filehandle for the path %s:",
317 		    mi->mi_curr_serv->sv_hostname,
318 		    mi->mi_curr_serv->sv_path);
319 
320 		/* print the bad fh */
321 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
322 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
323 		    fhandle.fh_len);
324 		nfs4_printfhandle(&fhandle);
325 
326 		/* print mi_rootfh */
327 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
328 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
329 		    fhandle.fh_len);
330 		nfs4_printfhandle(&fhandle);
331 #endif
332 		/* use mi_rootfh instead; fh will be rele by the caller */
333 		fh = mi->mi_rootfh;
334 		*wasbad = 1;
335 	}
336 
337 	kmem_free(s, MAXNAMELEN);
338 	return (fh);
339 }
340 
341 void
342 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
343     hrtime_t t, cred_t *cr, int index)
344 {
345 	int is_stub;
346 	vattr_t *attr;
347 	/*
348 	 * Don't add to attrcache if time overflow, but
349 	 * no need to check because either attr is null or the time
350 	 * values in it were processed by nfs4_time_ntov(), which checks
351 	 * for time overflows.
352 	 */
353 	attr = garp ? &garp->n4g_va : NULL;
354 
355 	if (attr) {
356 		if (!newnode) {
357 			rw_exit(&rtable4[index].r_lock);
358 #ifdef DEBUG
359 			if (vp->v_type != attr->va_type &&
360 			    vp->v_type != VNON && attr->va_type != VNON) {
361 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
362 				    "makenfs4node: type (%d) doesn't "
363 				    "match type of found node at %p (%d)",
364 				    attr->va_type, (void *)vp, vp->v_type);
365 			}
366 #endif
367 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
368 		} else {
369 			rnode4_t *rp = VTOR4(vp);
370 
371 			vp->v_type = attr->va_type;
372 			vp->v_rdev = attr->va_rdev;
373 
374 			/*
375 			 * Turn this object into a "stub" object if we
376 			 * crossed an underlying server fs boundary.
377 			 * To make this check, during mount we save the
378 			 * fsid of the server object being mounted.
379 			 * Here we compare this object's server fsid
380 			 * with the fsid we saved at mount.  If they
381 			 * are different, we crossed server fs boundary.
382 			 *
383 			 * The stub type is set (or not) at rnode
384 			 * creation time and it never changes for life
385 			 * of the rnode.
386 			 *
387 			 * This stub will be for a mirror-mount, rather than
388 			 * a referral (the latter also sets R4SRVSTUB).
389 			 *
390 			 * The stub type is also set during RO failover,
391 			 * nfs4_remap_file().
392 			 *
393 			 * We don't bother with taking r_state_lock to
394 			 * set the stub type because this is a new rnode
395 			 * and we're holding the hash bucket r_lock RW_WRITER.
396 			 * No other thread could have obtained access
397 			 * to this rnode.
398 			 */
399 			is_stub = 0;
400 			if (garp->n4g_fsid_valid) {
401 				fattr4_fsid ga_fsid = garp->n4g_fsid;
402 				servinfo4_t *svp = rp->r_server;
403 
404 				rp->r_srv_fsid = ga_fsid;
405 
406 				(void) nfs_rw_enter_sig(&svp->sv_lock,
407 				    RW_READER, 0);
408 				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
409 					is_stub = 1;
410 				nfs_rw_exit(&svp->sv_lock);
411 			}
412 
413 			if (is_stub)
414 				r4_stub_mirrormount(rp);
415 			else
416 				r4_stub_none(rp);
417 
418 			/* Can not cache partial attr */
419 			if (attr->va_mask == AT_ALL)
420 				nfs4_attrcache_noinval(vp, garp, t);
421 			else
422 				PURGE_ATTRCACHE4(vp);
423 
424 			rw_exit(&rtable4[index].r_lock);
425 		}
426 	} else {
427 		if (newnode) {
428 			PURGE_ATTRCACHE4(vp);
429 		}
430 		rw_exit(&rtable4[index].r_lock);
431 	}
432 }
433 
434 /*
435  * Find or create an rnode based primarily on filehandle.  To be
436  * used when dvp (vnode for parent directory) is not available;
437  * otherwise, makenfs4node() should be used.
438  *
439  * The nfs4_fname_t argument *npp is consumed and nulled out.
440  */
441 
442 vnode_t *
443 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
444     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
445     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
446 {
447 	vfs_t *vfsp = mi->mi_vfsp;
448 	int newnode = 0;
449 	vnode_t *vp;
450 	rnode4_t *rp;
451 	svnode_t *svp;
452 	nfs4_fname_t *name, *svpname;
453 	int index;
454 
455 	ASSERT(npp && *npp);
456 	name = *npp;
457 	*npp = NULL;
458 
459 	index = rtable4hash(sfh);
460 	rw_enter(&rtable4[index].r_lock, RW_READER);
461 
462 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
463 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
464 
465 	svp = VTOSV(vp);
466 	rp = VTOR4(vp);
467 	if (newnode) {
468 		svp->sv_forw = svp->sv_back = svp;
469 		svp->sv_name = name;
470 		if (psfh != NULL)
471 			sfh4_hold(psfh);
472 		svp->sv_dfh = psfh;
473 	} else {
474 		/*
475 		 * It is possible that due to a server
476 		 * side rename fnames have changed.
477 		 * update the fname here.
478 		 */
479 		mutex_enter(&rp->r_svlock);
480 		svpname = svp->sv_name;
481 		if (svp->sv_name != name) {
482 			svp->sv_name = name;
483 			mutex_exit(&rp->r_svlock);
484 			fn_rele(&svpname);
485 		} else {
486 			mutex_exit(&rp->r_svlock);
487 			fn_rele(&name);
488 		}
489 	}
490 
491 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
492 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
493 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
494 
495 	return (vp);
496 }
497 
498 /*
499  * Find or create a vnode for the given filehandle, filesystem, parent, and
500  * name.  The reference to nm is consumed, so the caller must first do an
501  * fn_hold() if it wants to continue using nm after this call.
502  */
503 vnode_t *
504 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
505     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
506 {
507 	vnode_t *vp;
508 	int newnode;
509 	int index;
510 	mntinfo4_t *mi = VFTOMI4(vfsp);
511 	int had_badfh = 0;
512 	rnode4_t *rp;
513 
514 	ASSERT(dvp != NULL);
515 
516 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
517 
518 	index = rtable4hash(fh);
519 	rw_enter(&rtable4[index].r_lock, RW_READER);
520 
521 	/*
522 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
523 	 */
524 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
525 	    nfs4_putapage, &newnode, cr);
526 
527 	rp = VTOR4(vp);
528 	sv_activate(&vp, dvp, &nm, newnode);
529 	if (dvp->v_flag & V_XATTRDIR) {
530 		mutex_enter(&rp->r_statelock);
531 		rp->r_flags |= R4ISXATTR;
532 		mutex_exit(&rp->r_statelock);
533 	}
534 
535 	/* if getting a bad file handle, do not cache the attributes. */
536 	if (had_badfh) {
537 		rw_exit(&rtable4[index].r_lock);
538 		return (vp);
539 	}
540 
541 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
542 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
543 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
544 
545 	return (vp);
546 }
547 
548 /*
549  * Hash on address of filehandle object.
550  * XXX totally untuned.
551  */
552 
553 int
554 rtable4hash(nfs4_sharedfh_t *fh)
555 {
556 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
557 }
558 
559 /*
560  * Find or create the vnode for the given filehandle and filesystem.
561  * *newnode is set to zero if the vnode already existed; non-zero if it had
562  * to be created.
563  *
564  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
565  */
566 
567 static vnode_t *
568 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
569     struct vnodeops *vops,
570     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
571     int *newnode, cred_t *cr)
572 {
573 	rnode4_t *rp;
574 	rnode4_t *trp;
575 	vnode_t *vp;
576 	mntinfo4_t *mi;
577 
578 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
579 
580 	mi = VFTOMI4(vfsp);
581 
582 start:
583 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
584 		vp = RTOV4(rp);
585 		*newnode = 0;
586 		return (vp);
587 	}
588 	rw_exit(&rhtp->r_lock);
589 
590 	mutex_enter(&rp4freelist_lock);
591 
592 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
593 		rp = rp4freelist;
594 		rp4_rmfree(rp);
595 		mutex_exit(&rp4freelist_lock);
596 
597 		vp = RTOV4(rp);
598 
599 		if (rp->r_flags & R4HASHED) {
600 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
601 			mutex_enter(&vp->v_lock);
602 			if (vp->v_count > 1) {
603 				vp->v_count--;
604 				mutex_exit(&vp->v_lock);
605 				rw_exit(&rp->r_hashq->r_lock);
606 				rw_enter(&rhtp->r_lock, RW_READER);
607 				goto start;
608 			}
609 			mutex_exit(&vp->v_lock);
610 			rp4_rmhash_locked(rp);
611 			rw_exit(&rp->r_hashq->r_lock);
612 		}
613 
614 		r4inactive(rp, cr);
615 
616 		mutex_enter(&vp->v_lock);
617 		if (vp->v_count > 1) {
618 			vp->v_count--;
619 			mutex_exit(&vp->v_lock);
620 			rw_enter(&rhtp->r_lock, RW_READER);
621 			goto start;
622 		}
623 		mutex_exit(&vp->v_lock);
624 		vn_invalid(vp);
625 
626 		/*
627 		 * destroy old locks before bzero'ing and
628 		 * recreating the locks below.
629 		 */
630 		uninit_rnode4(rp);
631 
632 		/*
633 		 * Make sure that if rnode is recycled then
634 		 * VFS count is decremented properly before
635 		 * reuse.
636 		 */
637 		VFS_RELE(vp->v_vfsp);
638 		vn_reinit(vp);
639 	} else {
640 		vnode_t *new_vp;
641 
642 		mutex_exit(&rp4freelist_lock);
643 
644 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
645 		new_vp = vn_alloc(KM_SLEEP);
646 
647 		atomic_add_long((ulong_t *)&rnode4_new, 1);
648 #ifdef DEBUG
649 		clstat4_debug.nrnode.value.ui64++;
650 #endif
651 		vp = new_vp;
652 	}
653 
654 	bzero(rp, sizeof (*rp));
655 	rp->r_vnode = vp;
656 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
657 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
658 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
659 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
660 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
661 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
662 	rp->created_v4 = 0;
663 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
664 	    offsetof(nfs4_open_stream_t, os_node));
665 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
666 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
667 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
668 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
669 	rp->r_flags = R4READDIRWATTR;
670 	rp->r_fh = fh;
671 	rp->r_hashq = rhtp;
672 	sfh4_hold(rp->r_fh);
673 	rp->r_server = mi->mi_curr_serv;
674 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
675 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
676 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
677 
678 	rddir4_cache_create(rp);
679 	rp->r_putapage = putapage;
680 	vn_setops(vp, vops);
681 	vp->v_data = (caddr_t)rp;
682 	vp->v_vfsp = vfsp;
683 	VFS_HOLD(vfsp);
684 	vp->v_type = VNON;
685 	vp->v_flag |= VMODSORT;
686 	if (isrootfh(fh, rp))
687 		vp->v_flag = VROOT;
688 	vn_exists(vp);
689 
690 	/*
691 	 * There is a race condition if someone else
692 	 * alloc's the rnode while no locks are held, so we
693 	 * check again and recover if found.
694 	 */
695 	rw_enter(&rhtp->r_lock, RW_WRITER);
696 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
697 		vp = RTOV4(trp);
698 		*newnode = 0;
699 		rw_exit(&rhtp->r_lock);
700 		rp4_addfree(rp, cr);
701 		rw_enter(&rhtp->r_lock, RW_READER);
702 		return (vp);
703 	}
704 	rp4_addhash(rp);
705 	*newnode = 1;
706 	return (vp);
707 }
708 
709 static void
710 uninit_rnode4(rnode4_t *rp)
711 {
712 	vnode_t *vp = RTOV4(rp);
713 
714 	ASSERT(rp != NULL);
715 	ASSERT(vp != NULL);
716 	ASSERT(vp->v_count == 1);
717 	ASSERT(rp->r_count == 0);
718 	ASSERT(rp->r_mapcnt == 0);
719 	if (rp->r_flags & R4LODANGLERS) {
720 		nfs4_flush_lock_owners(rp);
721 	}
722 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
723 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
724 	ASSERT(!(rp->r_flags & R4HASHED));
725 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
726 	nfs4_clear_open_streams(rp);
727 	list_destroy(&rp->r_open_streams);
728 
729 	/*
730 	 * Destroy the rddir cache first since we need to grab the r_statelock.
731 	 */
732 	mutex_enter(&rp->r_statelock);
733 	rddir4_cache_destroy(rp);
734 	mutex_exit(&rp->r_statelock);
735 	sv_uninit(&rp->r_svnode);
736 	sfh4_rele(&rp->r_fh);
737 	nfs_rw_destroy(&rp->r_rwlock);
738 	nfs_rw_destroy(&rp->r_lkserlock);
739 	mutex_destroy(&rp->r_statelock);
740 	mutex_destroy(&rp->r_statev4_lock);
741 	mutex_destroy(&rp->r_os_lock);
742 	cv_destroy(&rp->r_cv);
743 	cv_destroy(&rp->r_commit.c_cv);
744 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
745 	if (rp->r_flags & R4DELMAPLIST)
746 		list_destroy(&rp->r_indelmap);
747 }
748 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller passes in one reference on the rnode's vnode.  On every
 * path that reference is either given back (v_count decremented because
 * another holder exists), consumed by destroying the rnode, or kept as
 * the freelist's reference.  cr is used for any page flushing or CLOSE
 * traffic needed along the way.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			/*
			 * Unhash under the exclusive bucket lock, but only
			 * if ours is the last reference; otherwise just
			 * drop our reference and let the other holder deal
			 * with the rnode.
			 */
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each of the checks below that has to drop the hash queue
	 * lock to do its work jumps back here afterwards, so all of
	 * the checks are redone from the top with the lock held.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are not anymore references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/*
		 * When running in a zone other than the mount's zone the
		 * open streams are only cleared locally; otherwise
		 * nfs4close_all() is used to close them.
		 */
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit of all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 *
	 * The xattr directory is only detached here; its reference is
	 * released at the very end, after all locks have been dropped.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * The rnode is linked in just before the current head of the
	 * circular list, i.e. at the tail.
	 *
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		/* Moving the head pointer makes this rnode the front. */
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}
935 
936 /*
937  * Remove an rnode from the free list.
938  *
939  * The caller must be holding rp4freelist_lock and the rnode
940  * must be on the freelist.
941  */
942 static void
943 rp4_rmfree(rnode4_t *rp)
944 {
945 
946 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
947 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
948 
949 	if (rp == rp4freelist) {
950 		rp4freelist = rp->r_freef;
951 		if (rp == rp4freelist)
952 			rp4freelist = NULL;
953 	}
954 	rp->r_freeb->r_freef = rp->r_freef;
955 	rp->r_freef->r_freeb = rp->r_freeb;
956 
957 	rp->r_freef = rp->r_freeb = NULL;
958 }
959 
960 /*
961  * Put a rnode in the hash table.
962  *
963  * The caller must be holding the exclusive hash queue lock
964  */
965 void
966 rp4_addhash(rnode4_t *rp)
967 {
968 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
969 	ASSERT(!(rp->r_flags & R4HASHED));
970 
971 #ifdef DEBUG
972 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
973 #endif
974 
975 	rp->r_hashf = rp->r_hashq->r_hashf;
976 	rp->r_hashq->r_hashf = rp;
977 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
978 	rp->r_hashf->r_hashb = rp;
979 
980 	mutex_enter(&rp->r_statelock);
981 	rp->r_flags |= R4HASHED;
982 	mutex_exit(&rp->r_statelock);
983 }
984 
985 /*
986  * Remove a rnode from the hash table.
987  *
988  * The caller must be holding the hash queue lock.
989  */
990 void
991 rp4_rmhash_locked(rnode4_t *rp)
992 {
993 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
994 	ASSERT(rp->r_flags & R4HASHED);
995 
996 	rp->r_hashb->r_hashf = rp->r_hashf;
997 	rp->r_hashf->r_hashb = rp->r_hashb;
998 
999 	mutex_enter(&rp->r_statelock);
1000 	rp->r_flags &= ~R4HASHED;
1001 	mutex_exit(&rp->r_statelock);
1002 }
1003 
1004 /*
1005  * Remove a rnode from the hash table.
1006  *
1007  * The caller must not be holding the hash queue lock.
1008  */
1009 void
1010 rp4_rmhash(rnode4_t *rp)
1011 {
1012 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1013 	rp4_rmhash_locked(rp);
1014 	rw_exit(&rp->r_hashq->r_lock);
1015 }
1016 
1017 /*
1018  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1019  * Returns NULL if no match.  If an rnode is returned, the reference count
1020  * on the master vnode is incremented.
1021  *
1022  * The caller must be holding the hash queue lock, either shared or exclusive.
1023  */
1024 rnode4_t *
1025 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1026 {
1027 	rnode4_t *rp;
1028 	vnode_t *vp;
1029 
1030 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1031 
1032 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1033 		vp = RTOV4(rp);
1034 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1035 
1036 			mutex_enter(&rp->r_statelock);
1037 			if (rp->r_flags & R4RECOVERR) {
1038 				mutex_exit(&rp->r_statelock);
1039 				continue;
1040 			}
1041 			mutex_exit(&rp->r_statelock);
1042 #ifdef DEBUG
1043 			r4_dup_check(rp, vfsp);
1044 #endif
1045 			if (rp->r_freef != NULL) {
1046 				mutex_enter(&rp4freelist_lock);
1047 				/*
1048 				 * If the rnode is on the freelist,
1049 				 * then remove it and use that reference
1050 				 * as the new reference.  Otherwise,
1051 				 * need to increment the reference count.
1052 				 */
1053 				if (rp->r_freef != NULL) {
1054 					rp4_rmfree(rp);
1055 					mutex_exit(&rp4freelist_lock);
1056 				} else {
1057 					mutex_exit(&rp4freelist_lock);
1058 					VN_HOLD(vp);
1059 				}
1060 			} else
1061 				VN_HOLD(vp);
1062 
1063 			/*
1064 			 * if root vnode, set v_flag to indicate that
1065 			 */
1066 			if (isrootfh(fh, rp)) {
1067 				if (!(vp->v_flag & VROOT)) {
1068 					mutex_enter(&vp->v_lock);
1069 					vp->v_flag |= VROOT;
1070 					mutex_exit(&vp->v_lock);
1071 				}
1072 			}
1073 			return (rp);
1074 		}
1075 	}
1076 	return (NULL);
1077 }
1078 
1079 /*
1080  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1081  * that assumes the caller hasn't already got the lock
1082  * on the hash bucket.
1083  */
1084 rnode4_t *
1085 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1086 {
1087 	rnode4_t *rp;
1088 	int index;
1089 
1090 	index = rtable4hash(fh);
1091 	rw_enter(&rtable4[index].r_lock, RW_READER);
1092 	rp = r4find(&rtable4[index], fh, vfsp);
1093 	rw_exit(&rtable4[index].r_lock);
1094 
1095 	return (rp);
1096 }
1097 
1098 /*
1099  * Return >0 if there is a active vnode belonging to this vfs in the
1100  * rtable4 cache.
1101  *
1102  * Several of these checks are done without holding the usual
1103  * locks.  This is safe because destroy_rtable(), rp_addfree(),
1104  * etc. will redo the necessary checks before actually destroying
1105  * any rnodes.
1106  */
1107 int
1108 check_rtable4(struct vfs *vfsp)
1109 {
1110 	rnode4_t *rp;
1111 	vnode_t *vp;
1112 	int busy = NFSV4_RTABLE4_OK;
1113 	int index;
1114 
1115 	for (index = 0; index < rtable4size; index++) {
1116 		rw_enter(&rtable4[index].r_lock, RW_READER);
1117 
1118 		for (rp = rtable4[index].r_hashf;
1119 		    rp != (rnode4_t *)(&rtable4[index]);
1120 		    rp = rp->r_hashf) {
1121 
1122 			vp = RTOV4(rp);
1123 			if (vp->v_vfsp == vfsp) {
1124 				if (rp->r_freef == NULL) {
1125 					busy = NFSV4_RTABLE4_NOT_FREE_LIST;
1126 				} else if (nfs4_has_pages(vp) &&
1127 				    (rp->r_flags & R4DIRTY)) {
1128 					busy = NFSV4_RTABLE4_DIRTY_PAGES;
1129 				} else if (rp->r_count > 0) {
1130 					busy = NFSV4_RTABLE4_POS_R_COUNT;
1131 				}
1132 
1133 				if (busy != NFSV4_RTABLE4_OK) {
1134 #ifdef DEBUG
1135 					char *path;
1136 
1137 					path = fn_path(rp->r_svnode.sv_name);
1138 					DTRACE_NFSV4_3(rnode__e__debug,
1139 					    int, busy, char *, path,
1140 					    rnode4_t *, rp);
1141 					kmem_free(path, strlen(path)+1);
1142 #endif
1143 					rw_exit(&rtable4[index].r_lock);
1144 					return (busy);
1145 				}
1146 			}
1147 		}
1148 		rw_exit(&rtable4[index].r_lock);
1149 	}
1150 	return (busy);
1151 }
1152 
1153 /*
1154  * Destroy inactive vnodes from the hash queues which
1155  * belong to this vfs. All of the vnodes should be inactive.
1156  * It is essential that we destroy all rnodes in case of
1157  * forced unmount as well as in normal unmount case.
1158  */
1159 
1160 void
1161 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1162 {
1163 	int index;
1164 	vnode_t *vp;
1165 	rnode4_t *rp, *r_hashf, *rlist;
1166 
1167 	rlist = NULL;
1168 
1169 	for (index = 0; index < rtable4size; index++) {
1170 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1171 		for (rp = rtable4[index].r_hashf;
1172 		    rp != (rnode4_t *)(&rtable4[index]);
1173 		    rp = r_hashf) {
1174 			/* save the hash pointer before destroying */
1175 			r_hashf = rp->r_hashf;
1176 
1177 			vp = RTOV4(rp);
1178 			if (vp->v_vfsp == vfsp) {
1179 				mutex_enter(&rp4freelist_lock);
1180 				if (rp->r_freef != NULL) {
1181 					rp4_rmfree(rp);
1182 					mutex_exit(&rp4freelist_lock);
1183 					rp4_rmhash_locked(rp);
1184 					rp->r_hashf = rlist;
1185 					rlist = rp;
1186 				} else
1187 					mutex_exit(&rp4freelist_lock);
1188 			}
1189 		}
1190 		rw_exit(&rtable4[index].r_lock);
1191 	}
1192 
1193 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1194 		r_hashf = rp->r_hashf;
1195 		/*
1196 		 * This call to rp4_addfree will end up destroying the
1197 		 * rnode, but in a safe way with the appropriate set
1198 		 * of checks done.
1199 		 */
1200 		rp4_addfree(rp, cr);
1201 	}
1202 }
1203 
1204 /*
1205  * This routine destroys all the resources of an rnode
1206  * and finally the rnode itself.
1207  */
1208 static void
1209 destroy_rnode4(rnode4_t *rp)
1210 {
1211 	vnode_t *vp;
1212 	vfs_t *vfsp;
1213 
1214 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1215 
1216 	vp = RTOV4(rp);
1217 	vfsp = vp->v_vfsp;
1218 
1219 	uninit_rnode4(rp);
1220 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1221 #ifdef DEBUG
1222 	clstat4_debug.nrnode.value.ui64--;
1223 #endif
1224 	kmem_cache_free(rnode4_cache, rp);
1225 	vn_invalid(vp);
1226 	vn_free(vp);
1227 	VFS_RELE(vfsp);
1228 }
1229 
1230 /*
1231  * Invalidate the attributes on all rnodes forcing the next getattr
1232  * to go over the wire.  Used to flush stale uid and gid mappings.
1233  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1234  */
1235 void
1236 nfs4_rnode_invalidate(struct vfs *vfsp)
1237 {
1238 	int index;
1239 	rnode4_t *rp;
1240 	vnode_t *vp;
1241 
1242 	/*
1243 	 * Walk the hash queues looking for rnodes.
1244 	 */
1245 	for (index = 0; index < rtable4size; index++) {
1246 		rw_enter(&rtable4[index].r_lock, RW_READER);
1247 		for (rp = rtable4[index].r_hashf;
1248 		    rp != (rnode4_t *)(&rtable4[index]);
1249 		    rp = rp->r_hashf) {
1250 			vp = RTOV4(rp);
1251 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1252 				continue;
1253 
1254 			if (!mutex_tryenter(&rp->r_statelock))
1255 				continue;
1256 
1257 			/*
1258 			 * Expire the attributes by resetting the change
1259 			 * and attr timeout.
1260 			 */
1261 			rp->r_change = 0;
1262 			PURGE_ATTRCACHE4_LOCKED(rp);
1263 			mutex_exit(&rp->r_statelock);
1264 		}
1265 		rw_exit(&rtable4[index].r_lock);
1266 	}
1267 }
1268 
1269 /*
1270  * Flush all vnodes in this (or every) vfs.
1271  * Used by nfs_sync and by nfs_unmount.
1272  */
1273 void
1274 r4flush(struct vfs *vfsp, cred_t *cr)
1275 {
1276 	int index;
1277 	rnode4_t *rp;
1278 	vnode_t *vp, **vplist;
1279 	long num, cnt;
1280 
1281 	/*
1282 	 * Check to see whether there is anything to do.
1283 	 */
1284 	num = rnode4_new;
1285 	if (num == 0)
1286 		return;
1287 
1288 	/*
1289 	 * Allocate a slot for all currently active rnodes on the
1290 	 * supposition that they all may need flushing.
1291 	 */
1292 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1293 	cnt = 0;
1294 
1295 	/*
1296 	 * Walk the hash queues looking for rnodes with page
1297 	 * lists associated with them.  Make a list of these
1298 	 * files.
1299 	 */
1300 	for (index = 0; index < rtable4size; index++) {
1301 		rw_enter(&rtable4[index].r_lock, RW_READER);
1302 		for (rp = rtable4[index].r_hashf;
1303 		    rp != (rnode4_t *)(&rtable4[index]);
1304 		    rp = rp->r_hashf) {
1305 			vp = RTOV4(rp);
1306 			/*
1307 			 * Don't bother sync'ing a vp if it
1308 			 * is part of virtual swap device or
1309 			 * if VFS is read-only
1310 			 */
1311 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1312 				continue;
1313 			/*
1314 			 * If flushing all mounted file systems or
1315 			 * the vnode belongs to this vfs, has pages
1316 			 * and is marked as either dirty or mmap'd,
1317 			 * hold and add this vnode to the list of
1318 			 * vnodes to flush.
1319 			 */
1320 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1321 			    nfs4_has_pages(vp) &&
1322 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1323 				VN_HOLD(vp);
1324 				vplist[cnt++] = vp;
1325 				if (cnt == num) {
1326 					rw_exit(&rtable4[index].r_lock);
1327 					goto toomany;
1328 				}
1329 			}
1330 		}
1331 		rw_exit(&rtable4[index].r_lock);
1332 	}
1333 toomany:
1334 
1335 	/*
1336 	 * Flush and release all of the files on the list.
1337 	 */
1338 	while (cnt-- > 0) {
1339 		vp = vplist[cnt];
1340 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1341 		VN_RELE(vp);
1342 	}
1343 
1344 	/*
1345 	 * Free the space allocated to hold the list.
1346 	 */
1347 	kmem_free(vplist, num * sizeof (*vplist));
1348 }
1349 
1350 int
1351 nfs4_free_data_reclaim(rnode4_t *rp)
1352 {
1353 	char *contents;
1354 	vnode_t *xattr;
1355 	int size;
1356 	vsecattr_t *vsp;
1357 	int freed;
1358 	bool_t rdc = FALSE;
1359 
1360 	/*
1361 	 * Free any held caches which may
1362 	 * be associated with this rnode.
1363 	 */
1364 	mutex_enter(&rp->r_statelock);
1365 	if (rp->r_dir != NULL)
1366 		rdc = TRUE;
1367 	contents = rp->r_symlink.contents;
1368 	size = rp->r_symlink.size;
1369 	rp->r_symlink.contents = NULL;
1370 	vsp = rp->r_secattr;
1371 	rp->r_secattr = NULL;
1372 	xattr = rp->r_xattr_dir;
1373 	rp->r_xattr_dir = NULL;
1374 	mutex_exit(&rp->r_statelock);
1375 
1376 	/*
1377 	 * Free the access cache entries.
1378 	 */
1379 	freed = nfs4_access_purge_rp(rp);
1380 
1381 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1382 		return (freed);
1383 
1384 	/*
1385 	 * Free the readdir cache entries, incompletely if we can't block.
1386 	 */
1387 	nfs4_purge_rddir_cache(RTOV4(rp));
1388 
1389 	/*
1390 	 * Free the symbolic link cache.
1391 	 */
1392 	if (contents != NULL) {
1393 
1394 		kmem_free((void *)contents, size);
1395 	}
1396 
1397 	/*
1398 	 * Free any cached ACL.
1399 	 */
1400 	if (vsp != NULL)
1401 		nfs4_acl_free_cache(vsp);
1402 
1403 	/*
1404 	 * Release the xattr directory vnode
1405 	 */
1406 	if (xattr != NULL)
1407 		VN_RELE(xattr);
1408 
1409 	return (1);
1410 }
1411 
1412 static int
1413 nfs4_active_data_reclaim(rnode4_t *rp)
1414 {
1415 	char *contents;
1416 	vnode_t *xattr = NULL;
1417 	int size;
1418 	vsecattr_t *vsp;
1419 	int freed;
1420 	bool_t rdc = FALSE;
1421 
1422 	/*
1423 	 * Free any held credentials and caches which
1424 	 * may be associated with this rnode.
1425 	 */
1426 	if (!mutex_tryenter(&rp->r_statelock))
1427 		return (0);
1428 	contents = rp->r_symlink.contents;
1429 	size = rp->r_symlink.size;
1430 	rp->r_symlink.contents = NULL;
1431 	vsp = rp->r_secattr;
1432 	rp->r_secattr = NULL;
1433 	if (rp->r_dir != NULL)
1434 		rdc = TRUE;
1435 	/*
1436 	 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1437 	 * on the same r_hashq queue. We are not mandated to free all caches.
1438 	 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1439 	 * rnode 'rp' is freed or put on the free list.
1440 	 *
1441 	 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1442 	 * - it has no associated rnode4_t (its v_data is NULL),
1443 	 * - it is preallocated statically and will never go away,
1444 	 * so we cannot save anything by releasing it.
1445 	 */
1446 	if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1447 	    VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1448 		xattr = rp->r_xattr_dir;
1449 		rp->r_xattr_dir = NULL;
1450 	}
1451 	mutex_exit(&rp->r_statelock);
1452 
1453 	/*
1454 	 * Free the access cache entries.
1455 	 */
1456 	freed = nfs4_access_purge_rp(rp);
1457 
1458 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1459 		return (freed);
1460 
1461 	/*
1462 	 * Free the symbolic link cache.
1463 	 */
1464 	if (contents != NULL) {
1465 
1466 		kmem_free((void *)contents, size);
1467 	}
1468 
1469 	/*
1470 	 * Free any cached ACL.
1471 	 */
1472 	if (vsp != NULL)
1473 		nfs4_acl_free_cache(vsp);
1474 
1475 	nfs4_purge_rddir_cache(RTOV4(rp));
1476 
1477 	/*
1478 	 * Release the xattr directory vnode
1479 	 */
1480 	if (xattr != NULL)
1481 		VN_RELE(xattr);
1482 
1483 	return (1);
1484 }
1485 
1486 static int
1487 nfs4_free_reclaim(void)
1488 {
1489 	int freed;
1490 	rnode4_t *rp;
1491 
1492 #ifdef DEBUG
1493 	clstat4_debug.f_reclaim.value.ui64++;
1494 #endif
1495 	freed = 0;
1496 	mutex_enter(&rp4freelist_lock);
1497 	rp = rp4freelist;
1498 	if (rp != NULL) {
1499 		do {
1500 			if (nfs4_free_data_reclaim(rp))
1501 				freed = 1;
1502 		} while ((rp = rp->r_freef) != rp4freelist);
1503 	}
1504 	mutex_exit(&rp4freelist_lock);
1505 	return (freed);
1506 }
1507 
1508 static int
1509 nfs4_active_reclaim(void)
1510 {
1511 	int freed;
1512 	int index;
1513 	rnode4_t *rp;
1514 
1515 #ifdef DEBUG
1516 	clstat4_debug.a_reclaim.value.ui64++;
1517 #endif
1518 	freed = 0;
1519 	for (index = 0; index < rtable4size; index++) {
1520 		rw_enter(&rtable4[index].r_lock, RW_READER);
1521 		for (rp = rtable4[index].r_hashf;
1522 		    rp != (rnode4_t *)(&rtable4[index]);
1523 		    rp = rp->r_hashf) {
1524 			if (nfs4_active_data_reclaim(rp))
1525 				freed = 1;
1526 		}
1527 		rw_exit(&rtable4[index].r_lock);
1528 	}
1529 	return (freed);
1530 }
1531 
1532 static int
1533 nfs4_rnode_reclaim(void)
1534 {
1535 	int freed;
1536 	rnode4_t *rp;
1537 	vnode_t *vp;
1538 
1539 #ifdef DEBUG
1540 	clstat4_debug.r_reclaim.value.ui64++;
1541 #endif
1542 	freed = 0;
1543 	mutex_enter(&rp4freelist_lock);
1544 	while ((rp = rp4freelist) != NULL) {
1545 		rp4_rmfree(rp);
1546 		mutex_exit(&rp4freelist_lock);
1547 		if (rp->r_flags & R4HASHED) {
1548 			vp = RTOV4(rp);
1549 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1550 			mutex_enter(&vp->v_lock);
1551 			if (vp->v_count > 1) {
1552 				vp->v_count--;
1553 				mutex_exit(&vp->v_lock);
1554 				rw_exit(&rp->r_hashq->r_lock);
1555 				mutex_enter(&rp4freelist_lock);
1556 				continue;
1557 			}
1558 			mutex_exit(&vp->v_lock);
1559 			rp4_rmhash_locked(rp);
1560 			rw_exit(&rp->r_hashq->r_lock);
1561 		}
1562 		/*
1563 		 * This call to rp_addfree will end up destroying the
1564 		 * rnode, but in a safe way with the appropriate set
1565 		 * of checks done.
1566 		 */
1567 		rp4_addfree(rp, CRED());
1568 		mutex_enter(&rp4freelist_lock);
1569 	}
1570 	mutex_exit(&rp4freelist_lock);
1571 	return (freed);
1572 }
1573 
/*
 * Reclaim callback registered with rnode4_cache.  Works in escalating
 * stages and stops at the first stage that reports freeing something:
 *	1. cached data of rnodes on the freelist,
 *	2. cached data of rnodes still in the hash table,
 *	3. the freelist rnodes themselves.
 * The callback argument is unused.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	/* Short-circuit evaluation keeps the stages in order. */
	if (!nfs4_free_reclaim() && !nfs4_active_reclaim())
		(void) nfs4_rnode_reclaim();
}
1589 
1590 /*
1591  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1592  * clientid can change if the caller drops mi_recovlock.
1593  */
1594 
1595 clientid4
1596 mi2clientid(mntinfo4_t *mi)
1597 {
1598 	nfs4_server_t	*sp;
1599 	clientid4	clientid = 0;
1600 
1601 	/* this locks down sp if it is found */
1602 	sp = find_nfs4_server(mi);
1603 	if (sp != NULL) {
1604 		clientid = sp->clientid;
1605 		mutex_exit(&sp->s_lock);
1606 		nfs4_server_rele(sp);
1607 	}
1608 	return (clientid);
1609 }
1610 
1611 /*
1612  * Return the current lease time for the server associated with the given
1613  * file.  Note that the lease time could change immediately after this
1614  * call.
1615  */
1616 
1617 time_t
1618 r2lease_time(rnode4_t *rp)
1619 {
1620 	nfs4_server_t	*sp;
1621 	time_t		lease_time;
1622 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1623 
1624 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1625 
1626 	/* this locks down sp if it is found */
1627 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1628 
1629 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1630 		if (sp != NULL) {
1631 			mutex_exit(&sp->s_lock);
1632 			nfs4_server_rele(sp);
1633 		}
1634 		nfs_rw_exit(&mi->mi_recovlock);
1635 		return (1);		/* 1 second */
1636 	}
1637 
1638 	ASSERT(sp != NULL);
1639 
1640 	lease_time = sp->s_lease_time;
1641 
1642 	mutex_exit(&sp->s_lock);
1643 	nfs4_server_rele(sp);
1644 	nfs_rw_exit(&mi->mi_recovlock);
1645 
1646 	return (lease_time);
1647 }
1648 
1649 /*
1650  * Return a list with information about all the known open instances for
1651  * a filesystem. The caller must call r4releopenlist() when done with the
1652  * list.
1653  *
1654  * We are safe at looking at os_valid and os_pending_close across dropping
1655  * the 'os_sync_lock' to count up the number of open streams and then
1656  * allocate memory for the osp list due to:
1657  *	-Looking at os_pending_close is safe since this routine is
1658  *	only called via recovery, and os_pending_close can only be set via
1659  *	a non-recovery operation (which are all blocked when recovery
1660  *	is active).
1661  *
1662  *	-Examining os_valid is safe since non-recovery operations, which
1663  *	could potentially switch os_valid to 0, are blocked (via
1664  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1665  *	(which means we are the only recovery thread potentially acting
1666  *	on this open stream).
1667  */
1668 
1669 nfs4_opinst_t *
1670 r4mkopenlist(mntinfo4_t *mi)
1671 {
1672 	nfs4_opinst_t *reopenlist, *rep;
1673 	rnode4_t *rp;
1674 	vnode_t *vp;
1675 	vfs_t *vfsp = mi->mi_vfsp;
1676 	int numosp;
1677 	nfs4_open_stream_t *osp;
1678 	int index;
1679 	open_delegation_type4 dtype;
1680 	int hold_vnode;
1681 
1682 	reopenlist = NULL;
1683 
1684 	for (index = 0; index < rtable4size; index++) {
1685 		rw_enter(&rtable4[index].r_lock, RW_READER);
1686 		for (rp = rtable4[index].r_hashf;
1687 		    rp != (rnode4_t *)(&rtable4[index]);
1688 		    rp = rp->r_hashf) {
1689 
1690 			vp = RTOV4(rp);
1691 			if (vp->v_vfsp != vfsp)
1692 				continue;
1693 			hold_vnode = 0;
1694 
1695 			mutex_enter(&rp->r_os_lock);
1696 
1697 			/* Count the number of valid open_streams of the file */
1698 			numosp = 0;
1699 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1700 			    osp = list_next(&rp->r_open_streams, osp)) {
1701 				mutex_enter(&osp->os_sync_lock);
1702 				if (osp->os_valid && !osp->os_pending_close)
1703 					numosp++;
1704 				mutex_exit(&osp->os_sync_lock);
1705 			}
1706 
1707 			/* Fill in the valid open streams per vp */
1708 			if (numosp > 0) {
1709 				int j;
1710 
1711 				hold_vnode = 1;
1712 
1713 				/*
1714 				 * Add a new open instance to the list
1715 				 */
1716 				rep = kmem_zalloc(sizeof (*reopenlist),
1717 				    KM_SLEEP);
1718 				rep->re_next = reopenlist;
1719 				reopenlist = rep;
1720 
1721 				rep->re_vp = vp;
1722 				rep->re_osp = kmem_zalloc(
1723 				    numosp * sizeof (*(rep->re_osp)),
1724 				    KM_SLEEP);
1725 				rep->re_numosp = numosp;
1726 
1727 				j = 0;
1728 				for (osp = list_head(&rp->r_open_streams);
1729 				    osp != NULL;
1730 				    osp = list_next(&rp->r_open_streams, osp)) {
1731 
1732 					mutex_enter(&osp->os_sync_lock);
1733 					if (osp->os_valid &&
1734 					    !osp->os_pending_close) {
1735 						osp->os_ref_count++;
1736 						rep->re_osp[j] = osp;
1737 						j++;
1738 					}
1739 					mutex_exit(&osp->os_sync_lock);
1740 				}
1741 				/*
1742 				 * Assuming valid osp(s) stays valid between
1743 				 * the time obtaining j and numosp.
1744 				 */
1745 				ASSERT(j == numosp);
1746 			}
1747 
1748 			mutex_exit(&rp->r_os_lock);
1749 			/* do this here to keep v_lock > r_os_lock */
1750 			if (hold_vnode)
1751 				VN_HOLD(vp);
1752 			mutex_enter(&rp->r_statev4_lock);
1753 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1754 				/*
1755 				 * If this rnode holds a delegation,
1756 				 * but if there are no valid open streams,
1757 				 * then just discard the delegation
1758 				 * without doing delegreturn.
1759 				 */
1760 				if (numosp > 0)
1761 					rp->r_deleg_needs_recovery =
1762 					    rp->r_deleg_type;
1763 			}
1764 			/* Save the delegation type for use outside the lock */
1765 			dtype = rp->r_deleg_type;
1766 			mutex_exit(&rp->r_statev4_lock);
1767 
1768 			/*
1769 			 * If we have a delegation then get rid of it.
1770 			 * We've set rp->r_deleg_needs_recovery so we have
1771 			 * enough information to recover.
1772 			 */
1773 			if (dtype != OPEN_DELEGATE_NONE) {
1774 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1775 			}
1776 		}
1777 		rw_exit(&rtable4[index].r_lock);
1778 	}
1779 	return (reopenlist);
1780 }
1781 
1782 /*
1783  * Given a filesystem id, check to see if any rnodes
1784  * within this fsid reside in the rnode cache, other
1785  * than one we know about.
1786  *
1787  * Return 1 if an rnode is found, 0 otherwise
1788  */
1789 int
1790 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1791 {
1792 	rnode4_t *rp;
1793 	vnode_t *vp;
1794 	vfs_t *vfsp = mi->mi_vfsp;
1795 	fattr4_fsid *fsid;
1796 	int index, found = 0;
1797 
1798 	for (index = 0; index < rtable4size; index++) {
1799 		rw_enter(&rtable4[index].r_lock, RW_READER);
1800 		for (rp = rtable4[index].r_hashf;
1801 		    rp != (rnode4_t *)(&rtable4[index]);
1802 		    rp = rp->r_hashf) {
1803 
1804 			vp = RTOV4(rp);
1805 			if (vp->v_vfsp != vfsp)
1806 				continue;
1807 
1808 			/*
1809 			 * XXX there might be a case where a
1810 			 * replicated fs may have the same fsid
1811 			 * across two different servers. This
1812 			 * check isn't good enough in that case
1813 			 */
1814 			fsid = &rp->r_srv_fsid;
1815 			if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1816 				found = 1;
1817 				break;
1818 			}
1819 		}
1820 		rw_exit(&rtable4[index].r_lock);
1821 
1822 		if (found)
1823 			break;
1824 	}
1825 	return (found);
1826 }
1827 
1828 /*
1829  * Release the list of open instance references.
1830  */
1831 
1832 void
1833 r4releopenlist(nfs4_opinst_t *reopenp)
1834 {
1835 	nfs4_opinst_t *rep, *next;
1836 	int i;
1837 
1838 	for (rep = reopenp; rep; rep = next) {
1839 		next = rep->re_next;
1840 
1841 		for (i = 0; i < rep->re_numosp; i++)
1842 			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1843 
1844 		VN_RELE(rep->re_vp);
1845 		kmem_free(rep->re_osp,
1846 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1847 
1848 		kmem_free(rep, sizeof (*rep));
1849 	}
1850 }
1851 
1852 int
1853 nfs4_rnode_init(void)
1854 {
1855 	ulong_t nrnode4_max;
1856 	int i;
1857 
1858 	/*
1859 	 * Compute the size of the rnode4 hash table
1860 	 */
1861 	if (nrnode <= 0)
1862 		nrnode = ncsize;
1863 	nrnode4_max =
1864 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1865 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1866 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1867 		    "setting nrnode to max value of %ld", nrnode4_max);
1868 		nrnode = nrnode4_max;
1869 	}
1870 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1871 	rtable4mask = rtable4size - 1;
1872 
1873 	/*
1874 	 * Allocate and initialize the hash buckets
1875 	 */
1876 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1877 	for (i = 0; i < rtable4size; i++) {
1878 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1879 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1880 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1881 	}
1882 
1883 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1884 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1885 
1886 	return (0);
1887 }
1888 
1889 int
1890 nfs4_rnode_fini(void)
1891 {
1892 	int i;
1893 
1894 	/*
1895 	 * Deallocate the rnode hash queues
1896 	 */
1897 	kmem_cache_destroy(rnode4_cache);
1898 
1899 	for (i = 0; i < rtable4size; i++)
1900 		rw_destroy(&rtable4[i].r_lock);
1901 
1902 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1903 
1904 	return (0);
1905 }
1906 
1907 /*
1908  * Return non-zero if the given filehandle refers to the root filehandle
1909  * for the given rnode.
1910  */
1911 
1912 static int
1913 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1914 {
1915 	int isroot;
1916 
1917 	isroot = 0;
1918 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1919 		isroot = 1;
1920 
1921 	return (isroot);
1922 }
1923 
1924 /*
1925  * The r4_stub_* routines assume that the rnode is newly activated, and
1926  * that the caller either holds the hash bucket r_lock for this rnode as
1927  * RW_WRITER, or holds r_statelock.
1928  */
1929 static void
1930 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1931 {
1932 	vnode_t *vp = RTOV4(rp);
1933 	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1934 
1935 	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1936 
1937 	rp->r_stub_type = type;
1938 
1939 	/*
1940 	 * Safely switch this vnode to the trigger vnodeops.
1941 	 *
1942 	 * Currently, we don't ever switch a trigger vnode back to using
1943 	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1944 	 * a new v4 object is not a trigger, and it will already have the
1945 	 * correct v4 vnodeops by default. So, no "else" case required here.
1946 	 */
1947 	if (type != NFS4_STUB_NONE)
1948 		vn_setops(vp, nfs4_trigger_vnodeops);
1949 }
1950 
1951 void
1952 r4_stub_mirrormount(rnode4_t *rp)
1953 {
1954 	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
1955 }
1956 
1957 void
1958 r4_stub_referral(rnode4_t *rp)
1959 {
1960 	DTRACE_PROBE1(nfs4clnt__func__referral__moved,
1961 	    vnode_t *, RTOV4(rp));
1962 	r4_stub_set(rp, NFS4_STUB_REFERRAL);
1963 }
1964 
1965 void
1966 r4_stub_none(rnode4_t *rp)
1967 {
1968 	r4_stub_set(rp, NFS4_STUB_NONE);
1969 }
1970 
#ifdef DEBUG

/*
 * Debug-only consistency check: scan the whole rnode table for another
 * rnode of the same vfs that has the same filehandle value as checkrp,
 * and panic if one is found.  Does nothing unless r4_check_for_dups is
 * set.
 *
 * The caller must hold the lock of checkrp's own hash chain; every other
 * bucket is locked (as reader) here while it is being scanned.
 */

static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
	rnode4_t *rp;
	nfs4_fhandle_t fh, fh2;
	int index;

	if (!r4_check_for_dups)
		return;

	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

	sfh4_copyval(checkrp->r_fh, &fh);

	for (index = 0; index < rtable4size; index++) {
		r4hashq_t *bucket = &rtable4[index];
		/* checkrp's own bucket is already locked by the caller */
		int lock_here = (bucket != checkrp->r_hashq);

		if (lock_here)
			rw_enter(&bucket->r_lock, RW_READER);

		for (rp = bucket->r_hashf; rp != (rnode4_t *)bucket;
		    rp = rp->r_hashf) {

			/* Skip checkrp itself and rnodes of other mounts. */
			if (rp == checkrp || RTOV4(rp)->v_vfsp != vfsp)
				continue;

			sfh4_copyval(rp->r_fh, &fh2);
			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
				cmn_err(CE_PANIC, "rnodes with same fs, fh "
				    "(%p, %p)", (void *)checkrp, (void *)rp);
			}
		}

		if (lock_here)
			rw_exit(&bucket->r_lock);
	}
}

#endif /* DEBUG */
2022