xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision 4151f947)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/proc.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/socket.h>
43 #include <sys/uio.h>
44 #include <sys/tiuser.h>
45 #include <sys/swap.h>
46 #include <sys/errno.h>
47 #include <sys/debug.h>
48 #include <sys/kmem.h>
49 #include <sys/kstat.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vtrace.h>
52 #include <sys/session.h>
53 #include <sys/dnlc.h>
54 #include <sys/bitmap.h>
55 #include <sys/acl.h>
56 #include <sys/ddi.h>
57 #include <sys/pathname.h>
58 #include <sys/flock.h>
59 #include <sys/dirent.h>
60 #include <sys/flock.h>
61 #include <sys/callb.h>
62 
63 #include <rpc/types.h>
64 #include <rpc/xdr.h>
65 #include <rpc/auth.h>
66 #include <rpc/rpcsec_gss.h>
67 #include <rpc/clnt.h>
68 
69 #include <nfs/nfs.h>
70 #include <nfs/nfs_clnt.h>
71 #include <nfs/nfs_acl.h>
72 
73 #include <nfs/nfs4.h>
74 #include <nfs/rnode4.h>
75 #include <nfs/nfs4_clnt.h>
76 
77 /*
78  * The hash queues for the access to active and cached rnodes
79  * are organized as doubly linked lists.  A reader/writer lock
80  * for each hash bucket is used to control access and to synchronize
81  * lookups, additions, and deletions from the hash queue.
82  *
83  * The rnode freelist is organized as a doubly linked list with
84  * a head pointer.  Additions and deletions are synchronized via
85  * a single mutex.
86  *
87  * In order to add an rnode to the free list, it must be hashed into
88  * a hash queue and the exclusive lock to the hash queue be held.
89  * If an rnode is not hashed into a hash queue, then it is destroyed
90  * because it represents no valuable information that can be reused
91  * about the file.  The exclusive lock to the hash queue must be
92  * held in order to prevent a lookup in the hash queue from finding
93  * the rnode and using it and assuming that the rnode is not on the
94  * freelist.  The lookup in the hash queue will have the hash queue
95  * locked, either exclusive or shared.
96  *
97  * The vnode reference count for each rnode is not allowed to drop
98  * below 1.  This prevents external entities, such as the VM
99  * subsystem, from acquiring references to vnodes already on the
100  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
102  * rnode is looked up in the hash queues, then either the rnode
103  * is removed from the freelist and that reference is transferred to
104  * the new reference or the vnode reference count must be incremented
105  * accordingly.  The mutex for the freelist must be held in order to
106  * accurately test to see if the rnode is on the freelist or not.
107  * The hash queue lock might be held shared and it is possible that
108  * two different threads may race to remove the rnode from the
109  * freelist.  This race can be resolved by holding the mutex for the
110  * freelist.  Please note that the mutex for the freelist does not
111  * need to be held if the rnode is not on the freelist.  It can not be
112  * placed on the freelist due to the requirement that the thread
113  * putting the rnode on the freelist must hold the exclusive lock
114  * to the hash queue and the thread doing the lookup in the hash
115  * queue is holding either a shared or exclusive lock to the hash
116  * queue.
117  *
118  * The lock ordering is:
119  *
120  *	hash bucket lock -> vnode lock
121  *	hash bucket lock -> freelist lock -> r_statelock
122  */
/* Array of rnode4 hash buckets; indexed by the value of rtable4hash(). */
r4hashq_t *rtable4;

/* Protects rp4freelist and the r_freef/r_freeb links of rnodes on it. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/* Number of rnodes allocated from rnode4_cache (bumped in make_rnode4()). */
static long rnode4_new = 0;
/* Number of hash buckets in rtable4. */
int rtable4size;
/* Mask applied by rtable4hash() to select a bucket. */
static int rtable4mask;
/* kmem cache that rnode4_t structures are allocated from. */
static struct kmem_cache *rnode4_cache;
/*
 * NOTE(review): presumably the desired average hash chain length used
 * when sizing rtable4; it is not referenced in this part of the file,
 * so confirm against the initialization code.
 */
static int rnode4_hashlen = 4;

/*
 * Forward declarations for routines defined later in this file.
 */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

/*
 * Debug-only tunables and helpers; compiled in only for DEBUG kernels.
 */
#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif
159 
160 /*
161  * If the vnode has pages, run the list and check for any that are
162  * still dangling.  We call this routine before putting an rnode on
163  * the free list.
164  */
165 static int
166 nfs4_dross_pages(vnode_t *vp)
167 {
168 	page_t *pp;
169 	kmutex_t *vphm;
170 
171 	vphm = page_vnode_mutex(vp);
172 	mutex_enter(vphm);
173 	if ((pp = vp->v_pages) != NULL) {
174 		do {
175 			if (pp->p_fsdata != C_NOCOMMIT) {
176 				mutex_exit(vphm);
177 				return (1);
178 			}
179 		} while ((pp = pp->p_vpnext) != vp->v_pages);
180 	}
181 	mutex_exit(vphm);
182 
183 	return (0);
184 }
185 
186 /*
187  * Flush any pages left on this rnode.
188  */
189 static void
190 r4flushpages(rnode4_t *rp, cred_t *cr)
191 {
192 	vnode_t *vp;
193 	int error;
194 
195 	/*
196 	 * Before freeing anything, wait until all asynchronous
197 	 * activity is done on this rnode.  This will allow all
198 	 * asynchronous read ahead and write behind i/o's to
199 	 * finish.
200 	 */
201 	mutex_enter(&rp->r_statelock);
202 	while (rp->r_count > 0)
203 		cv_wait(&rp->r_cv, &rp->r_statelock);
204 	mutex_exit(&rp->r_statelock);
205 
206 	/*
207 	 * Flush and invalidate all pages associated with the vnode.
208 	 */
209 	vp = RTOV4(rp);
210 	if (nfs4_has_pages(vp)) {
211 		ASSERT(vp->v_type != VCHR);
212 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
213 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
214 			if (error && (error == ENOSPC || error == EDQUOT)) {
215 				mutex_enter(&rp->r_statelock);
216 				if (!rp->r_error)
217 					rp->r_error = error;
218 				mutex_exit(&rp->r_statelock);
219 			}
220 		}
221 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
222 	}
223 }
224 
225 /*
226  * Free the resources associated with an rnode.
227  */
228 static void
229 r4inactive(rnode4_t *rp, cred_t *cr)
230 {
231 	vnode_t *vp;
232 	char *contents;
233 	int size;
234 	vsecattr_t *vsp;
235 	vnode_t *xattr;
236 
237 	r4flushpages(rp, cr);
238 
239 	vp = RTOV4(rp);
240 
241 	/*
242 	 * Free any held caches which may be
243 	 * associated with this rnode.
244 	 */
245 	mutex_enter(&rp->r_statelock);
246 	contents = rp->r_symlink.contents;
247 	size = rp->r_symlink.size;
248 	rp->r_symlink.contents = NULL;
249 	vsp = rp->r_secattr;
250 	rp->r_secattr = NULL;
251 	xattr = rp->r_xattr_dir;
252 	rp->r_xattr_dir = NULL;
253 	mutex_exit(&rp->r_statelock);
254 
255 	/*
256 	 * Free the access cache entries.
257 	 */
258 	(void) nfs4_access_purge_rp(rp);
259 
260 	/*
261 	 * Free the readdir cache entries.
262 	 */
263 	nfs4_purge_rddir_cache(vp);
264 
265 	/*
266 	 * Free the symbolic link cache.
267 	 */
268 	if (contents != NULL) {
269 
270 		kmem_free((void *)contents, size);
271 	}
272 
273 	/*
274 	 * Free any cached ACL.
275 	 */
276 	if (vsp != NULL)
277 		nfs4_acl_free_cache(vsp);
278 
279 	/*
280 	 * Release the cached xattr_dir
281 	 */
282 	if (xattr != NULL)
283 		VN_RELE(xattr);
284 }
285 
286 /*
287  * We have seen a case that the fh passed in is for "." which
288  * should be a VROOT node, however, the fh is different from the
289  * root fh stored in the mntinfo4_t. The invalid fh might be
290  * from a misbehaved server and will panic the client system at
291  * a later time. To avoid the panic, we drop the bad fh, use
292  * the root fh from mntinfo4_t, and print an error message
293  * for attention.
294  */
295 nfs4_sharedfh_t *
296 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
297     int *wasbad)
298 {
299 	char *s;
300 
301 	*wasbad = 0;
302 	s = fn_name(nm);
303 	ASSERT(strcmp(s, "..") != 0);
304 
305 	if ((s[0] == '.' && s[1] == '\0') && fh &&
306 	    !SFH4_SAME(mi->mi_rootfh, fh)) {
307 #ifdef DEBUG
308 		nfs4_fhandle_t fhandle;
309 
310 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
311 		    "Server %s returns a different "
312 		    "root filehandle for the path %s:",
313 		    mi->mi_curr_serv->sv_hostname,
314 		    mi->mi_curr_serv->sv_path);
315 
316 		/* print the bad fh */
317 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
318 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
319 		    fhandle.fh_len);
320 		nfs4_printfhandle(&fhandle);
321 
322 		/* print mi_rootfh */
323 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
324 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
325 		    fhandle.fh_len);
326 		nfs4_printfhandle(&fhandle);
327 #endif
328 		/* use mi_rootfh instead; fh will be rele by the caller */
329 		fh = mi->mi_rootfh;
330 		*wasbad = 1;
331 	}
332 
333 	kmem_free(s, MAXNAMELEN);
334 	return (fh);
335 }
336 
/*
 * Finish attribute setup for a vnode returned by make_rnode4() and
 * release the hash bucket lock rtable4[index].r_lock.  The callers in
 * this file assert that the bucket lock is held on entry; every path
 * through this routine drops it exactly once.
 *
 *	vp	- vnode that was found or created
 *	garp	- GETATTR results to cache; may be NULL
 *	newnode	- nonzero if vp was just created by make_rnode4()
 *	t	- timestamp handed to the attribute cache routines
 *	cr	- credentials handed to nfs4_attr_cache()
 *	index	- index of the hash bucket whose lock is to be released
 *
 * For an existing node the lock is dropped first and the attributes
 * are then cached with nfs4_attr_cache().  For a new node the vnode
 * type, rdev and stub type are filled in, and the attributes are either
 * cached or purged, before the lock is dropped.
 */
void
r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
    hrtime_t t, cred_t *cr, int index)
{
	int is_stub;
	vattr_t *attr;
	/*
	 * Don't add to attrcache if time overflow, but
	 * no need to check because either attr is null or the time
	 * values in it were processed by nfs4_time_ntov(), which checks
	 * for time overflows.
	 */
	attr = garp ? &garp->n4g_va : NULL;

	if (attr) {
		if (!newnode) {
			/* Existing rnode: drop the bucket lock before caching. */
			rw_exit(&rtable4[index].r_lock);
#ifdef DEBUG
			if (vp->v_type != attr->va_type &&
			    vp->v_type != VNON && attr->va_type != VNON) {
				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
				    "makenfs4node: type (%d) doesn't "
				    "match type of found node at %p (%d)",
				    attr->va_type, (void *)vp, vp->v_type);
			}
#endif
			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
		} else {
			rnode4_t *rp = VTOR4(vp);

			vp->v_type = attr->va_type;
			vp->v_rdev = attr->va_rdev;

			/*
			 * Turn this object into a "stub" object if we
			 * crossed an underlying server fs boundary.
			 * To make this check, during mount we save the
			 * fsid of the server object being mounted.
			 * Here we compare this object's server fsid
			 * with the fsid we saved at mount.  If they
			 * are different, we crossed server fs boundary.
			 *
			 * The stub type is set (or not) at rnode
			 * creation time and it never changes for life
			 * of the rnode.
			 *
			 * The stub type is also set during RO failover,
			 * nfs4_remap_file().
			 *
			 * This stub will be for a mirror-mount.
			 *
			 * We don't bother with taking r_statelock to
			 * set the stub type because this is a new rnode
			 * and we're holding the hash bucket r_lock RW_WRITER.
			 * No other thread could have obtained access
			 * to this rnode.
			 */
			is_stub = 0;
			if (garp->n4g_fsid_valid) {
				fattr4_fsid ga_fsid = garp->n4g_fsid;
				servinfo4_t *svp = rp->r_server;

				rp->r_srv_fsid = ga_fsid;

				/* sv_fsid is read under the servinfo4 lock. */
				(void) nfs_rw_enter_sig(&svp->sv_lock,
				    RW_READER, 0);
				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
					is_stub = 1;
				nfs_rw_exit(&svp->sv_lock);
			}

			if (is_stub)
				r4_stub_mirrormount(rp);
			else
				r4_stub_none(rp);

			/* Can not cache partial attr */
			if (attr->va_mask == AT_ALL)
				nfs4_attrcache_noinval(vp, garp, t);
			else
				PURGE_ATTRCACHE4(vp);

			/* New rnode is fully set up; let others find it. */
			rw_exit(&rtable4[index].r_lock);
		}
	} else {
		/* No attributes supplied: a new node starts with none cached. */
		if (newnode) {
			PURGE_ATTRCACHE4(vp);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}
428 
429 /*
430  * Find or create an rnode based primarily on filehandle.  To be
431  * used when dvp (vnode for parent directory) is not available;
432  * otherwise, makenfs4node() should be used.
433  *
434  * The nfs4_fname_t argument *npp is consumed and nulled out.
435  */
436 
437 vnode_t *
438 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
439     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
440     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
441 {
442 	vfs_t *vfsp = mi->mi_vfsp;
443 	int newnode = 0;
444 	vnode_t *vp;
445 	rnode4_t *rp;
446 	svnode_t *svp;
447 	nfs4_fname_t *name, *svpname;
448 	int index;
449 
450 	ASSERT(npp && *npp);
451 	name = *npp;
452 	*npp = NULL;
453 
454 	index = rtable4hash(sfh);
455 	rw_enter(&rtable4[index].r_lock, RW_READER);
456 
457 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
458 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
459 
460 	svp = VTOSV(vp);
461 	rp = VTOR4(vp);
462 	if (newnode) {
463 		svp->sv_forw = svp->sv_back = svp;
464 		svp->sv_name = name;
465 		if (psfh != NULL)
466 			sfh4_hold(psfh);
467 		svp->sv_dfh = psfh;
468 	} else {
469 		/*
470 		 * It is possible that due to a server
471 		 * side rename fnames have changed.
472 		 * update the fname here.
473 		 */
474 		mutex_enter(&rp->r_svlock);
475 		svpname = svp->sv_name;
476 		if (svp->sv_name != name) {
477 			svp->sv_name = name;
478 			mutex_exit(&rp->r_svlock);
479 			fn_rele(&svpname);
480 		} else {
481 			mutex_exit(&rp->r_svlock);
482 			fn_rele(&name);
483 		}
484 	}
485 
486 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
487 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
488 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
489 
490 	return (vp);
491 }
492 
493 /*
494  * Find or create a vnode for the given filehandle, filesystem, parent, and
495  * name.  The reference to nm is consumed, so the caller must first do an
496  * fn_hold() if it wants to continue using nm after this call.
497  */
498 vnode_t *
499 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
500     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
501 {
502 	vnode_t *vp;
503 	int newnode;
504 	int index;
505 	mntinfo4_t *mi = VFTOMI4(vfsp);
506 	int had_badfh = 0;
507 	rnode4_t *rp;
508 
509 	ASSERT(dvp != NULL);
510 
511 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
512 
513 	index = rtable4hash(fh);
514 	rw_enter(&rtable4[index].r_lock, RW_READER);
515 
516 	/*
517 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
518 	 */
519 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
520 	    nfs4_putapage, &newnode, cr);
521 
522 	rp = VTOR4(vp);
523 	sv_activate(&vp, dvp, &nm, newnode);
524 	if (dvp->v_flag & V_XATTRDIR) {
525 		mutex_enter(&rp->r_statelock);
526 		rp->r_flags |= R4ISXATTR;
527 		mutex_exit(&rp->r_statelock);
528 	}
529 
530 	/* if getting a bad file handle, do not cache the attributes. */
531 	if (had_badfh) {
532 		rw_exit(&rtable4[index].r_lock);
533 		return (vp);
534 	}
535 
536 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
537 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
538 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
539 
540 	return (vp);
541 }
542 
543 /*
544  * Hash on address of filehandle object.
545  * XXX totally untuned.
546  */
547 
548 int
549 rtable4hash(nfs4_sharedfh_t *fh)
550 {
551 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
552 }
553 
/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 *	fh	 - filehandle to look up; a hold is taken on it for a
 *		   new rnode
 *	rhtp	 - hash bucket for fh; its r_lock must be held as reader
 *		   on entry
 *	vfsp	 - filesystem the rnode belongs to; held for a new rnode
 *	vops	 - vnode operations installed on a new vnode
 *	putapage - stored in r_putapage of a new rnode
 *	newnode	 - out: 0 if an existing rnode was found, 1 if created
 *	cr	 - credentials used when an rnode has to be inactivated
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 * On return the bucket lock is held as reader when an existing rnode
 * is returned and as writer when a new rnode was created and hashed.
 * The lock is dropped and retaken while a new rnode is being built.
 */

static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
	rnode4_t *rp;
	rnode4_t *trp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI4(vfsp);

start:
	/* Fast path: the rnode is already hashed; r4find() took the hold. */
	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(rp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rp4freelist_lock);

	/*
	 * Recycle the rnode at the head of the freelist once the number
	 * of allocated rnodes has reached nrnode; otherwise allocate.
	 */
	if (rp4freelist != NULL && rnode4_new >= nrnode) {
		rp = rp4freelist;
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		vp = RTOV4(rp);

		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Someone else picked up a reference; give up
			 * ours and start over with the bucket lock held.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		r4inactive(rp, cr);

		/* Recheck the count now that r4inactive() has run. */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		uninit_rnode4(rp);

		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rp4freelist_lock);

		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_add_long((ulong_t *)&rnode4_new, 1);
#ifdef DEBUG
		clstat4_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/* Common initialization for both a recycled and a fresh rnode. */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
	rp->created_v4 = 0;
	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
	    offsetof(nfs4_open_stream_t, os_node));
	/* Empty lock owner list: the head points at itself. */
	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_flags = R4READDIRWATTR;
	rp->r_fh = fh;
	rp->r_hashq = rhtp;
	sfh4_hold(rp->r_fh);
	rp->r_server = mi->mi_curr_serv;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

	rddir4_cache_create(rp);
	rp->r_putapage = putapage;
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	VFS_HOLD(vfsp);
	vp->v_type = VNON;
	if (isrootfh(fh, rp))
		vp->v_flag = VROOT;
	vn_exists(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: return the other thread's rnode and
		 * dispose of ours.  The lock is dropped around
		 * rp4_addfree() and retaken as reader for the caller.
		 */
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Won the race: hash the new rnode; the writer lock stays held. */
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}
702 
703 static void
704 uninit_rnode4(rnode4_t *rp)
705 {
706 	vnode_t *vp = RTOV4(rp);
707 
708 	ASSERT(rp != NULL);
709 	ASSERT(vp != NULL);
710 	ASSERT(vp->v_count == 1);
711 	ASSERT(rp->r_count == 0);
712 	ASSERT(rp->r_mapcnt == 0);
713 	if (rp->r_flags & R4LODANGLERS) {
714 		nfs4_flush_lock_owners(rp);
715 	}
716 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
717 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
718 	ASSERT(!(rp->r_flags & R4HASHED));
719 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
720 	nfs4_clear_open_streams(rp);
721 	list_destroy(&rp->r_open_streams);
722 
723 	/*
724 	 * Destroy the rddir cache first since we need to grab the r_statelock.
725 	 */
726 	mutex_enter(&rp->r_statelock);
727 	rddir4_cache_destroy(rp);
728 	mutex_exit(&rp->r_statelock);
729 	sv_uninit(&rp->r_svnode);
730 	sfh4_rele(&rp->r_fh);
731 	nfs_rw_destroy(&rp->r_rwlock);
732 	nfs_rw_destroy(&rp->r_lkserlock);
733 	mutex_destroy(&rp->r_statelock);
734 	mutex_destroy(&rp->r_statev4_lock);
735 	mutex_destroy(&rp->r_os_lock);
736 	cv_destroy(&rp->r_cv);
737 	cv_destroy(&rp->r_commit.c_cv);
738 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
739 	if (rp->r_flags & R4DELMAPLIST)
740 		list_destroy(&rp->r_indelmap);
741 }
742 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 *	rp	- rnode whose vnode reference is being given up; it must
 *		  not already be on the freelist
 *	cr	- credentials used for flushing pages and closing files
 *
 * The caller's vnode reference is consumed.  Depending on the state of
 * the rnode this routine either just drops the reference (another
 * reference exists), destroys the rnode, or links it onto the freelist
 * with its last reference intact.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/* Another reference exists; just drop ours. */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each step below that has to drop the hash queue lock to do
	 * its work jumps back here so that all checks are redone.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are no more references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/*
		 * Outside the mount's zone the streams are only
		 * cleared locally; otherwise the files are closed.
		 */
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit of all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * The rnode is linked in just before the current head of the
	 * circular list.  If there is no cached data or metadata for
	 * this file, then the rnode is also made the new head, putting
	 * it on the front of the freelist so that it will be reused
	 * before other rnodes which may have cached data or metadata
	 * associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	/* Release the xattr directory only after all locks are dropped. */
	if (xattr)
		VN_RELE(xattr);
}
929 
930 /*
931  * Remove an rnode from the free list.
932  *
933  * The caller must be holding rp4freelist_lock and the rnode
934  * must be on the freelist.
935  */
936 static void
937 rp4_rmfree(rnode4_t *rp)
938 {
939 
940 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
941 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
942 
943 	if (rp == rp4freelist) {
944 		rp4freelist = rp->r_freef;
945 		if (rp == rp4freelist)
946 			rp4freelist = NULL;
947 	}
948 	rp->r_freeb->r_freef = rp->r_freef;
949 	rp->r_freef->r_freeb = rp->r_freeb;
950 
951 	rp->r_freef = rp->r_freeb = NULL;
952 }
953 
954 /*
955  * Put a rnode in the hash table.
956  *
957  * The caller must be holding the exclusive hash queue lock
958  */
959 void
960 rp4_addhash(rnode4_t *rp)
961 {
962 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
963 	ASSERT(!(rp->r_flags & R4HASHED));
964 
965 #ifdef DEBUG
966 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
967 #endif
968 
969 	rp->r_hashf = rp->r_hashq->r_hashf;
970 	rp->r_hashq->r_hashf = rp;
971 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
972 	rp->r_hashf->r_hashb = rp;
973 
974 	mutex_enter(&rp->r_statelock);
975 	rp->r_flags |= R4HASHED;
976 	mutex_exit(&rp->r_statelock);
977 }
978 
979 /*
980  * Remove a rnode from the hash table.
981  *
982  * The caller must be holding the hash queue lock.
983  */
984 void
985 rp4_rmhash_locked(rnode4_t *rp)
986 {
987 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
988 	ASSERT(rp->r_flags & R4HASHED);
989 
990 	rp->r_hashb->r_hashf = rp->r_hashf;
991 	rp->r_hashf->r_hashb = rp->r_hashb;
992 
993 	mutex_enter(&rp->r_statelock);
994 	rp->r_flags &= ~R4HASHED;
995 	mutex_exit(&rp->r_statelock);
996 }
997 
998 /*
999  * Remove a rnode from the hash table.
1000  *
1001  * The caller must not be holding the hash queue lock.
1002  */
1003 void
1004 rp4_rmhash(rnode4_t *rp)
1005 {
1006 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1007 	rp4_rmhash_locked(rp);
1008 	rw_exit(&rp->r_hashq->r_lock);
1009 }
1010 
1011 /*
1012  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1013  * Returns NULL if no match.  If an rnode is returned, the reference count
1014  * on the master vnode is incremented.
1015  *
1016  * The caller must be holding the hash queue lock, either shared or exclusive.
1017  */
1018 rnode4_t *
1019 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1020 {
1021 	rnode4_t *rp;
1022 	vnode_t *vp;
1023 
1024 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1025 
1026 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1027 		vp = RTOV4(rp);
1028 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1029 
1030 			mutex_enter(&rp->r_statelock);
1031 			if (rp->r_flags & R4RECOVERR) {
1032 				mutex_exit(&rp->r_statelock);
1033 				continue;
1034 			}
1035 			mutex_exit(&rp->r_statelock);
1036 #ifdef DEBUG
1037 			r4_dup_check(rp, vfsp);
1038 #endif
1039 			if (rp->r_freef != NULL) {
1040 				mutex_enter(&rp4freelist_lock);
1041 				/*
1042 				 * If the rnode is on the freelist,
1043 				 * then remove it and use that reference
1044 				 * as the new reference.  Otherwise,
1045 				 * need to increment the reference count.
1046 				 */
1047 				if (rp->r_freef != NULL) {
1048 					rp4_rmfree(rp);
1049 					mutex_exit(&rp4freelist_lock);
1050 				} else {
1051 					mutex_exit(&rp4freelist_lock);
1052 					VN_HOLD(vp);
1053 				}
1054 			} else
1055 				VN_HOLD(vp);
1056 
1057 			/*
1058 			 * if root vnode, set v_flag to indicate that
1059 			 */
1060 			if (isrootfh(fh, rp)) {
1061 				if (!(vp->v_flag & VROOT)) {
1062 					mutex_enter(&vp->v_lock);
1063 					vp->v_flag |= VROOT;
1064 					mutex_exit(&vp->v_lock);
1065 				}
1066 			}
1067 			return (rp);
1068 		}
1069 	}
1070 	return (NULL);
1071 }
1072 
1073 /*
1074  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1075  * that assumes the caller hasn't already got the lock
1076  * on the hash bucket.
1077  */
1078 rnode4_t *
1079 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1080 {
1081 	rnode4_t *rp;
1082 	int index;
1083 
1084 	index = rtable4hash(fh);
1085 	rw_enter(&rtable4[index].r_lock, RW_READER);
1086 	rp = r4find(&rtable4[index], fh, vfsp);
1087 	rw_exit(&rtable4[index].r_lock);
1088 
1089 	return (rp);
1090 }
1091 
1092 /*
1093  * Return 1 if there is a active vnode belonging to this vfs in the
1094  * rtable4 cache.
1095  *
1096  * Several of these checks are done without holding the usual
1097  * locks.  This is safe because destroy_rtable(), rp_addfree(),
1098  * etc. will redo the necessary checks before actually destroying
1099  * any rnodes.
1100  */
1101 int
1102 check_rtable4(struct vfs *vfsp)
1103 {
1104 	rnode4_t *rp;
1105 	vnode_t *vp;
1106 	char *busy = NULL;
1107 	int index;
1108 
1109 	for (index = 0; index < rtable4size; index++) {
1110 		rw_enter(&rtable4[index].r_lock, RW_READER);
1111 
1112 		for (rp = rtable4[index].r_hashf;
1113 		    rp != (rnode4_t *)(&rtable4[index]);
1114 		    rp = rp->r_hashf) {
1115 
1116 			vp = RTOV4(rp);
1117 			if (vp->v_vfsp == vfsp) {
1118 				if (rp->r_freef == NULL) {
1119 					busy = "not on free list";
1120 				} else if (nfs4_has_pages(vp) &&
1121 				    (rp->r_flags & R4DIRTY)) {
1122 					busy = "dirty pages";
1123 				} else if (rp->r_count > 0) {
1124 					busy = "r_count > 0";
1125 				}
1126 
1127 				if (busy != NULL) {
1128 #ifdef DEBUG
1129 					char *path;
1130 
1131 					path = fn_path(rp->r_svnode.sv_name);
1132 					NFS4_DEBUG(nfs4_rnode_debug,
1133 					    (CE_NOTE, "check_rtable4: " "%s %s",
1134 					    path, busy));
1135 					kmem_free(path, strlen(path)+1);
1136 #endif
1137 					rw_exit(&rtable4[index].r_lock);
1138 					return (1);
1139 				}
1140 			}
1141 		}
1142 		rw_exit(&rtable4[index].r_lock);
1143 	}
1144 	return (0);
1145 }
1146 
1147 /*
1148  * Destroy inactive vnodes from the hash queues which
1149  * belong to this vfs. All of the vnodes should be inactive.
1150  * It is essential that we destroy all rnodes in case of
1151  * forced unmount as well as in normal unmount case.
1152  */
1153 
1154 void
1155 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1156 {
1157 	int index;
1158 	vnode_t *vp;
1159 	rnode4_t *rp, *r_hashf, *rlist;
1160 
1161 	rlist = NULL;
1162 
1163 	for (index = 0; index < rtable4size; index++) {
1164 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1165 		for (rp = rtable4[index].r_hashf;
1166 		    rp != (rnode4_t *)(&rtable4[index]);
1167 		    rp = r_hashf) {
1168 			/* save the hash pointer before destroying */
1169 			r_hashf = rp->r_hashf;
1170 
1171 			vp = RTOV4(rp);
1172 			if (vp->v_vfsp == vfsp) {
1173 				mutex_enter(&rp4freelist_lock);
1174 				if (rp->r_freef != NULL) {
1175 					rp4_rmfree(rp);
1176 					mutex_exit(&rp4freelist_lock);
1177 					rp4_rmhash_locked(rp);
1178 					rp->r_hashf = rlist;
1179 					rlist = rp;
1180 				} else
1181 					mutex_exit(&rp4freelist_lock);
1182 			}
1183 		}
1184 		rw_exit(&rtable4[index].r_lock);
1185 	}
1186 
1187 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1188 		r_hashf = rp->r_hashf;
1189 		/*
1190 		 * This call to rp4_addfree will end up destroying the
1191 		 * rnode, but in a safe way with the appropriate set
1192 		 * of checks done.
1193 		 */
1194 		rp4_addfree(rp, cr);
1195 	}
1196 }
1197 
1198 /*
1199  * This routine destroys all the resources of an rnode
1200  * and finally the rnode itself.
1201  */
1202 static void
1203 destroy_rnode4(rnode4_t *rp)
1204 {
1205 	vnode_t *vp;
1206 	vfs_t *vfsp;
1207 
1208 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1209 
1210 	vp = RTOV4(rp);
1211 	vfsp = vp->v_vfsp;
1212 
1213 	uninit_rnode4(rp);
1214 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1215 #ifdef DEBUG
1216 	clstat4_debug.nrnode.value.ui64--;
1217 #endif
1218 	kmem_cache_free(rnode4_cache, rp);
1219 	vn_invalid(vp);
1220 	vn_free(vp);
1221 	VFS_RELE(vfsp);
1222 }
1223 
1224 /*
1225  * Invalidate the attributes on all rnodes forcing the next getattr
1226  * to go over the wire.  Used to flush stale uid and gid mappings.
1227  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1228  */
1229 void
1230 nfs4_rnode_invalidate(struct vfs *vfsp)
1231 {
1232 	int index;
1233 	rnode4_t *rp;
1234 	vnode_t *vp;
1235 
1236 	/*
1237 	 * Walk the hash queues looking for rnodes.
1238 	 */
1239 	for (index = 0; index < rtable4size; index++) {
1240 		rw_enter(&rtable4[index].r_lock, RW_READER);
1241 		for (rp = rtable4[index].r_hashf;
1242 		    rp != (rnode4_t *)(&rtable4[index]);
1243 		    rp = rp->r_hashf) {
1244 			vp = RTOV4(rp);
1245 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1246 				continue;
1247 
1248 			if (!mutex_tryenter(&rp->r_statelock))
1249 				continue;
1250 
1251 			/*
1252 			 * Expire the attributes by resetting the change
1253 			 * and attr timeout.
1254 			 */
1255 			rp->r_change = 0;
1256 			PURGE_ATTRCACHE4_LOCKED(rp);
1257 			mutex_exit(&rp->r_statelock);
1258 		}
1259 		rw_exit(&rtable4[index].r_lock);
1260 	}
1261 }
1262 
1263 /*
1264  * Flush all vnodes in this (or every) vfs.
1265  * Used by nfs_sync and by nfs_unmount.
1266  */
1267 void
1268 r4flush(struct vfs *vfsp, cred_t *cr)
1269 {
1270 	int index;
1271 	rnode4_t *rp;
1272 	vnode_t *vp, **vplist;
1273 	long num, cnt;
1274 
1275 	/*
1276 	 * Check to see whether there is anything to do.
1277 	 */
1278 	num = rnode4_new;
1279 	if (num == 0)
1280 		return;
1281 
1282 	/*
1283 	 * Allocate a slot for all currently active rnodes on the
1284 	 * supposition that they all may need flushing.
1285 	 */
1286 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1287 	cnt = 0;
1288 
1289 	/*
1290 	 * Walk the hash queues looking for rnodes with page
1291 	 * lists associated with them.  Make a list of these
1292 	 * files.
1293 	 */
1294 	for (index = 0; index < rtable4size; index++) {
1295 		rw_enter(&rtable4[index].r_lock, RW_READER);
1296 		for (rp = rtable4[index].r_hashf;
1297 		    rp != (rnode4_t *)(&rtable4[index]);
1298 		    rp = rp->r_hashf) {
1299 			vp = RTOV4(rp);
1300 			/*
1301 			 * Don't bother sync'ing a vp if it
1302 			 * is part of virtual swap device or
1303 			 * if VFS is read-only
1304 			 */
1305 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1306 				continue;
1307 			/*
1308 			 * If flushing all mounted file systems or
1309 			 * the vnode belongs to this vfs, has pages
1310 			 * and is marked as either dirty or mmap'd,
1311 			 * hold and add this vnode to the list of
1312 			 * vnodes to flush.
1313 			 */
1314 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1315 			    nfs4_has_pages(vp) &&
1316 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1317 				VN_HOLD(vp);
1318 				vplist[cnt++] = vp;
1319 				if (cnt == num) {
1320 					rw_exit(&rtable4[index].r_lock);
1321 					goto toomany;
1322 				}
1323 			}
1324 		}
1325 		rw_exit(&rtable4[index].r_lock);
1326 	}
1327 toomany:
1328 
1329 	/*
1330 	 * Flush and release all of the files on the list.
1331 	 */
1332 	while (cnt-- > 0) {
1333 		vp = vplist[cnt];
1334 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1335 		VN_RELE(vp);
1336 	}
1337 
1338 	/*
1339 	 * Free the space allocated to hold the list.
1340 	 */
1341 	kmem_free(vplist, num * sizeof (*vplist));
1342 }
1343 
1344 int
1345 nfs4_free_data_reclaim(rnode4_t *rp)
1346 {
1347 	char *contents;
1348 	vnode_t *xattr;
1349 	int size;
1350 	vsecattr_t *vsp;
1351 	int freed;
1352 	bool_t rdc = FALSE;
1353 
1354 	/*
1355 	 * Free any held caches which may
1356 	 * be associated with this rnode.
1357 	 */
1358 	mutex_enter(&rp->r_statelock);
1359 	if (rp->r_dir != NULL)
1360 		rdc = TRUE;
1361 	contents = rp->r_symlink.contents;
1362 	size = rp->r_symlink.size;
1363 	rp->r_symlink.contents = NULL;
1364 	vsp = rp->r_secattr;
1365 	rp->r_secattr = NULL;
1366 	xattr = rp->r_xattr_dir;
1367 	rp->r_xattr_dir = NULL;
1368 	mutex_exit(&rp->r_statelock);
1369 
1370 	/*
1371 	 * Free the access cache entries.
1372 	 */
1373 	freed = nfs4_access_purge_rp(rp);
1374 
1375 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1376 		return (freed);
1377 
1378 	/*
1379 	 * Free the readdir cache entries, incompletely if we can't block.
1380 	 */
1381 	nfs4_purge_rddir_cache(RTOV4(rp));
1382 
1383 	/*
1384 	 * Free the symbolic link cache.
1385 	 */
1386 	if (contents != NULL) {
1387 
1388 		kmem_free((void *)contents, size);
1389 	}
1390 
1391 	/*
1392 	 * Free any cached ACL.
1393 	 */
1394 	if (vsp != NULL)
1395 		nfs4_acl_free_cache(vsp);
1396 
1397 	/*
1398 	 * Release the xattr directory vnode
1399 	 */
1400 	if (xattr != NULL)
1401 		VN_RELE(xattr);
1402 
1403 	return (1);
1404 }
1405 
1406 static int
1407 nfs4_active_data_reclaim(rnode4_t *rp)
1408 {
1409 	char *contents;
1410 	vnode_t *xattr;
1411 	int size;
1412 	vsecattr_t *vsp;
1413 	int freed;
1414 	bool_t rdc = FALSE;
1415 
1416 	/*
1417 	 * Free any held credentials and caches which
1418 	 * may be associated with this rnode.
1419 	 */
1420 	if (!mutex_tryenter(&rp->r_statelock))
1421 		return (0);
1422 	contents = rp->r_symlink.contents;
1423 	size = rp->r_symlink.size;
1424 	rp->r_symlink.contents = NULL;
1425 	vsp = rp->r_secattr;
1426 	rp->r_secattr = NULL;
1427 	if (rp->r_dir != NULL)
1428 		rdc = TRUE;
1429 	xattr = rp->r_xattr_dir;
1430 	rp->r_xattr_dir = NULL;
1431 	mutex_exit(&rp->r_statelock);
1432 
1433 	/*
1434 	 * Free the access cache entries.
1435 	 */
1436 	freed = nfs4_access_purge_rp(rp);
1437 
1438 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1439 		return (freed);
1440 
1441 	/*
1442 	 * Free the symbolic link cache.
1443 	 */
1444 	if (contents != NULL) {
1445 
1446 		kmem_free((void *)contents, size);
1447 	}
1448 
1449 	/*
1450 	 * Free any cached ACL.
1451 	 */
1452 	if (vsp != NULL)
1453 		nfs4_acl_free_cache(vsp);
1454 
1455 	nfs4_purge_rddir_cache(RTOV4(rp));
1456 
1457 	/*
1458 	 * Release the xattr directory vnode
1459 	 */
1460 	if (xattr != NULL)
1461 		VN_RELE(xattr);
1462 
1463 	return (1);
1464 }
1465 
1466 static int
1467 nfs4_free_reclaim(void)
1468 {
1469 	int freed;
1470 	rnode4_t *rp;
1471 
1472 #ifdef DEBUG
1473 	clstat4_debug.f_reclaim.value.ui64++;
1474 #endif
1475 	freed = 0;
1476 	mutex_enter(&rp4freelist_lock);
1477 	rp = rp4freelist;
1478 	if (rp != NULL) {
1479 		do {
1480 			if (nfs4_free_data_reclaim(rp))
1481 				freed = 1;
1482 		} while ((rp = rp->r_freef) != rp4freelist);
1483 	}
1484 	mutex_exit(&rp4freelist_lock);
1485 	return (freed);
1486 }
1487 
1488 static int
1489 nfs4_active_reclaim(void)
1490 {
1491 	int freed;
1492 	int index;
1493 	rnode4_t *rp;
1494 
1495 #ifdef DEBUG
1496 	clstat4_debug.a_reclaim.value.ui64++;
1497 #endif
1498 	freed = 0;
1499 	for (index = 0; index < rtable4size; index++) {
1500 		rw_enter(&rtable4[index].r_lock, RW_READER);
1501 		for (rp = rtable4[index].r_hashf;
1502 		    rp != (rnode4_t *)(&rtable4[index]);
1503 		    rp = rp->r_hashf) {
1504 			if (nfs4_active_data_reclaim(rp))
1505 				freed = 1;
1506 		}
1507 		rw_exit(&rtable4[index].r_lock);
1508 	}
1509 	return (freed);
1510 }
1511 
1512 static int
1513 nfs4_rnode_reclaim(void)
1514 {
1515 	int freed;
1516 	rnode4_t *rp;
1517 	vnode_t *vp;
1518 
1519 #ifdef DEBUG
1520 	clstat4_debug.r_reclaim.value.ui64++;
1521 #endif
1522 	freed = 0;
1523 	mutex_enter(&rp4freelist_lock);
1524 	while ((rp = rp4freelist) != NULL) {
1525 		rp4_rmfree(rp);
1526 		mutex_exit(&rp4freelist_lock);
1527 		if (rp->r_flags & R4HASHED) {
1528 			vp = RTOV4(rp);
1529 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1530 			mutex_enter(&vp->v_lock);
1531 			if (vp->v_count > 1) {
1532 				vp->v_count--;
1533 				mutex_exit(&vp->v_lock);
1534 				rw_exit(&rp->r_hashq->r_lock);
1535 				mutex_enter(&rp4freelist_lock);
1536 				continue;
1537 			}
1538 			mutex_exit(&vp->v_lock);
1539 			rp4_rmhash_locked(rp);
1540 			rw_exit(&rp->r_hashq->r_lock);
1541 		}
1542 		/*
1543 		 * This call to rp_addfree will end up destroying the
1544 		 * rnode, but in a safe way with the appropriate set
1545 		 * of checks done.
1546 		 */
1547 		rp4_addfree(rp, CRED());
1548 		mutex_enter(&rp4freelist_lock);
1549 	}
1550 	mutex_exit(&rp4freelist_lock);
1551 	return (freed);
1552 }
1553 
/*
 * Reclaim callback handed to kmem_cache_create() for rnode4_cache.
 *
 * The three reclaim stages run in order and stop at the first one that
 * reports freeing something: cached data of freelist rnodes first, then
 * cached data of all hashed rnodes, and finally the freelist rnodes
 * themselves.  The argument is not used.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (!nfs4_free_reclaim() && !nfs4_active_reclaim())
		(void) nfs4_rnode_reclaim();
}
1569 
1570 /*
1571  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1572  * clientid can change if the caller drops mi_recovlock.
1573  */
1574 
1575 clientid4
1576 mi2clientid(mntinfo4_t *mi)
1577 {
1578 	nfs4_server_t	*sp;
1579 	clientid4	clientid = 0;
1580 
1581 	/* this locks down sp if it is found */
1582 	sp = find_nfs4_server(mi);
1583 	if (sp != NULL) {
1584 		clientid = sp->clientid;
1585 		mutex_exit(&sp->s_lock);
1586 		nfs4_server_rele(sp);
1587 	}
1588 	return (clientid);
1589 }
1590 
1591 /*
1592  * Return the current lease time for the server associated with the given
1593  * file.  Note that the lease time could change immediately after this
1594  * call.
1595  */
1596 
1597 time_t
1598 r2lease_time(rnode4_t *rp)
1599 {
1600 	nfs4_server_t	*sp;
1601 	time_t		lease_time;
1602 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1603 
1604 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1605 
1606 	/* this locks down sp if it is found */
1607 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1608 
1609 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1610 		if (sp != NULL) {
1611 			mutex_exit(&sp->s_lock);
1612 			nfs4_server_rele(sp);
1613 		}
1614 		nfs_rw_exit(&mi->mi_recovlock);
1615 		return (1);		/* 1 second */
1616 	}
1617 
1618 	ASSERT(sp != NULL);
1619 
1620 	lease_time = sp->s_lease_time;
1621 
1622 	mutex_exit(&sp->s_lock);
1623 	nfs4_server_rele(sp);
1624 	nfs_rw_exit(&mi->mi_recovlock);
1625 
1626 	return (lease_time);
1627 }
1628 
1629 /*
1630  * Return a list with information about all the known open instances for
1631  * a filesystem. The caller must call r4releopenlist() when done with the
1632  * list.
1633  *
1634  * We are safe at looking at os_valid and os_pending_close across dropping
1635  * the 'os_sync_lock' to count up the number of open streams and then
1636  * allocate memory for the osp list due to:
1637  *	-Looking at os_pending_close is safe since this routine is
1638  *	only called via recovery, and os_pending_close can only be set via
1639  *	a non-recovery operation (which are all blocked when recovery
1640  *	is active).
1641  *
1642  *	-Examining os_valid is safe since non-recovery operations, which
1643  *	could potentially switch os_valid to 0, are blocked (via
1644  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1645  *	(which means we are the only recovery thread potentially acting
1646  *	on this open stream).
1647  */
1648 
1649 nfs4_opinst_t *
1650 r4mkopenlist(mntinfo4_t *mi)
1651 {
1652 	nfs4_opinst_t *reopenlist, *rep;
1653 	rnode4_t *rp;
1654 	vnode_t *vp;
1655 	vfs_t *vfsp = mi->mi_vfsp;
1656 	int numosp;
1657 	nfs4_open_stream_t *osp;
1658 	int index;
1659 	open_delegation_type4 dtype;
1660 	int hold_vnode;
1661 
1662 	reopenlist = NULL;
1663 
1664 	for (index = 0; index < rtable4size; index++) {
1665 		rw_enter(&rtable4[index].r_lock, RW_READER);
1666 		for (rp = rtable4[index].r_hashf;
1667 		    rp != (rnode4_t *)(&rtable4[index]);
1668 		    rp = rp->r_hashf) {
1669 
1670 			vp = RTOV4(rp);
1671 			if (vp->v_vfsp != vfsp)
1672 				continue;
1673 			hold_vnode = 0;
1674 
1675 			mutex_enter(&rp->r_os_lock);
1676 
1677 			/* Count the number of valid open_streams of the file */
1678 			numosp = 0;
1679 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1680 			    osp = list_next(&rp->r_open_streams, osp)) {
1681 				mutex_enter(&osp->os_sync_lock);
1682 				if (osp->os_valid && !osp->os_pending_close)
1683 					numosp++;
1684 				mutex_exit(&osp->os_sync_lock);
1685 			}
1686 
1687 			/* Fill in the valid open streams per vp */
1688 			if (numosp > 0) {
1689 				int j;
1690 
1691 				hold_vnode = 1;
1692 
1693 				/*
1694 				 * Add a new open instance to the list
1695 				 */
1696 				rep = kmem_zalloc(sizeof (*reopenlist),
1697 				    KM_SLEEP);
1698 				rep->re_next = reopenlist;
1699 				reopenlist = rep;
1700 
1701 				rep->re_vp = vp;
1702 				rep->re_osp = kmem_zalloc(
1703 				    numosp * sizeof (*(rep->re_osp)),
1704 				    KM_SLEEP);
1705 				rep->re_numosp = numosp;
1706 
1707 				j = 0;
1708 				for (osp = list_head(&rp->r_open_streams);
1709 				    osp != NULL;
1710 				    osp = list_next(&rp->r_open_streams, osp)) {
1711 
1712 					mutex_enter(&osp->os_sync_lock);
1713 					if (osp->os_valid &&
1714 					    !osp->os_pending_close) {
1715 						osp->os_ref_count++;
1716 						rep->re_osp[j] = osp;
1717 						j++;
1718 					}
1719 					mutex_exit(&osp->os_sync_lock);
1720 				}
1721 				/*
1722 				 * Assuming valid osp(s) stays valid between
1723 				 * the time obtaining j and numosp.
1724 				 */
1725 				ASSERT(j == numosp);
1726 			}
1727 
1728 			mutex_exit(&rp->r_os_lock);
1729 			/* do this here to keep v_lock > r_os_lock */
1730 			if (hold_vnode)
1731 				VN_HOLD(vp);
1732 			mutex_enter(&rp->r_statev4_lock);
1733 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1734 				/*
1735 				 * If this rnode holds a delegation,
1736 				 * but if there are no valid open streams,
1737 				 * then just discard the delegation
1738 				 * without doing delegreturn.
1739 				 */
1740 				if (numosp > 0)
1741 					rp->r_deleg_needs_recovery =
1742 					    rp->r_deleg_type;
1743 			}
1744 			/* Save the delegation type for use outside the lock */
1745 			dtype = rp->r_deleg_type;
1746 			mutex_exit(&rp->r_statev4_lock);
1747 
1748 			/*
1749 			 * If we have a delegation then get rid of it.
1750 			 * We've set rp->r_deleg_needs_recovery so we have
1751 			 * enough information to recover.
1752 			 */
1753 			if (dtype != OPEN_DELEGATE_NONE) {
1754 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1755 			}
1756 		}
1757 		rw_exit(&rtable4[index].r_lock);
1758 	}
1759 	return (reopenlist);
1760 }
1761 
1762 /*
1763  * Release the list of open instance references.
1764  */
1765 
1766 void
1767 r4releopenlist(nfs4_opinst_t *reopenp)
1768 {
1769 	nfs4_opinst_t *rep, *next;
1770 	int i;
1771 
1772 	for (rep = reopenp; rep; rep = next) {
1773 		next = rep->re_next;
1774 
1775 		for (i = 0; i < rep->re_numosp; i++)
1776 			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1777 
1778 		VN_RELE(rep->re_vp);
1779 		kmem_free(rep->re_osp,
1780 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1781 
1782 		kmem_free(rep, sizeof (*rep));
1783 	}
1784 }
1785 
1786 int
1787 nfs4_rnode_init(void)
1788 {
1789 	ulong_t nrnode4_max;
1790 	int i;
1791 
1792 	/*
1793 	 * Compute the size of the rnode4 hash table
1794 	 */
1795 	if (nrnode <= 0)
1796 		nrnode = ncsize;
1797 	nrnode4_max =
1798 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1799 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1800 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1801 		    "setting nrnode to max value of %ld", nrnode4_max);
1802 		nrnode = nrnode4_max;
1803 	}
1804 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1805 	rtable4mask = rtable4size - 1;
1806 
1807 	/*
1808 	 * Allocate and initialize the hash buckets
1809 	 */
1810 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1811 	for (i = 0; i < rtable4size; i++) {
1812 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1813 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1814 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1815 	}
1816 
1817 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1818 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1819 
1820 	return (0);
1821 }
1822 
1823 int
1824 nfs4_rnode_fini(void)
1825 {
1826 	int i;
1827 
1828 	/*
1829 	 * Deallocate the rnode hash queues
1830 	 */
1831 	kmem_cache_destroy(rnode4_cache);
1832 
1833 	for (i = 0; i < rtable4size; i++)
1834 		rw_destroy(&rtable4[i].r_lock);
1835 
1836 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1837 
1838 	return (0);
1839 }
1840 
1841 /*
1842  * Return non-zero if the given filehandle refers to the root filehandle
1843  * for the given rnode.
1844  */
1845 
1846 static int
1847 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1848 {
1849 	int isroot;
1850 
1851 	isroot = 0;
1852 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1853 		isroot = 1;
1854 
1855 	return (isroot);
1856 }
1857 
1858 /*
1859  * The r4_stub_* routines assume that the rnode is newly activated, and
1860  * that the caller either holds the hash bucket r_lock for this rnode as
1861  * RW_WRITER, or holds r_statelock.
1862  */
1863 static void
1864 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1865 {
1866 	vnode_t *vp = RTOV4(rp);
1867 	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1868 
1869 	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1870 
1871 	rp->r_stub_type = type;
1872 
1873 	/*
1874 	 * Safely switch this vnode to the trigger vnodeops.
1875 	 *
1876 	 * Currently, we don't ever switch a trigger vnode back to using
1877 	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1878 	 * a new v4 object is not a trigger, and it will already have the
1879 	 * correct v4 vnodeops by default. So, no "else" case required here.
1880 	 */
1881 	if (type != NFS4_STUB_NONE)
1882 		vn_setops(vp, nfs4_trigger_vnodeops);
1883 }
1884 
1885 void
1886 r4_stub_mirrormount(rnode4_t *rp)
1887 {
1888 	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
1889 }
1890 
1891 void
1892 r4_stub_none(rnode4_t *rp)
1893 {
1894 	r4_stub_set(rp, NFS4_STUB_NONE);
1895 }
1896 
1897 #ifdef DEBUG
1898 
1899 /*
1900  * Look in the rnode table for other rnodes that have the same filehandle.
1901  * Assume the lock is held for the hash chain of checkrp
1902  */
1903 
1904 static void
1905 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
1906 {
1907 	rnode4_t *rp;
1908 	vnode_t *tvp;
1909 	nfs4_fhandle_t fh, fh2;
1910 	int index;
1911 
1912 	if (!r4_check_for_dups)
1913 		return;
1914 
1915 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
1916 
1917 	sfh4_copyval(checkrp->r_fh, &fh);
1918 
1919 	for (index = 0; index < rtable4size; index++) {
1920 
1921 		if (&rtable4[index] != checkrp->r_hashq)
1922 			rw_enter(&rtable4[index].r_lock, RW_READER);
1923 
1924 		for (rp = rtable4[index].r_hashf;
1925 		    rp != (rnode4_t *)(&rtable4[index]);
1926 		    rp = rp->r_hashf) {
1927 
1928 			if (rp == checkrp)
1929 				continue;
1930 
1931 			tvp = RTOV4(rp);
1932 			if (tvp->v_vfsp != vfsp)
1933 				continue;
1934 
1935 			sfh4_copyval(rp->r_fh, &fh2);
1936 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
1937 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
1938 				    "(%p, %p)", (void *)checkrp, (void *)rp);
1939 			}
1940 		}
1941 
1942 		if (&rtable4[index] != checkrp->r_hashq)
1943 			rw_exit(&rtable4[index].r_lock);
1944 	}
1945 }
1946 
1947 #endif /* DEBUG */
1948