1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28 *	All Rights Reserved
29 */
30
31/*
32 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
33 * Copyright (c) 2017 by Delphix. All rights reserved.
34 */
35
36#include <sys/param.h>
37#include <sys/types.h>
38#include <sys/systm.h>
39#include <sys/cred.h>
40#include <sys/proc.h>
41#include <sys/user.h>
42#include <sys/time.h>
43#include <sys/buf.h>
44#include <sys/vfs.h>
45#include <sys/vnode.h>
46#include <sys/socket.h>
47#include <sys/uio.h>
48#include <sys/tiuser.h>
49#include <sys/swap.h>
50#include <sys/errno.h>
51#include <sys/debug.h>
52#include <sys/kmem.h>
53#include <sys/kstat.h>
54#include <sys/cmn_err.h>
55#include <sys/vtrace.h>
56#include <sys/session.h>
57#include <sys/dnlc.h>
58#include <sys/bitmap.h>
59#include <sys/acl.h>
60#include <sys/ddi.h>
61#include <sys/pathname.h>
62#include <sys/flock.h>
63#include <sys/dirent.h>
64#include <sys/flock.h>
65#include <sys/callb.h>
66#include <sys/sdt.h>
67
68#include <vm/pvn.h>
69
70#include <rpc/types.h>
71#include <rpc/xdr.h>
72#include <rpc/auth.h>
73#include <rpc/rpcsec_gss.h>
74#include <rpc/clnt.h>
75
76#include <nfs/nfs.h>
77#include <nfs/nfs_clnt.h>
78#include <nfs/nfs_acl.h>
79
80#include <nfs/nfs4.h>
81#include <nfs/rnode4.h>
82#include <nfs/nfs4_clnt.h>
83
84/*
85 * The hash queues for the access to active and cached rnodes
86 * are organized as doubly linked lists.  A reader/writer lock
87 * for each hash bucket is used to control access and to synchronize
88 * lookups, additions, and deletions from the hash queue.
89 *
90 * The rnode freelist is organized as a doubly linked list with
91 * a head pointer.  Additions and deletions are synchronized via
92 * a single mutex.
93 *
94 * In order to add an rnode to the free list, it must be hashed into
95 * a hash queue and the exclusive lock to the hash queue be held.
96 * If an rnode is not hashed into a hash queue, then it is destroyed
97 * because it represents no valuable information that can be reused
98 * about the file.  The exclusive lock to the hash queue must be
99 * held in order to prevent a lookup in the hash queue from finding
100 * the rnode and using it and assuming that the rnode is not on the
101 * freelist.  The lookup in the hash queue will have the hash queue
102 * locked, either exclusive or shared.
103 *
104 * The vnode reference count for each rnode is not allowed to drop
105 * below 1.  This prevents external entities, such as the VM
106 * subsystem, from acquiring references to vnodes already on the
107 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
109 * rnode is looked up in the hash queues, then either the rnode
110 * is removed from the freelist and that reference is transferred to
111 * the new reference or the vnode reference count must be incremented
112 * accordingly.  The mutex for the freelist must be held in order to
113 * accurately test to see if the rnode is on the freelist or not.
114 * The hash queue lock might be held shared and it is possible that
115 * two different threads may race to remove the rnode from the
116 * freelist.  This race can be resolved by holding the mutex for the
117 * freelist.  Please note that the mutex for the freelist does not
118 * need to be held if the rnode is not on the freelist.  It can not be
119 * placed on the freelist due to the requirement that the thread
120 * putting the rnode on the freelist must hold the exclusive lock
121 * to the hash queue and the thread doing the lookup in the hash
122 * queue is holding either a shared or exclusive lock to the hash
123 * queue.
124 *
125 * The lock ordering is:
126 *
127 *	hash bucket lock -> vnode lock
128 *	hash bucket lock -> freelist lock -> r_statelock
129 */
/*
 * The rnode hash table: an array of hash buckets, indexed by the value
 * returned from rtable4hash().  Each bucket carries its own r_lock.
 */
r4hashq_t *rtable4;

/* Protects rp4freelist and the r_freef/r_freeb links of the rnodes on it. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/*
 * Count of rnodes allocated from rnode4_cache; incremented in make_rnode4()
 * and compared against nrnode to decide between recycling and allocating.
 */
static long rnode4_new = 0;
/* NOTE(review): presumably the number of buckets in rtable4 -- confirm. */
int rtable4size;
/* Mask ANDed into the hash value by rtable4hash(). */
static int rtable4mask;
/* kmem cache from which rnode4_t structures are allocated. */
static struct kmem_cache *rnode4_cache;
/* NOTE(review): not referenced in the visible part of the file -- confirm use. */
static int rnode4_hashlen = 4;

/*
 * Forward declarations for routines defined later in this file.
 */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

/*
 * Debug-only tunables and helpers.
 */
#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif
166
167/*
168 * If the vnode has pages, run the list and check for any that are
169 * still dangling.  We call this routine before putting an rnode on
170 * the free list.
171 */
172static int
173nfs4_dross_pages(vnode_t *vp)
174{
175	page_t *pp;
176	kmutex_t *vphm;
177
178	vphm = page_vnode_mutex(vp);
179	mutex_enter(vphm);
180	if ((pp = vp->v_pages) != NULL) {
181		do {
182			if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
183			    pp->p_fsdata != C_NOCOMMIT) {
184				mutex_exit(vphm);
185				return (1);
186			}
187		} while ((pp = pp->p_vpnext) != vp->v_pages);
188	}
189	mutex_exit(vphm);
190
191	return (0);
192}
193
194/*
195 * Flush any pages left on this rnode.
196 */
197static void
198r4flushpages(rnode4_t *rp, cred_t *cr)
199{
200	vnode_t *vp;
201	int error;
202
203	/*
204	 * Before freeing anything, wait until all asynchronous
205	 * activity is done on this rnode.  This will allow all
206	 * asynchronous read ahead and write behind i/o's to
207	 * finish.
208	 */
209	mutex_enter(&rp->r_statelock);
210	while (rp->r_count > 0)
211		cv_wait(&rp->r_cv, &rp->r_statelock);
212	mutex_exit(&rp->r_statelock);
213
214	/*
215	 * Flush and invalidate all pages associated with the vnode.
216	 */
217	vp = RTOV4(rp);
218	if (nfs4_has_pages(vp)) {
219		ASSERT(vp->v_type != VCHR);
220		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
221			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
222			if (error && (error == ENOSPC || error == EDQUOT)) {
223				mutex_enter(&rp->r_statelock);
224				if (!rp->r_error)
225					rp->r_error = error;
226				mutex_exit(&rp->r_statelock);
227			}
228		}
229		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
230	}
231}
232
233/*
234 * Free the resources associated with an rnode.
235 */
236static void
237r4inactive(rnode4_t *rp, cred_t *cr)
238{
239	vnode_t *vp;
240	char *contents;
241	int size;
242	vsecattr_t *vsp;
243	vnode_t *xattr;
244
245	r4flushpages(rp, cr);
246
247	vp = RTOV4(rp);
248
249	/*
250	 * Free any held caches which may be
251	 * associated with this rnode.
252	 */
253	mutex_enter(&rp->r_statelock);
254	contents = rp->r_symlink.contents;
255	size = rp->r_symlink.size;
256	rp->r_symlink.contents = NULL;
257	vsp = rp->r_secattr;
258	rp->r_secattr = NULL;
259	xattr = rp->r_xattr_dir;
260	rp->r_xattr_dir = NULL;
261	mutex_exit(&rp->r_statelock);
262
263	/*
264	 * Free the access cache entries.
265	 */
266	(void) nfs4_access_purge_rp(rp);
267
268	/*
269	 * Free the readdir cache entries.
270	 */
271	nfs4_purge_rddir_cache(vp);
272
273	/*
274	 * Free the symbolic link cache.
275	 */
276	if (contents != NULL) {
277
278		kmem_free((void *)contents, size);
279	}
280
281	/*
282	 * Free any cached ACL.
283	 */
284	if (vsp != NULL)
285		nfs4_acl_free_cache(vsp);
286
287	/*
288	 * Release the cached xattr_dir
289	 */
290	if (xattr != NULL)
291		VN_RELE(xattr);
292}
293
294/*
295 * We have seen a case that the fh passed in is for "." which
296 * should be a VROOT node, however, the fh is different from the
297 * root fh stored in the mntinfo4_t. The invalid fh might be
298 * from a misbehaved server and will panic the client system at
299 * a later time. To avoid the panic, we drop the bad fh, use
300 * the root fh from mntinfo4_t, and print an error message
301 * for attention.
302 */
303nfs4_sharedfh_t *
304badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
305    int *wasbad)
306{
307	char *s;
308
309	*wasbad = 0;
310	s = fn_name(nm);
311	ASSERT(strcmp(s, "..") != 0);
312
313	if ((s[0] == '.' && s[1] == '\0') && fh &&
314	    !SFH4_SAME(mi->mi_rootfh, fh)) {
315#ifdef DEBUG
316		nfs4_fhandle_t fhandle;
317
318		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
319		    "Server %s returns a different "
320		    "root filehandle for the path %s:",
321		    mi->mi_curr_serv->sv_hostname,
322		    mi->mi_curr_serv->sv_path);
323
324		/* print the bad fh */
325		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
326		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
327		    fhandle.fh_len);
328		nfs4_printfhandle(&fhandle);
329
330		/* print mi_rootfh */
331		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
332		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
333		    fhandle.fh_len);
334		nfs4_printfhandle(&fhandle);
335#endif
336		/* use mi_rootfh instead; fh will be rele by the caller */
337		fh = mi->mi_rootfh;
338		*wasbad = 1;
339	}
340
341	kmem_free(s, MAXNAMELEN);
342	return (fh);
343}
344
345void
346r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
347    hrtime_t t, cred_t *cr, int index)
348{
349	int is_stub;
350	vattr_t *attr;
351	/*
352	 * Don't add to attrcache if time overflow, but
353	 * no need to check because either attr is null or the time
354	 * values in it were processed by nfs4_time_ntov(), which checks
355	 * for time overflows.
356	 */
357	attr = garp ? &garp->n4g_va : NULL;
358
359	if (attr) {
360		if (!newnode) {
361			rw_exit(&rtable4[index].r_lock);
362#ifdef DEBUG
363			if (vp->v_type != attr->va_type &&
364			    vp->v_type != VNON && attr->va_type != VNON) {
365				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
366				    "makenfs4node: type (%d) doesn't "
367				    "match type of found node at %p (%d)",
368				    attr->va_type, (void *)vp, vp->v_type);
369			}
370#endif
371			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
372		} else {
373			rnode4_t *rp = VTOR4(vp);
374
375			vp->v_type = attr->va_type;
376			vp->v_rdev = attr->va_rdev;
377
378			/*
379			 * Turn this object into a "stub" object if we
380			 * crossed an underlying server fs boundary.
381			 * To make this check, during mount we save the
382			 * fsid of the server object being mounted.
383			 * Here we compare this object's server fsid
384			 * with the fsid we saved at mount.  If they
385			 * are different, we crossed server fs boundary.
386			 *
387			 * The stub type is set (or not) at rnode
388			 * creation time and it never changes for life
389			 * of the rnode.
390			 *
391			 * This stub will be for a mirror-mount, rather than
392			 * a referral (the latter also sets R4SRVSTUB).
393			 *
394			 * The stub type is also set during RO failover,
395			 * nfs4_remap_file().
396			 *
397			 * We don't bother with taking r_state_lock to
398			 * set the stub type because this is a new rnode
399			 * and we're holding the hash bucket r_lock RW_WRITER.
400			 * No other thread could have obtained access
401			 * to this rnode.
402			 */
403			is_stub = 0;
404			if (garp->n4g_fsid_valid) {
405				fattr4_fsid ga_fsid = garp->n4g_fsid;
406				servinfo4_t *svp = rp->r_server;
407
408				rp->r_srv_fsid = ga_fsid;
409
410				(void) nfs_rw_enter_sig(&svp->sv_lock,
411				    RW_READER, 0);
412				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
413					is_stub = 1;
414				nfs_rw_exit(&svp->sv_lock);
415			}
416
417			if (is_stub)
418				r4_stub_mirrormount(rp);
419			else
420				r4_stub_none(rp);
421
422			/* Can not cache partial attr */
423			if (attr->va_mask == AT_ALL)
424				nfs4_attrcache_noinval(vp, garp, t);
425			else
426				PURGE_ATTRCACHE4(vp);
427
428			rw_exit(&rtable4[index].r_lock);
429		}
430	} else {
431		if (newnode) {
432			PURGE_ATTRCACHE4(vp);
433		}
434		rw_exit(&rtable4[index].r_lock);
435	}
436}
437
438/*
439 * Find or create an rnode based primarily on filehandle.  To be
440 * used when dvp (vnode for parent directory) is not available;
441 * otherwise, makenfs4node() should be used.
442 *
443 * The nfs4_fname_t argument *npp is consumed and nulled out.
444 */
445
446vnode_t *
447makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
448    nfs4_fname_t **npp, nfs4_ga_res_t *garp,
449    mntinfo4_t *mi, cred_t *cr, hrtime_t t)
450{
451	vfs_t *vfsp = mi->mi_vfsp;
452	int newnode = 0;
453	vnode_t *vp;
454	rnode4_t *rp;
455	svnode_t *svp;
456	nfs4_fname_t *name, *svpname;
457	int index;
458
459	ASSERT(npp && *npp);
460	name = *npp;
461	*npp = NULL;
462
463	index = rtable4hash(sfh);
464	rw_enter(&rtable4[index].r_lock, RW_READER);
465
466	vp = make_rnode4(sfh, &rtable4[index], vfsp,
467	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
468
469	svp = VTOSV(vp);
470	rp = VTOR4(vp);
471	if (newnode) {
472		svp->sv_forw = svp->sv_back = svp;
473		svp->sv_name = name;
474		if (psfh != NULL)
475			sfh4_hold(psfh);
476		svp->sv_dfh = psfh;
477	} else {
478		/*
479		 * It is possible that due to a server
480		 * side rename fnames have changed.
481		 * update the fname here.
482		 */
483		mutex_enter(&rp->r_svlock);
484		svpname = svp->sv_name;
485		if (svp->sv_name != name) {
486			svp->sv_name = name;
487			mutex_exit(&rp->r_svlock);
488			fn_rele(&svpname);
489		} else {
490			mutex_exit(&rp->r_svlock);
491			fn_rele(&name);
492		}
493	}
494
495	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
496	r4_do_attrcache(vp, garp, newnode, t, cr, index);
497	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
498
499	return (vp);
500}
501
502/*
503 * Find or create a vnode for the given filehandle, filesystem, parent, and
504 * name.  The reference to nm is consumed, so the caller must first do an
505 * fn_hold() if it wants to continue using nm after this call.
506 */
507vnode_t *
508makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
509    hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
510{
511	vnode_t *vp;
512	int newnode;
513	int index;
514	mntinfo4_t *mi = VFTOMI4(vfsp);
515	int had_badfh = 0;
516	rnode4_t *rp;
517
518	ASSERT(dvp != NULL);
519
520	fh = badrootfh_check(fh, nm, mi, &had_badfh);
521
522	index = rtable4hash(fh);
523	rw_enter(&rtable4[index].r_lock, RW_READER);
524
525	/*
526	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
527	 */
528	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
529	    nfs4_putapage, &newnode, cr);
530
531	rp = VTOR4(vp);
532	sv_activate(&vp, dvp, &nm, newnode);
533	if (dvp->v_flag & V_XATTRDIR) {
534		mutex_enter(&rp->r_statelock);
535		rp->r_flags |= R4ISXATTR;
536		mutex_exit(&rp->r_statelock);
537	}
538
539	/* if getting a bad file handle, do not cache the attributes. */
540	if (had_badfh) {
541		rw_exit(&rtable4[index].r_lock);
542		return (vp);
543	}
544
545	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
546	r4_do_attrcache(vp, garp, newnode, t, cr, index);
547	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
548
549	return (vp);
550}
551
552/*
553 * Hash on address of filehandle object.
554 * XXX totally untuned.
555 */
556
557int
558rtable4hash(nfs4_sharedfh_t *fh)
559{
560	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
561}
562
563/*
564 * Find or create the vnode for the given filehandle and filesystem.
565 * *newnode is set to zero if the vnode already existed; non-zero if it had
566 * to be created.
567 *
568 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
569 */
570
571static vnode_t *
572make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
573    struct vnodeops *vops,
574    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
575    int *newnode, cred_t *cr)
576{
577	rnode4_t *rp;
578	rnode4_t *trp;
579	vnode_t *vp;
580	mntinfo4_t *mi;
581
582	ASSERT(RW_READ_HELD(&rhtp->r_lock));
583
584	mi = VFTOMI4(vfsp);
585
586start:
587	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
588		vp = RTOV4(rp);
589		*newnode = 0;
590		return (vp);
591	}
592	rw_exit(&rhtp->r_lock);
593
594	mutex_enter(&rp4freelist_lock);
595
596	if (rp4freelist != NULL && rnode4_new >= nrnode) {
597		rp = rp4freelist;
598		rp4_rmfree(rp);
599		mutex_exit(&rp4freelist_lock);
600
601		vp = RTOV4(rp);
602
603		if (rp->r_flags & R4HASHED) {
604			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
605			mutex_enter(&vp->v_lock);
606			if (vp->v_count > 1) {
607				VN_RELE_LOCKED(vp);
608				mutex_exit(&vp->v_lock);
609				rw_exit(&rp->r_hashq->r_lock);
610				rw_enter(&rhtp->r_lock, RW_READER);
611				goto start;
612			}
613			mutex_exit(&vp->v_lock);
614			rp4_rmhash_locked(rp);
615			rw_exit(&rp->r_hashq->r_lock);
616		}
617
618		r4inactive(rp, cr);
619
620		mutex_enter(&vp->v_lock);
621		if (vp->v_count > 1) {
622			VN_RELE_LOCKED(vp);
623			mutex_exit(&vp->v_lock);
624			rw_enter(&rhtp->r_lock, RW_READER);
625			goto start;
626		}
627		mutex_exit(&vp->v_lock);
628		vn_invalid(vp);
629
630		/*
631		 * destroy old locks before bzero'ing and
632		 * recreating the locks below.
633		 */
634		uninit_rnode4(rp);
635
636		/*
637		 * Make sure that if rnode is recycled then
638		 * VFS count is decremented properly before
639		 * reuse.
640		 */
641		VFS_RELE(vp->v_vfsp);
642		vn_reinit(vp);
643	} else {
644		vnode_t *new_vp;
645
646		mutex_exit(&rp4freelist_lock);
647
648		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
649		new_vp = vn_alloc(KM_SLEEP);
650
651		atomic_inc_ulong((ulong_t *)&rnode4_new);
652#ifdef DEBUG
653		clstat4_debug.nrnode.value.ui64++;
654#endif
655		vp = new_vp;
656	}
657
658	bzero(rp, sizeof (*rp));
659	rp->r_vnode = vp;
660	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
661	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
662	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
663	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
664	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
665	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
666	rp->created_v4 = 0;
667	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
668	    offsetof(nfs4_open_stream_t, os_node));
669	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
670	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
671	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
672	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
673	rp->r_flags = R4READDIRWATTR;
674	rp->r_fh = fh;
675	rp->r_hashq = rhtp;
676	sfh4_hold(rp->r_fh);
677	rp->r_server = mi->mi_curr_serv;
678	rp->r_deleg_type = OPEN_DELEGATE_NONE;
679	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
680	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
681
682	rddir4_cache_create(rp);
683	rp->r_putapage = putapage;
684	vn_setops(vp, vops);
685	vp->v_data = (caddr_t)rp;
686	vp->v_vfsp = vfsp;
687	VFS_HOLD(vfsp);
688	vp->v_type = VNON;
689	vp->v_flag |= VMODSORT;
690	if (isrootfh(fh, rp))
691		vp->v_flag = VROOT;
692	vn_exists(vp);
693
694	/*
695	 * There is a race condition if someone else
696	 * alloc's the rnode while no locks are held, so we
697	 * check again and recover if found.
698	 */
699	rw_enter(&rhtp->r_lock, RW_WRITER);
700	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
701		vp = RTOV4(trp);
702		*newnode = 0;
703		rw_exit(&rhtp->r_lock);
704		rp4_addfree(rp, cr);
705		rw_enter(&rhtp->r_lock, RW_READER);
706		return (vp);
707	}
708	rp4_addhash(rp);
709	*newnode = 1;
710	return (vp);
711}
712
713static void
714uninit_rnode4(rnode4_t *rp)
715{
716	vnode_t *vp = RTOV4(rp);
717
718	ASSERT(rp != NULL);
719	ASSERT(vp != NULL);
720	ASSERT(vp->v_count == 1);
721	ASSERT(rp->r_count == 0);
722	ASSERT(rp->r_mapcnt == 0);
723	if (rp->r_flags & R4LODANGLERS) {
724		nfs4_flush_lock_owners(rp);
725	}
726	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
727	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
728	ASSERT(!(rp->r_flags & R4HASHED));
729	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
730	nfs4_clear_open_streams(rp);
731	list_destroy(&rp->r_open_streams);
732
733	/*
734	 * Destroy the rddir cache first since we need to grab the r_statelock.
735	 */
736	mutex_enter(&rp->r_statelock);
737	rddir4_cache_destroy(rp);
738	mutex_exit(&rp->r_statelock);
739	sv_uninit(&rp->r_svnode);
740	sfh4_rele(&rp->r_fh);
741	nfs_rw_destroy(&rp->r_rwlock);
742	nfs_rw_destroy(&rp->r_lkserlock);
743	mutex_destroy(&rp->r_statelock);
744	mutex_destroy(&rp->r_statev4_lock);
745	mutex_destroy(&rp->r_os_lock);
746	cv_destroy(&rp->r_cv);
747	cv_destroy(&rp->r_commit.c_cv);
748	nfs_rw_destroy(&rp->r_deleg_recall_lock);
749	if (rp->r_flags & R4DELMAPLIST)
750		list_destroy(&rp->r_indelmap);
751}
752
753/*
754 * Put an rnode on the free list.
755 *
756 * Rnodes which were allocated above and beyond the normal limit
757 * are immediately freed.
758 */
759void
760rp4_addfree(rnode4_t *rp, cred_t *cr)
761{
762	vnode_t *vp;
763	vnode_t *xattr;
764	struct vfs *vfsp;
765
766	vp = RTOV4(rp);
767	ASSERT(vp->v_count >= 1);
768	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
769
770	/*
771	 * If we have too many rnodes allocated and there are no
772	 * references to this rnode, or if the rnode is no longer
773	 * accessible by it does not reside in the hash queues,
774	 * or if an i/o error occurred while writing to the file,
775	 * then just free it instead of putting it on the rnode
776	 * freelist.
777	 */
778	vfsp = vp->v_vfsp;
779	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
780#ifdef DEBUG
781	    (nfs4_rnode_nofreelist != 0) ||
782#endif
783	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
784	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
785		if (rp->r_flags & R4HASHED) {
786			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
787			mutex_enter(&vp->v_lock);
788			if (vp->v_count > 1) {
789				VN_RELE_LOCKED(vp);
790				mutex_exit(&vp->v_lock);
791				rw_exit(&rp->r_hashq->r_lock);
792				return;
793			}
794			mutex_exit(&vp->v_lock);
795			rp4_rmhash_locked(rp);
796			rw_exit(&rp->r_hashq->r_lock);
797		}
798
799		/*
800		 * Make sure we don't have a delegation on this rnode
801		 * before destroying it.
802		 */
803		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
804			(void) nfs4delegreturn(rp,
805			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
806		}
807
808		r4inactive(rp, cr);
809
810		/*
811		 * Recheck the vnode reference count.  We need to
812		 * make sure that another reference has not been
813		 * acquired while we were not holding v_lock.  The
814		 * rnode is not in the rnode hash queues; one
815		 * way for a reference to have been acquired
816		 * is for a VOP_PUTPAGE because the rnode was marked
817		 * with R4DIRTY or for a modified page.  This
818		 * reference may have been acquired before our call
819		 * to r4inactive.  The i/o may have been completed,
820		 * thus allowing r4inactive to complete, but the
821		 * reference to the vnode may not have been released
822		 * yet.  In any case, the rnode can not be destroyed
823		 * until the other references to this vnode have been
824		 * released.  The other references will take care of
825		 * either destroying the rnode or placing it on the
826		 * rnode freelist.  If there are no other references,
827		 * then the rnode may be safely destroyed.
828		 */
829		mutex_enter(&vp->v_lock);
830		if (vp->v_count > 1) {
831			VN_RELE_LOCKED(vp);
832			mutex_exit(&vp->v_lock);
833			return;
834		}
835		mutex_exit(&vp->v_lock);
836
837		destroy_rnode4(rp);
838		return;
839	}
840
841	/*
842	 * Lock the hash queue and then recheck the reference count
843	 * to ensure that no other threads have acquired a reference
844	 * to indicate that the rnode should not be placed on the
845	 * freelist.  If another reference has been acquired, then
846	 * just release this one and let the other thread complete
847	 * the processing of adding this rnode to the freelist.
848	 */
849again:
850	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
851
852	mutex_enter(&vp->v_lock);
853	if (vp->v_count > 1) {
854		VN_RELE_LOCKED(vp);
855		mutex_exit(&vp->v_lock);
856		rw_exit(&rp->r_hashq->r_lock);
857		return;
858	}
859	mutex_exit(&vp->v_lock);
860
861	/*
862	 * Make sure we don't put an rnode with a delegation
863	 * on the free list.
864	 */
865	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
866		rw_exit(&rp->r_hashq->r_lock);
867		(void) nfs4delegreturn(rp,
868		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
869		goto again;
870	}
871
872	/*
873	 * Now that we have the hash queue lock, and we know there
874	 * are not anymore references on the vnode, check to make
875	 * sure there aren't any open streams still on the rnode.
876	 * If so, drop the hash queue lock, remove the open streams,
877	 * and recheck the v_count.
878	 */
879	mutex_enter(&rp->r_os_lock);
880	if (list_head(&rp->r_open_streams) != NULL) {
881		mutex_exit(&rp->r_os_lock);
882		rw_exit(&rp->r_hashq->r_lock);
883		if (nfs_zone() != VTOMI4(vp)->mi_zone)
884			nfs4_clear_open_streams(rp);
885		else
886			(void) nfs4close_all(vp, cr);
887		goto again;
888	}
889	mutex_exit(&rp->r_os_lock);
890
891	/*
892	 * Before we put it on the freelist, make sure there are no pages.
893	 * If there are, flush and commit of all of the dirty and
894	 * uncommitted pages, assuming the file system isn't read only.
895	 */
896	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
897		rw_exit(&rp->r_hashq->r_lock);
898		r4flushpages(rp, cr);
899		goto again;
900	}
901
902	/*
903	 * Before we put it on the freelist, make sure there is no
904	 * active xattr directory cached, the freelist will not
905	 * have its entries r4inactive'd if there is still an active
906	 * rnode, thus nothing in the freelist can hold another
907	 * rnode active.
908	 */
909	xattr = rp->r_xattr_dir;
910	rp->r_xattr_dir = NULL;
911
912	/*
913	 * If there is no cached data or metadata for this file, then
914	 * put the rnode on the front of the freelist so that it will
915	 * be reused before other rnodes which may have cached data or
916	 * metadata associated with them.
917	 */
918	mutex_enter(&rp4freelist_lock);
919	if (rp4freelist == NULL) {
920		rp->r_freef = rp;
921		rp->r_freeb = rp;
922		rp4freelist = rp;
923	} else {
924		rp->r_freef = rp4freelist;
925		rp->r_freeb = rp4freelist->r_freeb;
926		rp4freelist->r_freeb->r_freef = rp;
927		rp4freelist->r_freeb = rp;
928		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
929		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
930			rp4freelist = rp;
931	}
932	mutex_exit(&rp4freelist_lock);
933
934	rw_exit(&rp->r_hashq->r_lock);
935
936	if (xattr)
937		VN_RELE(xattr);
938}
939
940/*
941 * Remove an rnode from the free list.
942 *
943 * The caller must be holding rp4freelist_lock and the rnode
944 * must be on the freelist.
945 */
946static void
947rp4_rmfree(rnode4_t *rp)
948{
949
950	ASSERT(MUTEX_HELD(&rp4freelist_lock));
951	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
952
953	if (rp == rp4freelist) {
954		rp4freelist = rp->r_freef;
955		if (rp == rp4freelist)
956			rp4freelist = NULL;
957	}
958	rp->r_freeb->r_freef = rp->r_freef;
959	rp->r_freef->r_freeb = rp->r_freeb;
960
961	rp->r_freef = rp->r_freeb = NULL;
962}
963
964/*
965 * Put a rnode in the hash table.
966 *
967 * The caller must be holding the exclusive hash queue lock
968 */
969void
970rp4_addhash(rnode4_t *rp)
971{
972	mntinfo4_t *mi;
973
974	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
975	ASSERT(!(rp->r_flags & R4HASHED));
976
977#ifdef DEBUG
978	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
979#endif
980
981	rp->r_hashf = rp->r_hashq->r_hashf;
982	rp->r_hashq->r_hashf = rp;
983	rp->r_hashb = (rnode4_t *)rp->r_hashq;
984	rp->r_hashf->r_hashb = rp;
985
986	mutex_enter(&rp->r_statelock);
987	rp->r_flags |= R4HASHED;
988	mutex_exit(&rp->r_statelock);
989
990	mi = VTOMI4(RTOV4(rp));
991	mutex_enter(&mi->mi_rnodes_lock);
992	list_insert_tail(&mi->mi_rnodes, rp);
993	mutex_exit(&mi->mi_rnodes_lock);
994}
995
996/*
997 * Remove a rnode from the hash table.
998 *
999 * The caller must be holding the hash queue lock.
1000 */
1001void
1002rp4_rmhash_locked(rnode4_t *rp)
1003{
1004	mntinfo4_t *mi;
1005
1006	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1007	ASSERT(rp->r_flags & R4HASHED);
1008
1009	rp->r_hashb->r_hashf = rp->r_hashf;
1010	rp->r_hashf->r_hashb = rp->r_hashb;
1011
1012	mutex_enter(&rp->r_statelock);
1013	rp->r_flags &= ~R4HASHED;
1014	mutex_exit(&rp->r_statelock);
1015
1016	mi = VTOMI4(RTOV4(rp));
1017	mutex_enter(&mi->mi_rnodes_lock);
1018	if (list_link_active(&rp->r_mi_link))
1019		list_remove(&mi->mi_rnodes, rp);
1020	mutex_exit(&mi->mi_rnodes_lock);
1021}
1022
1023/*
1024 * Remove a rnode from the hash table.
1025 *
1026 * The caller must not be holding the hash queue lock.
1027 */
1028void
1029rp4_rmhash(rnode4_t *rp)
1030{
1031	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1032	rp4_rmhash_locked(rp);
1033	rw_exit(&rp->r_hashq->r_lock);
1034}
1035
1036/*
1037 * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1038 * Returns NULL if no match.  If an rnode is returned, the reference count
1039 * on the master vnode is incremented.
1040 *
1041 * The caller must be holding the hash queue lock, either shared or exclusive.
1042 */
1043rnode4_t *
1044r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1045{
1046	rnode4_t *rp;
1047	vnode_t *vp;
1048
1049	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1050
1051	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1052		vp = RTOV4(rp);
1053		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1054
1055			mutex_enter(&rp->r_statelock);
1056			if (rp->r_flags & R4RECOVERR) {
1057				mutex_exit(&rp->r_statelock);
1058				continue;
1059			}
1060			mutex_exit(&rp->r_statelock);
1061#ifdef DEBUG
1062			r4_dup_check(rp, vfsp);
1063#endif
1064			if (rp->r_freef != NULL) {
1065				mutex_enter(&rp4freelist_lock);
1066				/*
1067				 * If the rnode is on the freelist,
1068				 * then remove it and use that reference
1069				 * as the new reference.  Otherwise,
1070				 * need to increment the reference count.
1071				 */
1072				if (rp->r_freef != NULL) {
1073					rp4_rmfree(rp);
1074					mutex_exit(&rp4freelist_lock);
1075				} else {
1076					mutex_exit(&rp4freelist_lock);
1077					VN_HOLD(vp);
1078				}
1079			} else
1080				VN_HOLD(vp);
1081
1082			/*
1083			 * if root vnode, set v_flag to indicate that
1084			 */
1085			if (isrootfh(fh, rp)) {
1086				if (!(vp->v_flag & VROOT)) {
1087					mutex_enter(&vp->v_lock);
1088					vp->v_flag |= VROOT;
1089					mutex_exit(&vp->v_lock);
1090				}
1091			}
1092			return (rp);
1093		}
1094	}
1095	return (NULL);
1096}
1097
1098/*
1099 * Lookup an rnode by fhandle. Just a wrapper for r4find()
1100 * that assumes the caller hasn't already got the lock
1101 * on the hash bucket.
1102 */
1103rnode4_t *
1104r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1105{
1106	rnode4_t *rp;
1107	int index;
1108
1109	index = rtable4hash(fh);
1110	rw_enter(&rtable4[index].r_lock, RW_READER);
1111	rp = r4find(&rtable4[index], fh, vfsp);
1112	rw_exit(&rtable4[index].r_lock);
1113
1114	return (rp);
1115}
1116
1117/*
1118 * Return 1 if there is an active vnode belonging to this vfs in the
1119 * rtable4 cache.
1120 *
1121 * Several of these checks are done without holding the usual
1122 * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
1123 * etc. will redo the necessary checks before actually destroying
1124 * any rnodes.
1125 */
1126int
1127check_rtable4(struct vfs *vfsp)
1128{
1129	rnode4_t *rp;
1130	vnode_t *vp;
1131	mntinfo4_t *mi;
1132
1133	ASSERT(vfsp != NULL);
1134	mi = VFTOMI4(vfsp);
1135
1136	mutex_enter(&mi->mi_rnodes_lock);
1137	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1138	    rp = list_next(&mi->mi_rnodes, rp)) {
1139		vp = RTOV4(rp);
1140
1141		if (rp->r_freef == NULL ||
1142		    (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) ||
1143		    rp->r_count > 0) {
1144			mutex_exit(&mi->mi_rnodes_lock);
1145			return (1);
1146		}
1147	}
1148	mutex_exit(&mi->mi_rnodes_lock);
1149
1150	return (0);
1151}
1152
/*
 * Destroy inactive vnodes from the hash queues which
 * belong to this vfs. All of the vnodes should be inactive.
 * It is essential that we destroy all rnodes in case of
 * forced unmount as well as in normal unmount case.
 *
 * The mount's rnode list is drained one entry at a time.  Both
 * rp4freelist_lock and mi_rnodes_lock are held while an entry is taken
 * off the list and checked; they are dropped before the rnode is
 * unhashed and handed to rp4_addfree(), then retaken (freelist lock
 * first) for the next iteration.
 */

void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	ASSERT(vfsp != NULL);

	mi = VFTOMI4(vfsp);

	mutex_enter(&rp4freelist_lock);
	mutex_enter(&mi->mi_rnodes_lock);
	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
		/*
		 * If the rnode is no longer on the freelist it is not
		 * ours and it will be handled by some other thread, so
		 * skip it.
		 */
		if (rp->r_freef == NULL)
			continue;
		mutex_exit(&mi->mi_rnodes_lock);

		/* Still under rp4freelist_lock: pull it off the freelist. */
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		/* rp4_rmhash() takes the hash bucket lock itself. */
		rp4_rmhash(rp);

		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, cr);

		/* Retake both locks in the original order before looping. */
		mutex_enter(&rp4freelist_lock);
		mutex_enter(&mi->mi_rnodes_lock);
	}
	mutex_exit(&mi->mi_rnodes_lock);
	mutex_exit(&rp4freelist_lock);
}
1200
/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 *
 * The rnode must not hold a delegation (asserted).  Besides returning
 * the rnode to rnode4_cache, this decrements the rnode4_new counter,
 * invalidates and frees the associated vnode, and drops one hold on
 * the vnode's vfs.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

	/*
	 * Capture vp and vfsp up front: vp is still needed after rp has
	 * been handed back to the cache, and vfsp after vp has been freed.
	 */
	vp = RTOV4(rp);
	vfsp = vp->v_vfsp;

	uninit_rnode4(rp);
	atomic_dec_ulong((ulong_t *)&rnode4_new);
#ifdef DEBUG
	clstat4_debug.nrnode.value.ui64--;
#endif
	kmem_cache_free(rnode4_cache, rp);
	vn_invalid(vp);
	vn_free(vp);
	VFS_RELE(vfsp);
}
1226
1227/*
1228 * Invalidate the attributes on all rnodes forcing the next getattr
1229 * to go over the wire.  Used to flush stale uid and gid mappings.
1230 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1231 */
1232void
1233nfs4_rnode_invalidate(struct vfs *vfsp)
1234{
1235	int index;
1236	rnode4_t *rp;
1237	vnode_t *vp;
1238
1239	/*
1240	 * Walk the hash queues looking for rnodes.
1241	 */
1242	for (index = 0; index < rtable4size; index++) {
1243		rw_enter(&rtable4[index].r_lock, RW_READER);
1244		for (rp = rtable4[index].r_hashf;
1245		    rp != (rnode4_t *)(&rtable4[index]);
1246		    rp = rp->r_hashf) {
1247			vp = RTOV4(rp);
1248			if (vfsp != NULL && vp->v_vfsp != vfsp)
1249				continue;
1250
1251			if (!mutex_tryenter(&rp->r_statelock))
1252				continue;
1253
1254			/*
1255			 * Expire the attributes by resetting the change
1256			 * and attr timeout.
1257			 */
1258			rp->r_change = 0;
1259			PURGE_ATTRCACHE4_LOCKED(rp);
1260			mutex_exit(&rp->r_statelock);
1261		}
1262		rw_exit(&rtable4[index].r_lock);
1263	}
1264}
1265
/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 *
 * Works in two phases: first collect (and hold) the vnodes that need
 * flushing while the relevant list/bucket lock is held, then, with no
 * such lock held, start an asynchronous VOP_PUTPAGE() on each one and
 * release it.  The collection array is sized from a snapshot of
 * rnode4_new; if it fills up, collection simply stops and whatever was
 * gathered so far is flushed.
 */
void
r4flush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 */
	num = rnode4_new;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * If the vfs is known we can take a fast path by iterating over
	 * only the rnodes that belong to this vfs.  This is much faster
	 * than the traditional way of iterating over rtable4 (below) when
	 * there are a lot of rnodes that do not belong to our vfs.
	 */
	if (vfsp != NULL) {
		mntinfo4_t *mi = VFTOMI4(vfsp);

		mutex_enter(&mi->mi_rnodes_lock);
		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
		    rp = list_next(&mi->mi_rnodes, rp)) {
			vp = RTOV4(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			ASSERT(vp->v_vfsp == vfsp);
			if (nfs4_has_pages(vp) &&
			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/*
					 * The vplist is full because there
					 * are too many rnodes.  We are done
					 * for now.
					 */
					break;
				}
			}
		}
		mutex_exit(&mi->mi_rnodes_lock);

		goto done;
	}

	ASSERT(vfsp == NULL);

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if (nfs4_has_pages(vp) &&
			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/* Drop the bucket lock before leaving. */
					rw_exit(&rtable4[index].r_lock);
					/*
					 * The vplist is full because there
					 * are too many rnodes.  We are done
					 * for now.
					 */
					goto done;
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}

done:

	/*
	 * Flush and release all of the files on the list.
	 * The list is consumed from the last entry back to the first.
	 */
	while (cnt-- > 0) {
		vp = vplist[cnt];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
		VN_RELE(vp);
	}

	/*
	 * Free the space allocated to hold the list.  The size must be
	 * computed from the same snapshot (num) used for the allocation.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}
1396
1397int
1398nfs4_free_data_reclaim(rnode4_t *rp)
1399{
1400	char *contents;
1401	vnode_t *xattr;
1402	int size;
1403	vsecattr_t *vsp;
1404	int freed;
1405	bool_t rdc = FALSE;
1406
1407	/*
1408	 * Free any held caches which may
1409	 * be associated with this rnode.
1410	 */
1411	mutex_enter(&rp->r_statelock);
1412	if (rp->r_dir != NULL)
1413		rdc = TRUE;
1414	contents = rp->r_symlink.contents;
1415	size = rp->r_symlink.size;
1416	rp->r_symlink.contents = NULL;
1417	vsp = rp->r_secattr;
1418	rp->r_secattr = NULL;
1419	xattr = rp->r_xattr_dir;
1420	rp->r_xattr_dir = NULL;
1421	mutex_exit(&rp->r_statelock);
1422
1423	/*
1424	 * Free the access cache entries.
1425	 */
1426	freed = nfs4_access_purge_rp(rp);
1427
1428	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1429		return (freed);
1430
1431	/*
1432	 * Free the readdir cache entries, incompletely if we can't block.
1433	 */
1434	nfs4_purge_rddir_cache(RTOV4(rp));
1435
1436	/*
1437	 * Free the symbolic link cache.
1438	 */
1439	if (contents != NULL) {
1440
1441		kmem_free((void *)contents, size);
1442	}
1443
1444	/*
1445	 * Free any cached ACL.
1446	 */
1447	if (vsp != NULL)
1448		nfs4_acl_free_cache(vsp);
1449
1450	/*
1451	 * Release the xattr directory vnode
1452	 */
1453	if (xattr != NULL)
1454		VN_RELE(xattr);
1455
1456	return (1);
1457}
1458
1459static int
1460nfs4_active_data_reclaim(rnode4_t *rp)
1461{
1462	char *contents;
1463	vnode_t *xattr = NULL;
1464	int size;
1465	vsecattr_t *vsp;
1466	int freed;
1467	bool_t rdc = FALSE;
1468
1469	/*
1470	 * Free any held credentials and caches which
1471	 * may be associated with this rnode.
1472	 */
1473	if (!mutex_tryenter(&rp->r_statelock))
1474		return (0);
1475	contents = rp->r_symlink.contents;
1476	size = rp->r_symlink.size;
1477	rp->r_symlink.contents = NULL;
1478	vsp = rp->r_secattr;
1479	rp->r_secattr = NULL;
1480	if (rp->r_dir != NULL)
1481		rdc = TRUE;
1482	/*
1483	 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1484	 * on the same r_hashq queue. We are not mandated to free all caches.
1485	 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1486	 * rnode 'rp' is freed or put on the free list.
1487	 *
1488	 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1489	 * - it has no associated rnode4_t (its v_data is NULL),
1490	 * - it is preallocated statically and will never go away,
1491	 * so we cannot save anything by releasing it.
1492	 */
1493	if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1494	    VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1495		xattr = rp->r_xattr_dir;
1496		rp->r_xattr_dir = NULL;
1497	}
1498	mutex_exit(&rp->r_statelock);
1499
1500	/*
1501	 * Free the access cache entries.
1502	 */
1503	freed = nfs4_access_purge_rp(rp);
1504
1505	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1506		return (freed);
1507
1508	/*
1509	 * Free the symbolic link cache.
1510	 */
1511	if (contents != NULL) {
1512
1513		kmem_free((void *)contents, size);
1514	}
1515
1516	/*
1517	 * Free any cached ACL.
1518	 */
1519	if (vsp != NULL)
1520		nfs4_acl_free_cache(vsp);
1521
1522	nfs4_purge_rddir_cache(RTOV4(rp));
1523
1524	/*
1525	 * Release the xattr directory vnode
1526	 */
1527	if (xattr != NULL)
1528		VN_RELE(xattr);
1529
1530	return (1);
1531}
1532
1533static int
1534nfs4_free_reclaim(void)
1535{
1536	int freed;
1537	rnode4_t *rp;
1538
1539#ifdef DEBUG
1540	clstat4_debug.f_reclaim.value.ui64++;
1541#endif
1542	freed = 0;
1543	mutex_enter(&rp4freelist_lock);
1544	rp = rp4freelist;
1545	if (rp != NULL) {
1546		do {
1547			if (nfs4_free_data_reclaim(rp))
1548				freed = 1;
1549		} while ((rp = rp->r_freef) != rp4freelist);
1550	}
1551	mutex_exit(&rp4freelist_lock);
1552	return (freed);
1553}
1554
1555static int
1556nfs4_active_reclaim(void)
1557{
1558	int freed;
1559	int index;
1560	rnode4_t *rp;
1561
1562#ifdef DEBUG
1563	clstat4_debug.a_reclaim.value.ui64++;
1564#endif
1565	freed = 0;
1566	for (index = 0; index < rtable4size; index++) {
1567		rw_enter(&rtable4[index].r_lock, RW_READER);
1568		for (rp = rtable4[index].r_hashf;
1569		    rp != (rnode4_t *)(&rtable4[index]);
1570		    rp = rp->r_hashf) {
1571			if (nfs4_active_data_reclaim(rp))
1572				freed = 1;
1573		}
1574		rw_exit(&rtable4[index].r_lock);
1575	}
1576	return (freed);
1577}
1578
1579static int
1580nfs4_rnode_reclaim(void)
1581{
1582	int freed;
1583	rnode4_t *rp;
1584	vnode_t *vp;
1585
1586#ifdef DEBUG
1587	clstat4_debug.r_reclaim.value.ui64++;
1588#endif
1589	freed = 0;
1590	mutex_enter(&rp4freelist_lock);
1591	while ((rp = rp4freelist) != NULL) {
1592		rp4_rmfree(rp);
1593		mutex_exit(&rp4freelist_lock);
1594		if (rp->r_flags & R4HASHED) {
1595			vp = RTOV4(rp);
1596			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1597			mutex_enter(&vp->v_lock);
1598			if (vp->v_count > 1) {
1599				VN_RELE_LOCKED(vp);
1600				mutex_exit(&vp->v_lock);
1601				rw_exit(&rp->r_hashq->r_lock);
1602				mutex_enter(&rp4freelist_lock);
1603				continue;
1604			}
1605			mutex_exit(&vp->v_lock);
1606			rp4_rmhash_locked(rp);
1607			rw_exit(&rp->r_hashq->r_lock);
1608		}
1609		/*
1610		 * This call to rp_addfree will end up destroying the
1611		 * rnode, but in a safe way with the appropriate set
1612		 * of checks done.
1613		 */
1614		rp4_addfree(rp, CRED());
1615		mutex_enter(&rp4freelist_lock);
1616	}
1617	mutex_exit(&rp4freelist_lock);
1618	return (freed);
1619}
1620
/*
 * Reclaim callback registered with rnode4_cache.  Tries progressively
 * more aggressive steps and stops at the first one that reports having
 * freed something: cached data of freelist rnodes, then cached data of
 * hashed rnodes, and finally the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (nfs4_free_reclaim() == 0 && nfs4_active_reclaim() == 0)
		(void) nfs4_rnode_reclaim();
}
1636
1637/*
1638 * Returns the clientid4 to use for the given mntinfo4.  Note that the
1639 * clientid can change if the caller drops mi_recovlock.
1640 */
1641
1642clientid4
1643mi2clientid(mntinfo4_t *mi)
1644{
1645	nfs4_server_t	*sp;
1646	clientid4	clientid = 0;
1647
1648	/* this locks down sp if it is found */
1649	sp = find_nfs4_server(mi);
1650	if (sp != NULL) {
1651		clientid = sp->clientid;
1652		mutex_exit(&sp->s_lock);
1653		nfs4_server_rele(sp);
1654	}
1655	return (clientid);
1656}
1657
1658/*
1659 * Return the current lease time for the server associated with the given
1660 * file.  Note that the lease time could change immediately after this
1661 * call.
1662 */
1663
1664time_t
1665r2lease_time(rnode4_t *rp)
1666{
1667	nfs4_server_t	*sp;
1668	time_t		lease_time;
1669	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1670
1671	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1672
1673	/* this locks down sp if it is found */
1674	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1675
1676	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1677		if (sp != NULL) {
1678			mutex_exit(&sp->s_lock);
1679			nfs4_server_rele(sp);
1680		}
1681		nfs_rw_exit(&mi->mi_recovlock);
1682		return (1);		/* 1 second */
1683	}
1684
1685	ASSERT(sp != NULL);
1686
1687	lease_time = sp->s_lease_time;
1688
1689	mutex_exit(&sp->s_lock);
1690	nfs4_server_rele(sp);
1691	nfs_rw_exit(&mi->mi_recovlock);
1692
1693	return (lease_time);
1694}
1695
/*
 * Return a list with information about all the known open instances for
 * a filesystem. The caller must call r4releopenlist() when done with the
 * list.
 *
 * Each list entry describes one vnode that has at least one valid,
 * not-pending-close open stream.  The entry holds a reference on the
 * vnode and on every open stream recorded in its re_osp array.  As a
 * side effect, any delegation held by an rnode of this filesystem is
 * discarded (after noting in r_deleg_needs_recovery whether it must be
 * recovered).
 *
 * We are safe at looking at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list due to:
 *	-Looking at os_pending_close is safe since this routine is
 *	only called via recovery, and os_pending_close can only be set via
 *	a non-recovery operation (which are all blocked when recovery
 *	is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	could potentially switch os_valid to 0, are blocked (via
 *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	(which means we are the only recovery thread potentially acting
 *	on this open stream).
 */

nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
	nfs4_opinst_t *reopenlist, *rep;
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	int numosp;
	nfs4_open_stream_t *osp;
	int index;
	open_delegation_type4 dtype;
	int hold_vnode;

	reopenlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			/* Only rnodes belonging to this mount are of interest */
			vp = RTOV4(rp);
			if (vp->v_vfsp != vfsp)
				continue;
			hold_vnode = 0;

			mutex_enter(&rp->r_os_lock);

			/* Count the number of valid open_streams of the file */
			numosp = 0;
			for (osp = list_head(&rp->r_open_streams); osp != NULL;
			    osp = list_next(&rp->r_open_streams, osp)) {
				mutex_enter(&osp->os_sync_lock);
				if (osp->os_valid && !osp->os_pending_close)
					numosp++;
				mutex_exit(&osp->os_sync_lock);
			}

			/* Fill in the valid open streams per vp */
			if (numosp > 0) {
				int j;

				hold_vnode = 1;

				/*
				 * Add a new open instance to the list
				 * (pushed onto the head).
				 */
				rep = kmem_zalloc(sizeof (*reopenlist),
				    KM_SLEEP);
				rep->re_next = reopenlist;
				reopenlist = rep;

				rep->re_vp = vp;
				rep->re_osp = kmem_zalloc(
				    numosp * sizeof (*(rep->re_osp)),
				    KM_SLEEP);
				rep->re_numosp = numosp;

				/*
				 * Second pass over the open streams: take a
				 * reference on each qualifying one and
				 * record it in the array.
				 */
				j = 0;
				for (osp = list_head(&rp->r_open_streams);
				    osp != NULL;
				    osp = list_next(&rp->r_open_streams, osp)) {

					mutex_enter(&osp->os_sync_lock);
					if (osp->os_valid &&
					    !osp->os_pending_close) {
						osp->os_ref_count++;
						rep->re_osp[j] = osp;
						j++;
					}
					mutex_exit(&osp->os_sync_lock);
				}
				/*
				 * Assuming valid osp(s) stays valid between
				 * the time obtaining j and numosp.
				 */
				ASSERT(j == numosp);
			}

			mutex_exit(&rp->r_os_lock);
			/* do this here to keep v_lock > r_os_lock */
			if (hold_vnode)
				VN_HOLD(vp);
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
				/*
				 * If this rnode holds a delegation,
				 * but if there are no valid open streams,
				 * then just discard the delegation
				 * without doing delegreturn.
				 */
				if (numosp > 0)
					rp->r_deleg_needs_recovery =
					    rp->r_deleg_type;
			}
			/* Save the delegation type for use outside the lock */
			dtype = rp->r_deleg_type;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * If we have a delegation then get rid of it.
			 * We've set rp->r_deleg_needs_recovery so we have
			 * enough information to recover.
			 */
			if (dtype != OPEN_DELEGATE_NONE) {
				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (reopenlist);
}
1828
1829/*
1830 * Given a filesystem id, check to see if any rnodes
1831 * within this fsid reside in the rnode cache, other
1832 * than one we know about.
1833 *
1834 * Return 1 if an rnode is found, 0 otherwise
1835 */
1836int
1837r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1838{
1839	rnode4_t *rp;
1840	vnode_t *vp;
1841	vfs_t *vfsp = mi->mi_vfsp;
1842	fattr4_fsid *fsid;
1843	int index, found = 0;
1844
1845	for (index = 0; index < rtable4size; index++) {
1846		rw_enter(&rtable4[index].r_lock, RW_READER);
1847		for (rp = rtable4[index].r_hashf;
1848		    rp != (rnode4_t *)(&rtable4[index]);
1849		    rp = rp->r_hashf) {
1850
1851			vp = RTOV4(rp);
1852			if (vp->v_vfsp != vfsp)
1853				continue;
1854
1855			/*
1856			 * XXX there might be a case where a
1857			 * replicated fs may have the same fsid
1858			 * across two different servers. This
1859			 * check isn't good enough in that case
1860			 */
1861			fsid = &rp->r_srv_fsid;
1862			if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1863				found = 1;
1864				break;
1865			}
1866		}
1867		rw_exit(&rtable4[index].r_lock);
1868
1869		if (found)
1870			break;
1871	}
1872	return (found);
1873}
1874
1875/*
1876 * Release the list of open instance references.
1877 */
1878
1879void
1880r4releopenlist(nfs4_opinst_t *reopenp)
1881{
1882	nfs4_opinst_t *rep, *next;
1883	int i;
1884
1885	for (rep = reopenp; rep; rep = next) {
1886		next = rep->re_next;
1887
1888		for (i = 0; i < rep->re_numosp; i++)
1889			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1890
1891		VN_RELE(rep->re_vp);
1892		kmem_free(rep->re_osp,
1893		    rep->re_numosp * sizeof (*(rep->re_osp)));
1894
1895		kmem_free(rep, sizeof (*rep));
1896	}
1897}
1898
1899int
1900nfs4_rnode_init(void)
1901{
1902	ulong_t nrnode4_max;
1903	int i;
1904
1905	/*
1906	 * Compute the size of the rnode4 hash table
1907	 */
1908	if (nrnode <= 0)
1909		nrnode = ncsize;
1910	nrnode4_max =
1911	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1912	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1913		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1914		    "!setting nrnode to max value of %ld", nrnode4_max);
1915		nrnode = nrnode4_max;
1916	}
1917	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1918	rtable4mask = rtable4size - 1;
1919
1920	/*
1921	 * Allocate and initialize the hash buckets
1922	 */
1923	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1924	for (i = 0; i < rtable4size; i++) {
1925		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1926		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1927		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1928	}
1929
1930	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1931	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1932
1933	return (0);
1934}
1935
1936int
1937nfs4_rnode_fini(void)
1938{
1939	int i;
1940
1941	/*
1942	 * Deallocate the rnode hash queues
1943	 */
1944	kmem_cache_destroy(rnode4_cache);
1945
1946	for (i = 0; i < rtable4size; i++)
1947		rw_destroy(&rtable4[i].r_lock);
1948
1949	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1950
1951	return (0);
1952}
1953
1954/*
1955 * Return non-zero if the given filehandle refers to the root filehandle
1956 * for the given rnode.
1957 */
1958
1959static int
1960isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1961{
1962	int isroot;
1963
1964	isroot = 0;
1965	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1966		isroot = 1;
1967
1968	return (isroot);
1969}
1970
1971/*
1972 * The r4_stub_* routines assume that the rnode is newly activated, and
1973 * that the caller either holds the hash bucket r_lock for this rnode as
1974 * RW_WRITER, or holds r_statelock.
1975 */
1976static void
1977r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1978{
1979	vnode_t *vp = RTOV4(rp);
1980	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1981
1982	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1983
1984	rp->r_stub_type = type;
1985
1986	/*
1987	 * Safely switch this vnode to the trigger vnodeops.
1988	 *
1989	 * Currently, we don't ever switch a trigger vnode back to using
1990	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1991	 * a new v4 object is not a trigger, and it will already have the
1992	 * correct v4 vnodeops by default. So, no "else" case required here.
1993	 */
1994	if (type != NFS4_STUB_NONE)
1995		vn_setops(vp, nfs4_trigger_vnodeops);
1996}
1997
/*
 * Mark the rnode as a mirror-mount stub (NFS4_STUB_MIRRORMOUNT) via
 * r4_stub_set(); the locking requirements are those asserted there.
 */
void
r4_stub_mirrormount(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
}
2003
/*
 * Mark the rnode as a referral stub (NFS4_STUB_REFERRAL) via
 * r4_stub_set(); the locking requirements are those asserted there.
 * A DTrace probe carrying the rnode's vnode is fired first.
 */
void
r4_stub_referral(rnode4_t *rp)
{
	DTRACE_PROBE1(nfs4clnt__func__referral__moved,
	    vnode_t *, RTOV4(rp));
	r4_stub_set(rp, NFS4_STUB_REFERRAL);
}
2011
/*
 * Record that the rnode is not a stub (NFS4_STUB_NONE) via
 * r4_stub_set(); the locking requirements are those asserted there.
 */
void
r4_stub_none(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_NONE);
}
2017
2018#ifdef DEBUG
2019
2020/*
2021 * Look in the rnode table for other rnodes that have the same filehandle.
2022 * Assume the lock is held for the hash chain of checkrp
2023 */
2024
2025static void
2026r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
2027{
2028	rnode4_t *rp;
2029	vnode_t *tvp;
2030	nfs4_fhandle_t fh, fh2;
2031	int index;
2032
2033	if (!r4_check_for_dups)
2034		return;
2035
2036	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
2037
2038	sfh4_copyval(checkrp->r_fh, &fh);
2039
2040	for (index = 0; index < rtable4size; index++) {
2041
2042		if (&rtable4[index] != checkrp->r_hashq)
2043			rw_enter(&rtable4[index].r_lock, RW_READER);
2044
2045		for (rp = rtable4[index].r_hashf;
2046		    rp != (rnode4_t *)(&rtable4[index]);
2047		    rp = rp->r_hashf) {
2048
2049			if (rp == checkrp)
2050				continue;
2051
2052			tvp = RTOV4(rp);
2053			if (tvp->v_vfsp != vfsp)
2054				continue;
2055
2056			sfh4_copyval(rp->r_fh, &fh2);
2057			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2058				cmn_err(CE_PANIC, "rnodes with same fs, fh "
2059				    "(%p, %p)", (void *)checkrp, (void *)rp);
2060			}
2061		}
2062
2063		if (&rtable4[index] != checkrp->r_hashq)
2064			rw_exit(&rtable4[index].r_lock);
2065	}
2066}
2067
2068#endif /* DEBUG */
2069