/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
257c478bd9Sstevel@tonic-gate 
/*
 * NFS Version 4 state recovery code.
 */
297c478bd9Sstevel@tonic-gate 
307c478bd9Sstevel@tonic-gate #include <nfs/nfs4_clnt.h>
317c478bd9Sstevel@tonic-gate #include <nfs/nfs4.h>
327c478bd9Sstevel@tonic-gate #include <nfs/rnode4.h>
337c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
347c478bd9Sstevel@tonic-gate #include <sys/cred.h>
357c478bd9Sstevel@tonic-gate #include <sys/systm.h>
367c478bd9Sstevel@tonic-gate #include <sys/flock.h>
377c478bd9Sstevel@tonic-gate #include <sys/dnlc.h>
387c478bd9Sstevel@tonic-gate #include <sys/ddi.h>
397c478bd9Sstevel@tonic-gate #include <sys/disp.h>
407c478bd9Sstevel@tonic-gate #include <sys/list.h>
417c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
422f172c55SRobert Thurlow #include <sys/mount.h>
432f172c55SRobert Thurlow #include <sys/door.h>
442f172c55SRobert Thurlow #include <nfs/nfssys.h>
452f172c55SRobert Thurlow #include <nfs/nfsid_map.h>
462f172c55SRobert Thurlow #include <nfs/nfs4_idmap_impl.h>
477c478bd9Sstevel@tonic-gate 
487c478bd9Sstevel@tonic-gate extern r4hashq_t *rtable4;
497c478bd9Sstevel@tonic-gate 
507c478bd9Sstevel@tonic-gate /*
517c478bd9Sstevel@tonic-gate  * Information that describes what needs to be done for recovery.  It is
527c478bd9Sstevel@tonic-gate  * passed to a client recovery thread as well as passed to various recovery
537c478bd9Sstevel@tonic-gate  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
547c478bd9Sstevel@tonic-gate  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
557c478bd9Sstevel@tonic-gate  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
567c478bd9Sstevel@tonic-gate  * lock or open/close request, and it holds reference counts for the
577c478bd9Sstevel@tonic-gate  * various objects (vnode, etc.).  The recovery thread also uses flags set
587c478bd9Sstevel@tonic-gate  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
597c478bd9Sstevel@tonic-gate  * to save the error that originally triggered the recovery event -- will
607c478bd9Sstevel@tonic-gate  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
617c478bd9Sstevel@tonic-gate  * contains information about the request that got NFS4ERR_BAD_SEQID, and
627c478bd9Sstevel@tonic-gate  * it holds reference count for the various objects (vnode, open owner,
637c478bd9Sstevel@tonic-gate  * open stream, lock owner).
647c478bd9Sstevel@tonic-gate  */
657c478bd9Sstevel@tonic-gate 
667c478bd9Sstevel@tonic-gate typedef struct {
677c478bd9Sstevel@tonic-gate 	mntinfo4_t *rc_mi;
687c478bd9Sstevel@tonic-gate 	vnode_t *rc_vp1;
697c478bd9Sstevel@tonic-gate 	vnode_t *rc_vp2;
707c478bd9Sstevel@tonic-gate 	nfs4_recov_t rc_action;
717c478bd9Sstevel@tonic-gate 	stateid4 rc_stateid;
727c478bd9Sstevel@tonic-gate 	bool_t rc_srv_reboot;		/* server has rebooted */
737c478bd9Sstevel@tonic-gate 	nfs4_lost_rqst_t *rc_lost_rqst;
747c478bd9Sstevel@tonic-gate 	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
757c478bd9Sstevel@tonic-gate 	int rc_error;
767c478bd9Sstevel@tonic-gate 	nfs4_bseqid_entry_t *rc_bseqid_rqst;
772f172c55SRobert Thurlow 	vnode_t *rc_moved_vp;
782f172c55SRobert Thurlow 	char *rc_moved_nm;
797c478bd9Sstevel@tonic-gate } recov_info_t;
807c478bd9Sstevel@tonic-gate 
/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES returned because client
 *		attempted to cross server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger console printf whenever client attempts
 *		to cross server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif	/* DEBUG */
1447c478bd9Sstevel@tonic-gate 
1452f172c55SRobert Thurlow extern zone_key_t	nfs4clnt_zone_key;
1462f172c55SRobert Thurlow 
1477c478bd9Sstevel@tonic-gate /* forward references, in alphabetic order */
1487c478bd9Sstevel@tonic-gate static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
1497c478bd9Sstevel@tonic-gate 	nfs4_error_t *);
1507c478bd9Sstevel@tonic-gate static void errs_to_action(recov_info_t *,
1517c478bd9Sstevel@tonic-gate 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
1527c478bd9Sstevel@tonic-gate 	nfs_opnum4, nfs4_bseqid_entry_t *);
1537c478bd9Sstevel@tonic-gate static void flush_reinstate(nfs4_lost_rqst_t *);
1547c478bd9Sstevel@tonic-gate static void free_milist(mntinfo4_t **, int);
1557c478bd9Sstevel@tonic-gate static mntinfo4_t **make_milist(nfs4_server_t *, int *);
1567c478bd9Sstevel@tonic-gate static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
1577c478bd9Sstevel@tonic-gate 	nfs4_recov_state_t *, int, char *);
1587c478bd9Sstevel@tonic-gate static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
1597c478bd9Sstevel@tonic-gate static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
1607c478bd9Sstevel@tonic-gate static void nfs4_recov_thread(recov_info_t *);
1617c478bd9Sstevel@tonic-gate static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
1627c478bd9Sstevel@tonic-gate static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
1637c478bd9Sstevel@tonic-gate static cred_t *pid_to_cr(pid_t);
1647c478bd9Sstevel@tonic-gate static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
1657c478bd9Sstevel@tonic-gate static void recov_bad_seqid(recov_info_t *);
1667c478bd9Sstevel@tonic-gate static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
1677c478bd9Sstevel@tonic-gate static void recov_clientid(recov_info_t *, nfs4_server_t *);
1687c478bd9Sstevel@tonic-gate static void recov_done(mntinfo4_t *, recov_info_t *);
1697c478bd9Sstevel@tonic-gate static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
1707c478bd9Sstevel@tonic-gate static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
1717c478bd9Sstevel@tonic-gate static void recov_openfiles(recov_info_t *, nfs4_server_t *);
1727c478bd9Sstevel@tonic-gate static void recov_stale(mntinfo4_t *, vnode_t *);
1737c478bd9Sstevel@tonic-gate static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
1747c478bd9Sstevel@tonic-gate static void recov_throttle(recov_info_t *, vnode_t *);
175*faf39f17SMarcel Telka static void relock_skip_pid(vnode_t *, locklist_t *, pid_t);
1767c478bd9Sstevel@tonic-gate static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
1777c478bd9Sstevel@tonic-gate static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
1787c478bd9Sstevel@tonic-gate 	nfs4_server_t *);
1797c478bd9Sstevel@tonic-gate static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
1807c478bd9Sstevel@tonic-gate static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
1812f172c55SRobert Thurlow 	nfs4_server_t *, vnode_t *, char *);
1827c478bd9Sstevel@tonic-gate static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
1837c478bd9Sstevel@tonic-gate 	vnode_t *);
1847c478bd9Sstevel@tonic-gate static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
1857c478bd9Sstevel@tonic-gate 
1867c478bd9Sstevel@tonic-gate /*
1877c478bd9Sstevel@tonic-gate  * Return non-zero if the given errno, status, and rpc status codes
1887c478bd9Sstevel@tonic-gate  * in the nfs4_error_t indicate that client recovery is needed.
1897c478bd9Sstevel@tonic-gate  * "stateful" indicates whether the call that got the error establishes or
1907c478bd9Sstevel@tonic-gate  * removes state on the server (open, close, lock, unlock, delegreturn).
1917c478bd9Sstevel@tonic-gate  */
1927c478bd9Sstevel@tonic-gate 
1937c478bd9Sstevel@tonic-gate int
nfs4_needs_recovery(nfs4_error_t * ep,bool_t stateful,vfs_t * vfsp)1947c478bd9Sstevel@tonic-gate nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
1957c478bd9Sstevel@tonic-gate {
1967c478bd9Sstevel@tonic-gate 	int recov = 0;
1977c478bd9Sstevel@tonic-gate 	mntinfo4_t *mi;
1987c478bd9Sstevel@tonic-gate 
1997c478bd9Sstevel@tonic-gate 	/*
2007c478bd9Sstevel@tonic-gate 	 * Try failover if the error values justify it and if
2017c478bd9Sstevel@tonic-gate 	 * it's a failover mount.  Don't try if the mount is in
2027c478bd9Sstevel@tonic-gate 	 * progress, failures are handled explicitly by nfs4rootvp.
2037c478bd9Sstevel@tonic-gate 	 */
2047c478bd9Sstevel@tonic-gate 	if (nfs4_try_failover(ep)) {
2057c478bd9Sstevel@tonic-gate 		mi = VFTOMI4(vfsp);
2067c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
2077c478bd9Sstevel@tonic-gate 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
2087c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
2097c478bd9Sstevel@tonic-gate 		if (recov)
2107c478bd9Sstevel@tonic-gate 			return (recov);
2117c478bd9Sstevel@tonic-gate 	}
2127c478bd9Sstevel@tonic-gate 
2137c478bd9Sstevel@tonic-gate 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
2147c478bd9Sstevel@tonic-gate 		/*
2157c478bd9Sstevel@tonic-gate 		 * The server may have gotten the request, so for stateful
2167c478bd9Sstevel@tonic-gate 		 * ops we need to resynchronize and possibly back out the
2177c478bd9Sstevel@tonic-gate 		 * op.
2187c478bd9Sstevel@tonic-gate 		 */
2197c478bd9Sstevel@tonic-gate 		return (stateful);
2207c478bd9Sstevel@tonic-gate 	}
2217c478bd9Sstevel@tonic-gate 	if (ep->error != 0)
2227c478bd9Sstevel@tonic-gate 		return (0);
2237c478bd9Sstevel@tonic-gate 
2247c478bd9Sstevel@tonic-gate 	/* stat values are listed alphabetically */
2257c478bd9Sstevel@tonic-gate 	/*
2267c478bd9Sstevel@tonic-gate 	 * There are two lists here: the errors for which we have code, and
2277c478bd9Sstevel@tonic-gate 	 * the errors for which we plan to have code before FCS.  For the
2287c478bd9Sstevel@tonic-gate 	 * second list, print a warning message but don't attempt recovery.
2297c478bd9Sstevel@tonic-gate 	 */
2307c478bd9Sstevel@tonic-gate 	switch (ep->stat) {
2317c478bd9Sstevel@tonic-gate 	case NFS4ERR_BADHANDLE:
2327c478bd9Sstevel@tonic-gate 	case NFS4ERR_BAD_SEQID:
2337c478bd9Sstevel@tonic-gate 	case NFS4ERR_BAD_STATEID:
2347c478bd9Sstevel@tonic-gate 	case NFS4ERR_DELAY:
2357c478bd9Sstevel@tonic-gate 	case NFS4ERR_EXPIRED:
2367c478bd9Sstevel@tonic-gate 	case NFS4ERR_FHEXPIRED:
2377c478bd9Sstevel@tonic-gate 	case NFS4ERR_GRACE:
2387c478bd9Sstevel@tonic-gate 	case NFS4ERR_OLD_STATEID:
2397c478bd9Sstevel@tonic-gate 	case NFS4ERR_RESOURCE:
2407c478bd9Sstevel@tonic-gate 	case NFS4ERR_STALE_CLIENTID:
2417c478bd9Sstevel@tonic-gate 	case NFS4ERR_STALE_STATEID:
2427c478bd9Sstevel@tonic-gate 	case NFS4ERR_WRONGSEC:
2437c478bd9Sstevel@tonic-gate 	case NFS4ERR_STALE:
2447c478bd9Sstevel@tonic-gate 		recov = 1;
2457c478bd9Sstevel@tonic-gate 		break;
2467c478bd9Sstevel@tonic-gate #ifdef DEBUG
2477c478bd9Sstevel@tonic-gate 	case NFS4ERR_LEASE_MOVED:
2487c478bd9Sstevel@tonic-gate 	case NFS4ERR_MOVED:
2497c478bd9Sstevel@tonic-gate 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
2507c478bd9Sstevel@tonic-gate 		    CE_WARN, "!Can't yet recover from NFS status %d",
251b9238976Sth 		    ep->stat);
2527c478bd9Sstevel@tonic-gate 		break;
2537c478bd9Sstevel@tonic-gate #endif
2547c478bd9Sstevel@tonic-gate 	}
2557c478bd9Sstevel@tonic-gate 
2567c478bd9Sstevel@tonic-gate 	return (recov);
2577c478bd9Sstevel@tonic-gate }
2587c478bd9Sstevel@tonic-gate 
2597c478bd9Sstevel@tonic-gate /*
2607c478bd9Sstevel@tonic-gate  * Some operations such as DELEGRETURN want to avoid invoking
2617c478bd9Sstevel@tonic-gate  * recovery actions that will only mark the file dead.  If
2627c478bd9Sstevel@tonic-gate  * better handlers are invoked for any of these errors, this
2637c478bd9Sstevel@tonic-gate  * routine should be modified.
2647c478bd9Sstevel@tonic-gate  */
2657c478bd9Sstevel@tonic-gate int
nfs4_recov_marks_dead(nfsstat4 status)2667c478bd9Sstevel@tonic-gate nfs4_recov_marks_dead(nfsstat4 status)
2677c478bd9Sstevel@tonic-gate {
2687c478bd9Sstevel@tonic-gate 	if (status == NFS4ERR_BAD_SEQID ||
2697c478bd9Sstevel@tonic-gate 	    status == NFS4ERR_EXPIRED ||
2707c478bd9Sstevel@tonic-gate 	    status == NFS4ERR_BAD_STATEID ||
2717c478bd9Sstevel@tonic-gate 	    status == NFS4ERR_OLD_STATEID)
2727c478bd9Sstevel@tonic-gate 		return (1);
2737c478bd9Sstevel@tonic-gate 	return (0);
2747c478bd9Sstevel@tonic-gate }
2757c478bd9Sstevel@tonic-gate 
2767c478bd9Sstevel@tonic-gate /*
2777c478bd9Sstevel@tonic-gate  * Transfer the state recovery information in recovp to mi's resend queue,
2787c478bd9Sstevel@tonic-gate  * and mark mi as having a lost state request.
2797c478bd9Sstevel@tonic-gate  */
2807c478bd9Sstevel@tonic-gate static void
nfs4_enqueue_lost_rqst(recov_info_t * recovp,mntinfo4_t * mi)2817c478bd9Sstevel@tonic-gate nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
2827c478bd9Sstevel@tonic-gate {
2837c478bd9Sstevel@tonic-gate 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
2847c478bd9Sstevel@tonic-gate 
2857c478bd9Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
2867c478bd9Sstevel@tonic-gate 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2877c478bd9Sstevel@tonic-gate 
2887c478bd9Sstevel@tonic-gate 	ASSERT(lrp != NULL && lrp->lr_op != 0);
2897c478bd9Sstevel@tonic-gate 
2907c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
291b9238976Sth 	    "nfs4_enqueue_lost_rqst %p, op %d",
292b9238976Sth 	    (void *)lrp, lrp->lr_op));
2937c478bd9Sstevel@tonic-gate 
2947c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
2957c478bd9Sstevel@tonic-gate 	mi->mi_recovflags |= MI4R_LOST_STATE;
2967c478bd9Sstevel@tonic-gate 	if (lrp->lr_putfirst)
2977c478bd9Sstevel@tonic-gate 		list_insert_head(&mi->mi_lost_state, lrp);
2987c478bd9Sstevel@tonic-gate 	else
2997c478bd9Sstevel@tonic-gate 		list_insert_tail(&mi->mi_lost_state, lrp);
3007c478bd9Sstevel@tonic-gate 	recovp->rc_lost_rqst = NULL;
3017c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
3027c478bd9Sstevel@tonic-gate 
3037c478bd9Sstevel@tonic-gate 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
304b9238976Sth 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3057c478bd9Sstevel@tonic-gate }
3067c478bd9Sstevel@tonic-gate 
3077c478bd9Sstevel@tonic-gate /*
3087c478bd9Sstevel@tonic-gate  * Transfer the bad seqid recovery information in recovp to mi's
3097c478bd9Sstevel@tonic-gate  * bad seqid queue, and mark mi as having a bad seqid request.
3107c478bd9Sstevel@tonic-gate  */
3117c478bd9Sstevel@tonic-gate void
enqueue_bseqid_rqst(recov_info_t * recovp,mntinfo4_t * mi)3127c478bd9Sstevel@tonic-gate enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
3137c478bd9Sstevel@tonic-gate {
3147c478bd9Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3157c478bd9Sstevel@tonic-gate 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3167c478bd9Sstevel@tonic-gate 	ASSERT(recovp->rc_bseqid_rqst != NULL);
3177c478bd9Sstevel@tonic-gate 
3187c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
3197c478bd9Sstevel@tonic-gate 	mi->mi_recovflags |= MI4R_BAD_SEQID;
3207c478bd9Sstevel@tonic-gate 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
3217c478bd9Sstevel@tonic-gate 	recovp->rc_bseqid_rqst = NULL;
3227c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
3237c478bd9Sstevel@tonic-gate }
3247c478bd9Sstevel@tonic-gate 
3257c478bd9Sstevel@tonic-gate /*
3267c478bd9Sstevel@tonic-gate  * Initiate recovery.
3277c478bd9Sstevel@tonic-gate  *
3287c478bd9Sstevel@tonic-gate  * The nfs4_error_t contains the return codes that triggered a recovery
3297c478bd9Sstevel@tonic-gate  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
3307c478bd9Sstevel@tonic-gate  * being operated on.  vp1 and vp2 may be NULL.
3317c478bd9Sstevel@tonic-gate  *
3327c478bd9Sstevel@tonic-gate  * Multiple calls are okay.  If recovery is already underway, the call
3337c478bd9Sstevel@tonic-gate  * updates the information about what state needs recovery but does not
3347c478bd9Sstevel@tonic-gate  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
3357c478bd9Sstevel@tonic-gate  * for proper synchronization with any recovery thread.
3367c478bd9Sstevel@tonic-gate  *
3377c478bd9Sstevel@tonic-gate  * This will return TRUE if recovery was aborted, and FALSE otherwise.
3387c478bd9Sstevel@tonic-gate  */
3397c478bd9Sstevel@tonic-gate bool_t
nfs4_start_recovery(nfs4_error_t * ep,mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,stateid4 * sid,nfs4_lost_rqst_t * lost_rqstp,nfs_opnum4 op,nfs4_bseqid_entry_t * bsep,vnode_t * moved_vp,char * moved_nm)3407c478bd9Sstevel@tonic-gate nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
3417c478bd9Sstevel@tonic-gate     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
3422f172c55SRobert Thurlow     nfs4_bseqid_entry_t *bsep, vnode_t *moved_vp, char *moved_nm)
3437c478bd9Sstevel@tonic-gate {
3447c478bd9Sstevel@tonic-gate 	recov_info_t *recovp;
3457c478bd9Sstevel@tonic-gate 	nfs4_server_t *sp;
3467c478bd9Sstevel@tonic-gate 	bool_t abort = FALSE;
3477c478bd9Sstevel@tonic-gate 	bool_t gone = FALSE;
3487c478bd9Sstevel@tonic-gate 
349108322fbScarlsonj 	ASSERT(nfs_zone() == mi->mi_zone);
3507c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
3517c478bd9Sstevel@tonic-gate 	/*
3527c478bd9Sstevel@tonic-gate 	 * If there is lost state, we need to kick off recovery even if the
3537c478bd9Sstevel@tonic-gate 	 * filesystem has been unmounted or the zone is shutting down.
3547c478bd9Sstevel@tonic-gate 	 */
3557c478bd9Sstevel@tonic-gate 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
3567c478bd9Sstevel@tonic-gate 	if (gone) {
3577c478bd9Sstevel@tonic-gate 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
3587c478bd9Sstevel@tonic-gate 		if (ep->error == EIO && lost_rqstp == NULL) {
3597c478bd9Sstevel@tonic-gate 			/* failed due to forced unmount, no new lost state */
3607c478bd9Sstevel@tonic-gate 			abort = TRUE;
3617c478bd9Sstevel@tonic-gate 		}
3627c478bd9Sstevel@tonic-gate 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
3637c478bd9Sstevel@tonic-gate 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
3647c478bd9Sstevel@tonic-gate 			/* some other failure, no existing lost state */
3657c478bd9Sstevel@tonic-gate 			abort = TRUE;
3667c478bd9Sstevel@tonic-gate 		}
3677c478bd9Sstevel@tonic-gate 		if (abort) {
3687c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
3697c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
370b9238976Sth 			    "nfs4_start_recovery: fs unmounted"));
3717c478bd9Sstevel@tonic-gate 			return (TRUE);
3727c478bd9Sstevel@tonic-gate 		}
3737c478bd9Sstevel@tonic-gate 	}
3747c478bd9Sstevel@tonic-gate 	mi->mi_in_recovery++;
3757c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
3767c478bd9Sstevel@tonic-gate 
3777c478bd9Sstevel@tonic-gate 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
3787c478bd9Sstevel@tonic-gate 	recovp->rc_orig_errors = *ep;
3797c478bd9Sstevel@tonic-gate 	sp = find_nfs4_server(mi);
380b9238976Sth 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
3817c478bd9Sstevel@tonic-gate 	if (sp != NULL)
3827c478bd9Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
3832f172c55SRobert Thurlow 	start_recovery(recovp, mi, vp1, vp2, sp, moved_vp, moved_nm);
3847c478bd9Sstevel@tonic-gate 	if (sp != NULL)
3857c478bd9Sstevel@tonic-gate 		nfs4_server_rele(sp);
3867c478bd9Sstevel@tonic-gate 	return (FALSE);
3877c478bd9Sstevel@tonic-gate }
3887c478bd9Sstevel@tonic-gate 
3897c478bd9Sstevel@tonic-gate /*
3907c478bd9Sstevel@tonic-gate  * Internal version of nfs4_start_recovery.  The difference is that the
3917c478bd9Sstevel@tonic-gate  * caller specifies the recovery action, rather than the errors leading to
3927c478bd9Sstevel@tonic-gate  * recovery.
3937c478bd9Sstevel@tonic-gate  */
3947c478bd9Sstevel@tonic-gate static void
start_recovery_action(nfs4_recov_t what,bool_t reboot,mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2)3957c478bd9Sstevel@tonic-gate start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
396b9238976Sth     vnode_t *vp1, vnode_t *vp2)
3977c478bd9Sstevel@tonic-gate {
3987c478bd9Sstevel@tonic-gate 	recov_info_t *recovp;
3997c478bd9Sstevel@tonic-gate 
400108322fbScarlsonj 	ASSERT(nfs_zone() == mi->mi_zone);
4017c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
4027c478bd9Sstevel@tonic-gate 	mi->mi_in_recovery++;
4037c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
4047c478bd9Sstevel@tonic-gate 
4057c478bd9Sstevel@tonic-gate 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
4067c478bd9Sstevel@tonic-gate 	recovp->rc_action = what;
4077c478bd9Sstevel@tonic-gate 	recovp->rc_srv_reboot = reboot;
4087c478bd9Sstevel@tonic-gate 	recovp->rc_error = EIO;
4092f172c55SRobert Thurlow 	start_recovery(recovp, mi, vp1, vp2, NULL, NULL, NULL);
4107c478bd9Sstevel@tonic-gate }
4117c478bd9Sstevel@tonic-gate 
4127c478bd9Sstevel@tonic-gate static void
start_recovery(recov_info_t * recovp,mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_server_t * sp,vnode_t * moved_vp,char * moved_nm)4137c478bd9Sstevel@tonic-gate start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
4142f172c55SRobert Thurlow     vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp,
4152f172c55SRobert Thurlow     vnode_t *moved_vp, char *moved_nm)
4167c478bd9Sstevel@tonic-gate {
4177c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
418b9238976Sth 	    "start_recovery: mi %p, what %s", (void*)mi,
419b9238976Sth 	    nfs4_recov_action_to_str(recovp->rc_action)));
4207c478bd9Sstevel@tonic-gate 
4217c478bd9Sstevel@tonic-gate 	/*
4227c478bd9Sstevel@tonic-gate 	 * Bump the reference on the vfs so that we can pass it to the
4237c478bd9Sstevel@tonic-gate 	 * recovery thread.
4247c478bd9Sstevel@tonic-gate 	 */
4257c478bd9Sstevel@tonic-gate 	VFS_HOLD(mi->mi_vfsp);
42650a83466Sjwahlig 	MI4_HOLD(mi);
4277c478bd9Sstevel@tonic-gate again:
4287c478bd9Sstevel@tonic-gate 	switch (recovp->rc_action) {
4297c478bd9Sstevel@tonic-gate 	case NR_FAILOVER:
4307c478bd9Sstevel@tonic-gate 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4317c478bd9Sstevel@tonic-gate 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4327c478bd9Sstevel@tonic-gate 		if (mi->mi_servers->sv_next == NULL)
4337c478bd9Sstevel@tonic-gate 			goto out_no_thread;
4347c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
4357c478bd9Sstevel@tonic-gate 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
4367c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
4377c478bd9Sstevel@tonic-gate 
4387c478bd9Sstevel@tonic-gate 		if (recovp->rc_lost_rqst != NULL)
4397c478bd9Sstevel@tonic-gate 			nfs4_enqueue_lost_rqst(recovp, mi);
4407c478bd9Sstevel@tonic-gate 		break;
4417c478bd9Sstevel@tonic-gate 
4427c478bd9Sstevel@tonic-gate 	case NR_CLIENTID:
4437c478bd9Sstevel@tonic-gate 		/*
4447c478bd9Sstevel@tonic-gate 		 * If the filesystem has been unmounted, punt.
4457c478bd9Sstevel@tonic-gate 		 */
4467c478bd9Sstevel@tonic-gate 		if (sp == NULL)
4477c478bd9Sstevel@tonic-gate 			goto out_no_thread;
4487c478bd9Sstevel@tonic-gate 
4497c478bd9Sstevel@tonic-gate 		/*
4507c478bd9Sstevel@tonic-gate 		 * If nobody else is working on the clientid, mark the
4517c478bd9Sstevel@tonic-gate 		 * clientid as being no longer set.  Then mark the specific
4527c478bd9Sstevel@tonic-gate 		 * filesystem being worked on.
4537c478bd9Sstevel@tonic-gate 		 */
4547c478bd9Sstevel@tonic-gate 		if (!nfs4_server_in_recovery(sp)) {
4557c478bd9Sstevel@tonic-gate 			mutex_enter(&sp->s_lock);
4567c478bd9Sstevel@tonic-gate 			sp->s_flags &= ~N4S_CLIENTID_SET;
4577c478bd9Sstevel@tonic-gate 			mutex_exit(&sp->s_lock);
4587c478bd9Sstevel@tonic-gate 		}
4597c478bd9Sstevel@tonic-gate 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4607c478bd9Sstevel@tonic-gate 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4617c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
4627c478bd9Sstevel@tonic-gate 		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
4637c478bd9Sstevel@tonic-gate 		if (recovp->rc_srv_reboot)
4647c478bd9Sstevel@tonic-gate 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
4657c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
4667c478bd9Sstevel@tonic-gate 		break;
4677c478bd9Sstevel@tonic-gate 
4687c478bd9Sstevel@tonic-gate 	case NR_OPENFILES:
4697c478bd9Sstevel@tonic-gate 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4707c478bd9Sstevel@tonic-gate 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4717c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
4727c478bd9Sstevel@tonic-gate 		mi->mi_recovflags |= MI4R_REOPEN_FILES;
4737c478bd9Sstevel@tonic-gate 		if (recovp->rc_srv_reboot)
4747c478bd9Sstevel@tonic-gate 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
4757c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
4767c478bd9Sstevel@tonic-gate 		break;
4777c478bd9Sstevel@tonic-gate 
4787c478bd9Sstevel@tonic-gate 	case NR_WRONGSEC:
4797c478bd9Sstevel@tonic-gate 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4807c478bd9Sstevel@tonic-gate 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4817c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
4827c478bd9Sstevel@tonic-gate 		mi->mi_recovflags |= MI4R_NEED_SECINFO;
4837c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
4847c478bd9Sstevel@tonic-gate 		break;
4857c478bd9Sstevel@tonic-gate 
4867c478bd9Sstevel@tonic-gate 	case NR_EXPIRED:
4877c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
4887c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
4897c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
4907c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
4917c478bd9Sstevel@tonic-gate 		goto out_no_thread;	/* no further recovery possible */
4927c478bd9Sstevel@tonic-gate 
4937c478bd9Sstevel@tonic-gate 	case NR_BAD_STATEID:
4947c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
4957c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
4967c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
4977c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
4987c478bd9Sstevel@tonic-gate 		goto out_no_thread;	/* no further recovery possible */
4997c478bd9Sstevel@tonic-gate 
5007c478bd9Sstevel@tonic-gate 	case NR_FHEXPIRED:
5017c478bd9Sstevel@tonic-gate 	case NR_BADHANDLE:
5027c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
5037c478bd9Sstevel@tonic-gate 			recov_throttle(recovp, vp1);
5047c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
5057c478bd9Sstevel@tonic-gate 			recov_throttle(recovp, vp2);
5067c478bd9Sstevel@tonic-gate 		/*
5077c478bd9Sstevel@tonic-gate 		 * Recover the filehandle now, rather than using a
5087c478bd9Sstevel@tonic-gate 		 * separate thread.  We can do this because filehandle
5097c478bd9Sstevel@tonic-gate 		 * recovery is independent of any other state, and because
5107c478bd9Sstevel@tonic-gate 		 * we know that we are not competing with the recovery
5117c478bd9Sstevel@tonic-gate 		 * thread at this time.  recov_filehandle will deal with
5127c478bd9Sstevel@tonic-gate 		 * threads that are competing to recover this filehandle.
5137c478bd9Sstevel@tonic-gate 		 */
5147c478bd9Sstevel@tonic-gate 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
5157c478bd9Sstevel@tonic-gate 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
5167c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
5177c478bd9Sstevel@tonic-gate 			recov_filehandle(recovp->rc_action, mi, vp1);
5187c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
5197c478bd9Sstevel@tonic-gate 			recov_filehandle(recovp->rc_action, mi, vp2);
5207c478bd9Sstevel@tonic-gate 		goto out_no_thread;	/* no further recovery needed */
5217c478bd9Sstevel@tonic-gate 
5227c478bd9Sstevel@tonic-gate 	case NR_STALE:
5237c478bd9Sstevel@tonic-gate 		/*
5247c478bd9Sstevel@tonic-gate 		 * NFS4ERR_STALE handling
5257c478bd9Sstevel@tonic-gate 		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
5267c478bd9Sstevel@tonic-gate 		 * indicate that we can and should failover.
5277c478bd9Sstevel@tonic-gate 		 */
5287c478bd9Sstevel@tonic-gate 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
5297c478bd9Sstevel@tonic-gate 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
5307c478bd9Sstevel@tonic-gate 
5317c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
5327c478bd9Sstevel@tonic-gate 			recov_stale(mi, vp1);
5337c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
5347c478bd9Sstevel@tonic-gate 			recov_stale(mi, vp2);
5357c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
5367c478bd9Sstevel@tonic-gate 		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
5377c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
5387c478bd9Sstevel@tonic-gate 			goto out_no_thread;
5397c478bd9Sstevel@tonic-gate 		}
5407c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
5417c478bd9Sstevel@tonic-gate 		recovp->rc_action = NR_FAILOVER;
5427c478bd9Sstevel@tonic-gate 		goto again;
5437c478bd9Sstevel@tonic-gate 
5447c478bd9Sstevel@tonic-gate 	case NR_BAD_SEQID:
5457c478bd9Sstevel@tonic-gate 		if (recovp->rc_bseqid_rqst) {
5467c478bd9Sstevel@tonic-gate 			enqueue_bseqid_rqst(recovp, mi);
5477c478bd9Sstevel@tonic-gate 			break;
5487c478bd9Sstevel@tonic-gate 		}
5497c478bd9Sstevel@tonic-gate 
5507c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
5517c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
5527c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
5537c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
5547c478bd9Sstevel@tonic-gate 		goto out_no_thread; /* no further recovery possible */
5557c478bd9Sstevel@tonic-gate 
5567c478bd9Sstevel@tonic-gate 	case NR_OLDSTATEID:
5577c478bd9Sstevel@tonic-gate 		if (vp1 != NULL)
5587c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
5597c478bd9Sstevel@tonic-gate 		if (vp2 != NULL)
5607c478bd9Sstevel@tonic-gate 			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
5617c478bd9Sstevel@tonic-gate 		goto out_no_thread;	/* no further recovery possible */
5627c478bd9Sstevel@tonic-gate 
5637c478bd9Sstevel@tonic-gate 	case NR_GRACE:
5647c478bd9Sstevel@tonic-gate 		nfs4_set_grace_wait(mi);
5657c478bd9Sstevel@tonic-gate 		goto out_no_thread; /* no further action required for GRACE */
5667c478bd9Sstevel@tonic-gate 
5677c478bd9Sstevel@tonic-gate 	case NR_DELAY:
5687c478bd9Sstevel@tonic-gate 		if (vp1)
5697c478bd9Sstevel@tonic-gate 			nfs4_set_delay_wait(vp1);
5707c478bd9Sstevel@tonic-gate 		goto out_no_thread; /* no further action required for DELAY */
5717c478bd9Sstevel@tonic-gate 
5727c478bd9Sstevel@tonic-gate 	case NR_LOST_STATE_RQST:
5737c478bd9Sstevel@tonic-gate 	case NR_LOST_LOCK:
5747c478bd9Sstevel@tonic-gate 		nfs4_enqueue_lost_rqst(recovp, mi);
5757c478bd9Sstevel@tonic-gate 		break;
5767c478bd9Sstevel@tonic-gate 	default:
5777c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
5787c478bd9Sstevel@tonic-gate 		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
5797c478bd9Sstevel@tonic-gate 		    TAG_NONE, 0, 0);
5807c478bd9Sstevel@tonic-gate 		goto out_no_thread;
5817c478bd9Sstevel@tonic-gate 	}
5827c478bd9Sstevel@tonic-gate 
5837c478bd9Sstevel@tonic-gate 	/*
5847c478bd9Sstevel@tonic-gate 	 * If either file recently went through the same recovery, wait
5857c478bd9Sstevel@tonic-gate 	 * awhile.  This is in case there is some sort of bug; we might not
5867c478bd9Sstevel@tonic-gate 	 * be able to recover properly, but at least we won't bombard the
5877c478bd9Sstevel@tonic-gate 	 * server with calls, and we won't tie up the client.
5887c478bd9Sstevel@tonic-gate 	 */
5897c478bd9Sstevel@tonic-gate 	if (vp1 != NULL)
5907c478bd9Sstevel@tonic-gate 		recov_throttle(recovp, vp1);
5917c478bd9Sstevel@tonic-gate 	if (vp2 != NULL)
5927c478bd9Sstevel@tonic-gate 		recov_throttle(recovp, vp2);
5937c478bd9Sstevel@tonic-gate 
5947c478bd9Sstevel@tonic-gate 	/*
5957c478bd9Sstevel@tonic-gate 	 * If there's already a recovery thread, don't start another one.
5967c478bd9Sstevel@tonic-gate 	 */
5977c478bd9Sstevel@tonic-gate 
5987c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
5997c478bd9Sstevel@tonic-gate 	if (mi->mi_flags & MI4_RECOV_ACTIV) {
6007c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
6017c478bd9Sstevel@tonic-gate 		goto out_no_thread;
6027c478bd9Sstevel@tonic-gate 	}
6037c478bd9Sstevel@tonic-gate 	mi->mi_flags |= MI4_RECOV_ACTIV;
6047c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
6057c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
606b9238976Sth 	    "start_recovery: starting new thread for mi %p", (void*)mi));
6077c478bd9Sstevel@tonic-gate 
6087c478bd9Sstevel@tonic-gate 	recovp->rc_mi = mi;
6097c478bd9Sstevel@tonic-gate 	recovp->rc_vp1 = vp1;
6107c478bd9Sstevel@tonic-gate 	if (vp1 != NULL) {
6117c478bd9Sstevel@tonic-gate 		ASSERT(VTOMI4(vp1) == mi);
6127c478bd9Sstevel@tonic-gate 		VN_HOLD(recovp->rc_vp1);
6137c478bd9Sstevel@tonic-gate 	}
6147c478bd9Sstevel@tonic-gate 	recovp->rc_vp2 = vp2;
6157c478bd9Sstevel@tonic-gate 	if (vp2 != NULL) {
6167c478bd9Sstevel@tonic-gate 		ASSERT(VTOMI4(vp2) == mi);
6177c478bd9Sstevel@tonic-gate 		VN_HOLD(recovp->rc_vp2);
6187c478bd9Sstevel@tonic-gate 	}
6192f172c55SRobert Thurlow 	recovp->rc_moved_vp = moved_vp;
6202f172c55SRobert Thurlow 	recovp->rc_moved_nm = moved_nm;
6217c478bd9Sstevel@tonic-gate 
6227c478bd9Sstevel@tonic-gate 	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
623b9238976Sth 	    minclsyspri);
6247c478bd9Sstevel@tonic-gate 	return;
6257c478bd9Sstevel@tonic-gate 
6267c478bd9Sstevel@tonic-gate 	/* not reached by thread creating call */
6277c478bd9Sstevel@tonic-gate out_no_thread:
6287c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
6297c478bd9Sstevel@tonic-gate 	mi->mi_in_recovery--;
630e749d04dSjwahlig 	if (mi->mi_in_recovery == 0)
631e749d04dSjwahlig 		cv_broadcast(&mi->mi_cv_in_recov);
6327c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
6337c478bd9Sstevel@tonic-gate 
6347c478bd9Sstevel@tonic-gate 	VFS_RELE(mi->mi_vfsp);
63550a83466Sjwahlig 	MI4_RELE(mi);
6367c478bd9Sstevel@tonic-gate 	/*
6377c478bd9Sstevel@tonic-gate 	 * Free up resources that were allocated for us.
6387c478bd9Sstevel@tonic-gate 	 */
6397c478bd9Sstevel@tonic-gate 	kmem_free(recovp, sizeof (recov_info_t));
6407c478bd9Sstevel@tonic-gate }
6417c478bd9Sstevel@tonic-gate 
6427c478bd9Sstevel@tonic-gate static int
nfs4_check_recov_err(vnode_t * vp,nfs4_op_hint_t op,nfs4_recov_state_t * rsp,int retry_err_cnt,char * str)6437c478bd9Sstevel@tonic-gate nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
644b9238976Sth     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
6457c478bd9Sstevel@tonic-gate {
6467c478bd9Sstevel@tonic-gate 	rnode4_t *rp;
6477c478bd9Sstevel@tonic-gate 	int error = 0;
6487c478bd9Sstevel@tonic-gate 	int exempt;
6497c478bd9Sstevel@tonic-gate 
6507c478bd9Sstevel@tonic-gate 	if (vp == NULL)
6517c478bd9Sstevel@tonic-gate 		return (0);
6527c478bd9Sstevel@tonic-gate 
6537c478bd9Sstevel@tonic-gate 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
6547c478bd9Sstevel@tonic-gate 	rp = VTOR4(vp);
6557c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
6567c478bd9Sstevel@tonic-gate 
6577c478bd9Sstevel@tonic-gate 	/*
6587c478bd9Sstevel@tonic-gate 	 * If there was a recovery error, then allow op hints "exempt" from
6597c478bd9Sstevel@tonic-gate 	 * recov errors to retry (currently 3 times).  Either r_error or
6607c478bd9Sstevel@tonic-gate 	 * EIO is returned for non-exempt op hints.
6617c478bd9Sstevel@tonic-gate 	 */
6627c478bd9Sstevel@tonic-gate 	if (rp->r_flags & R4RECOVERR) {
6637c478bd9Sstevel@tonic-gate 		if (exempt && rsp->rs_num_retry_despite_err <=
664b9238976Sth 		    nfs4_max_recov_error_retry) {
6657c478bd9Sstevel@tonic-gate 
6667c478bd9Sstevel@tonic-gate 			/*
6677c478bd9Sstevel@tonic-gate 			 * Check to make sure that we haven't already inc'd
6687c478bd9Sstevel@tonic-gate 			 * rs_num_retry_despite_err for current nfs4_start_fop
6697c478bd9Sstevel@tonic-gate 			 * instance.  We don't want to double inc (if we were
6707c478bd9Sstevel@tonic-gate 			 * called with vp2, then the vp1 call could have
6717c478bd9Sstevel@tonic-gate 			 * already incremented.
6727c478bd9Sstevel@tonic-gate 			 */
6737c478bd9Sstevel@tonic-gate 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
6747c478bd9Sstevel@tonic-gate 				rsp->rs_num_retry_despite_err++;
6757c478bd9Sstevel@tonic-gate 
6767c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
677b9238976Sth 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
678b9238976Sth 			    (void *)vp, rsp->rs_num_retry_despite_err));
6797c478bd9Sstevel@tonic-gate 		} else {
6807c478bd9Sstevel@tonic-gate 			error = (rp->r_error ? rp->r_error : EIO);
6817c478bd9Sstevel@tonic-gate 			/*
6827c478bd9Sstevel@tonic-gate 			 * An ESTALE error on a non-regular file is not
6837c478bd9Sstevel@tonic-gate 			 * "sticky".  Return the ESTALE error once, but
6847c478bd9Sstevel@tonic-gate 			 * clear the condition to allow future operations
6857c478bd9Sstevel@tonic-gate 			 * to go OTW.  This will allow the client to
6867c478bd9Sstevel@tonic-gate 			 * recover if the server has merely unshared then
6877c478bd9Sstevel@tonic-gate 			 * re-shared the file system.  For regular files,
6887c478bd9Sstevel@tonic-gate 			 * the unshare has destroyed the open state at the
6897c478bd9Sstevel@tonic-gate 			 * server and we aren't willing to do a reopen (yet).
6907c478bd9Sstevel@tonic-gate 			 */
6917c478bd9Sstevel@tonic-gate 			if (error == ESTALE && vp->v_type != VREG) {
6927c478bd9Sstevel@tonic-gate 				rp->r_flags &=
693b9238976Sth 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
6947c478bd9Sstevel@tonic-gate 				rp->r_error = 0;
6957c478bd9Sstevel@tonic-gate 				error = ESTALE;
6967c478bd9Sstevel@tonic-gate 			}
6977c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
698b9238976Sth 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
699b9238976Sth 			    str, (void *)vp,
700b9238976Sth 			    rsp->rs_num_retry_despite_err, error));
7017c478bd9Sstevel@tonic-gate 		}
7027c478bd9Sstevel@tonic-gate 	}
703b9238976Sth 
7047c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
7057c478bd9Sstevel@tonic-gate 	return (error);
7067c478bd9Sstevel@tonic-gate }
7077c478bd9Sstevel@tonic-gate 
7087c478bd9Sstevel@tonic-gate /*
7097c478bd9Sstevel@tonic-gate  * Initial setup code that every operation should call if it might invoke
7107c478bd9Sstevel@tonic-gate  * client recovery.  Can block waiting for recovery to finish on a
7117c478bd9Sstevel@tonic-gate  * filesystem.  Either vnode ptr can be NULL.
7127c478bd9Sstevel@tonic-gate  *
7137c478bd9Sstevel@tonic-gate  * Returns 0 if there are no outstanding errors.  Can return an
7147c478bd9Sstevel@tonic-gate  * errno value under various circumstances (e.g., failed recovery, or
7157c478bd9Sstevel@tonic-gate  * interrupted while waiting for recovery to finish).
7167c478bd9Sstevel@tonic-gate  *
7177c478bd9Sstevel@tonic-gate  * There must be a corresponding call to nfs4_end_op() to free up any locks
7187c478bd9Sstevel@tonic-gate  * or resources allocated by this call (assuming this call succeeded),
7197c478bd9Sstevel@tonic-gate  * using the same rsp that's passed in here.
7207c478bd9Sstevel@tonic-gate  *
7217c478bd9Sstevel@tonic-gate  * The open and lock seqid synchronization must be stopped before calling this
7227c478bd9Sstevel@tonic-gate  * function, as it could lead to deadlock when trying to reopen a file or
7237c478bd9Sstevel@tonic-gate  * reclaim a lock.  The synchronization is obtained with calls to:
7247c478bd9Sstevel@tonic-gate  *   nfs4_start_open_seqid_sync()
7257c478bd9Sstevel@tonic-gate  *   nfs4_start_lock_seqid_sync()
7267c478bd9Sstevel@tonic-gate  *
7277c478bd9Sstevel@tonic-gate  * *startrecovp is set TRUE if the caller should not bother with the
7287c478bd9Sstevel@tonic-gate  * over-the-wire call, and just initiate recovery for the given request.
7297c478bd9Sstevel@tonic-gate  * This is typically used for state-releasing ops if the filesystem has
7307c478bd9Sstevel@tonic-gate  * been forcibly unmounted.  startrecovp may be NULL for
7317c478bd9Sstevel@tonic-gate  * non-state-releasing ops.
7327c478bd9Sstevel@tonic-gate  */
7337c478bd9Sstevel@tonic-gate 
7347c478bd9Sstevel@tonic-gate int
nfs4_start_fop(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_op_hint_t op,nfs4_recov_state_t * rsp,bool_t * startrecovp)7357c478bd9Sstevel@tonic-gate nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
736b9238976Sth     nfs4_recov_state_t *rsp, bool_t *startrecovp)
7377c478bd9Sstevel@tonic-gate {
7387c478bd9Sstevel@tonic-gate 	int error = 0, rerr_cnt;
7397c478bd9Sstevel@tonic-gate 	nfs4_server_t *sp = NULL;
7407c478bd9Sstevel@tonic-gate 	nfs4_server_t *tsp;
7417c478bd9Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7423b895386SPavel Filipensky 	uint_t droplock_cnt;
7437c478bd9Sstevel@tonic-gate #ifdef DEBUG
7447c478bd9Sstevel@tonic-gate 	void *fop_caller;
7457c478bd9Sstevel@tonic-gate #endif
7467c478bd9Sstevel@tonic-gate 
7477c478bd9Sstevel@tonic-gate 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
7487c478bd9Sstevel@tonic-gate 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
7497c478bd9Sstevel@tonic-gate 
7507c478bd9Sstevel@tonic-gate #ifdef	DEBUG
7517c478bd9Sstevel@tonic-gate 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
7527c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
753b9238976Sth 		    fop_caller);
7547c478bd9Sstevel@tonic-gate 	}
7557c478bd9Sstevel@tonic-gate 	(void) tsd_set(nfs4_tsd_key, caller());
7567c478bd9Sstevel@tonic-gate #endif
7577c478bd9Sstevel@tonic-gate 
7587c478bd9Sstevel@tonic-gate 	rsp->rs_sp = NULL;
7597c478bd9Sstevel@tonic-gate 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
7607c478bd9Sstevel@tonic-gate 	rerr_cnt = rsp->rs_num_retry_despite_err;
7617c478bd9Sstevel@tonic-gate 
7627c478bd9Sstevel@tonic-gate 	/*
7637c478bd9Sstevel@tonic-gate 	 * Process the items that may delay() based on server response
7647c478bd9Sstevel@tonic-gate 	 */
7657c478bd9Sstevel@tonic-gate 	error = nfs4_wait_for_grace(mi, rsp);
7667c478bd9Sstevel@tonic-gate 	if (error)
7677c478bd9Sstevel@tonic-gate 		goto out;
7687c478bd9Sstevel@tonic-gate 
7697c478bd9Sstevel@tonic-gate 	if (vp1 != NULL) {
7707c478bd9Sstevel@tonic-gate 		error = nfs4_wait_for_delay(vp1, rsp);
7717c478bd9Sstevel@tonic-gate 		if (error)
7727c478bd9Sstevel@tonic-gate 			goto out;
7737c478bd9Sstevel@tonic-gate 	}
7747c478bd9Sstevel@tonic-gate 
7757c478bd9Sstevel@tonic-gate 	/* Wait for a delegation recall to complete. */
7767c478bd9Sstevel@tonic-gate 
7777c478bd9Sstevel@tonic-gate 	error = wait_for_recall(vp1, vp2, op, rsp);
7787c478bd9Sstevel@tonic-gate 	if (error)
7797c478bd9Sstevel@tonic-gate 		goto out;
7807c478bd9Sstevel@tonic-gate 
7817c478bd9Sstevel@tonic-gate 	/*
7827c478bd9Sstevel@tonic-gate 	 * Wait for any current recovery actions to finish.  Note that a
7837c478bd9Sstevel@tonic-gate 	 * recovery thread can still start up after wait_for_recovery()
7847c478bd9Sstevel@tonic-gate 	 * finishes.  We don't block out recovery operations until we
7857c478bd9Sstevel@tonic-gate 	 * acquire s_recovlock and mi_recovlock.
7867c478bd9Sstevel@tonic-gate 	 */
7877c478bd9Sstevel@tonic-gate 	error = wait_for_recovery(mi, op);
7887c478bd9Sstevel@tonic-gate 	if (error)
7897c478bd9Sstevel@tonic-gate 		goto out;
7907c478bd9Sstevel@tonic-gate 
7917c478bd9Sstevel@tonic-gate 	/*
7927c478bd9Sstevel@tonic-gate 	 * Check to see if the rnode is already marked with a
7937c478bd9Sstevel@tonic-gate 	 * recovery error.  If so, return it immediately.  But
7947c478bd9Sstevel@tonic-gate 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
7957c478bd9Sstevel@tonic-gate 	 * clean up state on the server.
7967c478bd9Sstevel@tonic-gate 	 */
7977c478bd9Sstevel@tonic-gate 
7987c478bd9Sstevel@tonic-gate 	if (vp1 != NULL) {
7997c478bd9Sstevel@tonic-gate 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
8007c478bd9Sstevel@tonic-gate 			goto out;
8017c478bd9Sstevel@tonic-gate 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
8027c478bd9Sstevel@tonic-gate 	}
8037c478bd9Sstevel@tonic-gate 
8047c478bd9Sstevel@tonic-gate 	if (vp2 != NULL) {
8057c478bd9Sstevel@tonic-gate 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
8067c478bd9Sstevel@tonic-gate 			goto out;
8077c478bd9Sstevel@tonic-gate 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
8087c478bd9Sstevel@tonic-gate 	}
8097c478bd9Sstevel@tonic-gate 
8107c478bd9Sstevel@tonic-gate 	/*
8117c478bd9Sstevel@tonic-gate 	 * The lock order calls for us to acquire s_recovlock before
8127c478bd9Sstevel@tonic-gate 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
8137c478bd9Sstevel@tonic-gate 	 * prevent races with the failover/migration code).  So acquire
8147c478bd9Sstevel@tonic-gate 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
8157c478bd9Sstevel@tonic-gate 	 * s_recovlock and mi_recovlock, then verify that sp is still the
8167c478bd9Sstevel@tonic-gate 	 * right object.  XXX Can we find a simpler way to deal with this?
8177c478bd9Sstevel@tonic-gate 	 */
8187c478bd9Sstevel@tonic-gate 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
8197c478bd9Sstevel@tonic-gate 	    mi->mi_flags & MI4_INT)) {
8207c478bd9Sstevel@tonic-gate 		error = EINTR;
8217c478bd9Sstevel@tonic-gate 		goto out;
8227c478bd9Sstevel@tonic-gate 	}
8237c478bd9Sstevel@tonic-gate get_sp:
8247c478bd9Sstevel@tonic-gate 	sp = find_nfs4_server(mi);
8257c478bd9Sstevel@tonic-gate 	if (sp != NULL) {
8267c478bd9Sstevel@tonic-gate 		sp->s_otw_call_count++;
8277c478bd9Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
8283b895386SPavel Filipensky 		droplock_cnt = mi->mi_srvset_cnt;
8297c478bd9Sstevel@tonic-gate 	}
8307c478bd9Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_recovlock);
8317c478bd9Sstevel@tonic-gate 
8327c478bd9Sstevel@tonic-gate 	if (sp != NULL) {
8337c478bd9Sstevel@tonic-gate 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
834b9238976Sth 		    mi->mi_flags & MI4_INT)) {
8357c478bd9Sstevel@tonic-gate 			error = EINTR;
8367c478bd9Sstevel@tonic-gate 			goto out;
8377c478bd9Sstevel@tonic-gate 		}
8387c478bd9Sstevel@tonic-gate 	}
8397c478bd9Sstevel@tonic-gate 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
840b9238976Sth 	    mi->mi_flags & MI4_INT)) {
8417c478bd9Sstevel@tonic-gate 		if (sp != NULL)
8427c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&sp->s_recovlock);
8437c478bd9Sstevel@tonic-gate 		error = EINTR;
8447c478bd9Sstevel@tonic-gate 		goto out;
8457c478bd9Sstevel@tonic-gate 	}
8467c478bd9Sstevel@tonic-gate 	/*
8477c478bd9Sstevel@tonic-gate 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
8487c478bd9Sstevel@tonic-gate 	 * there's no point in double checking to make sure it
8497c478bd9Sstevel@tonic-gate 	 * has switched.
8507c478bd9Sstevel@tonic-gate 	 */
8513b895386SPavel Filipensky 	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
8527c478bd9Sstevel@tonic-gate 		tsp = find_nfs4_server(mi);
8537c478bd9Sstevel@tonic-gate 		if (tsp != sp) {
8547c478bd9Sstevel@tonic-gate 			/* try again */
8557c478bd9Sstevel@tonic-gate 			if (tsp != NULL) {
8567c478bd9Sstevel@tonic-gate 				mutex_exit(&tsp->s_lock);
8577c478bd9Sstevel@tonic-gate 				nfs4_server_rele(tsp);
8587c478bd9Sstevel@tonic-gate 				tsp = NULL;
8597c478bd9Sstevel@tonic-gate 			}
8607c478bd9Sstevel@tonic-gate 			if (sp != NULL) {
8617c478bd9Sstevel@tonic-gate 				nfs_rw_exit(&sp->s_recovlock);
8627c478bd9Sstevel@tonic-gate 				mutex_enter(&sp->s_lock);
8637c478bd9Sstevel@tonic-gate 				sp->s_otw_call_count--;
8647c478bd9Sstevel@tonic-gate 				mutex_exit(&sp->s_lock);
8657c478bd9Sstevel@tonic-gate 				nfs4_server_rele(sp);
8667c478bd9Sstevel@tonic-gate 				sp = NULL;
8677c478bd9Sstevel@tonic-gate 			}
8687c478bd9Sstevel@tonic-gate 			goto get_sp;
8697c478bd9Sstevel@tonic-gate 		} else {
8707c478bd9Sstevel@tonic-gate 			if (tsp != NULL) {
8717c478bd9Sstevel@tonic-gate 				mutex_exit(&tsp->s_lock);
8727c478bd9Sstevel@tonic-gate 				nfs4_server_rele(tsp);
8737c478bd9Sstevel@tonic-gate 				tsp = NULL;
8747c478bd9Sstevel@tonic-gate 			}
8757c478bd9Sstevel@tonic-gate 		}
8767c478bd9Sstevel@tonic-gate 	}
8777c478bd9Sstevel@tonic-gate 
8787c478bd9Sstevel@tonic-gate 	if (sp != NULL) {
8797c478bd9Sstevel@tonic-gate 		rsp->rs_sp = sp;
8807c478bd9Sstevel@tonic-gate 	}
8817c478bd9Sstevel@tonic-gate 
8827c478bd9Sstevel@tonic-gate 	/*
8837c478bd9Sstevel@tonic-gate 	 * If the fileystem uses volatile filehandles, obtain a lock so
8847c478bd9Sstevel@tonic-gate 	 * that we synchronize with renames.  Exception: mount operations
8857c478bd9Sstevel@tonic-gate 	 * can change mi_fh_expire_type, which could be a problem, since
8867c478bd9Sstevel@tonic-gate 	 * the end_op code needs to be consistent with the start_op code
8877c478bd9Sstevel@tonic-gate 	 * about mi_rename_lock.  Since mounts don't compete with renames,
8887c478bd9Sstevel@tonic-gate 	 * it's simpler to just not acquire the rename lock for mounts.
8897c478bd9Sstevel@tonic-gate 	 */
8907c478bd9Sstevel@tonic-gate 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
8917c478bd9Sstevel@tonic-gate 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
892b9238976Sth 		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
893b9238976Sth 		    mi->mi_flags & MI4_INT)) {
8947c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&mi->mi_recovlock);
8957c478bd9Sstevel@tonic-gate 			if (sp != NULL)
8967c478bd9Sstevel@tonic-gate 				nfs_rw_exit(&sp->s_recovlock);
8977c478bd9Sstevel@tonic-gate 			error = EINTR;
8987c478bd9Sstevel@tonic-gate 			goto out;
8997c478bd9Sstevel@tonic-gate 		}
9007c478bd9Sstevel@tonic-gate 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
9017c478bd9Sstevel@tonic-gate 	}
9027c478bd9Sstevel@tonic-gate 
9037c478bd9Sstevel@tonic-gate 	if (OH_IS_STATE_RELE(op)) {
9047c478bd9Sstevel@tonic-gate 		/*
9057c478bd9Sstevel@tonic-gate 		 * For forced unmount, letting the request proceed will
9067c478bd9Sstevel@tonic-gate 		 * almost always delay response to the user, so hand it off
9077c478bd9Sstevel@tonic-gate 		 * to the recovery thread.  For exiting lwp's, we don't
9087c478bd9Sstevel@tonic-gate 		 * have a good way to tell if the request will hang.  We
9097c478bd9Sstevel@tonic-gate 		 * generally want processes to handle their own requests so
9107c478bd9Sstevel@tonic-gate 		 * that they can be done in parallel, but if there is
9117c478bd9Sstevel@tonic-gate 		 * already a recovery thread, hand the request off to it.
9127c478bd9Sstevel@tonic-gate 		 * This will improve user response at no cost to overall
9137c478bd9Sstevel@tonic-gate 		 * system throughput.  For zone shutdown, we'd prefer
9147c478bd9Sstevel@tonic-gate 		 * the recovery thread to handle this as well.
9157c478bd9Sstevel@tonic-gate 		 */
9167c478bd9Sstevel@tonic-gate 		ASSERT(startrecovp != NULL);
9177c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
9187c478bd9Sstevel@tonic-gate 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
9197c478bd9Sstevel@tonic-gate 			*startrecovp = TRUE;
9207c478bd9Sstevel@tonic-gate 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
9217c478bd9Sstevel@tonic-gate 		    (mi->mi_flags & MI4_RECOV_ACTIV))
9227c478bd9Sstevel@tonic-gate 			*startrecovp = TRUE;
9237c478bd9Sstevel@tonic-gate 		else
9247c478bd9Sstevel@tonic-gate 			*startrecovp = FALSE;
9257c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
9267c478bd9Sstevel@tonic-gate 	} else
9277c478bd9Sstevel@tonic-gate 		if (startrecovp != NULL)
9287c478bd9Sstevel@tonic-gate 			*startrecovp = FALSE;
9297c478bd9Sstevel@tonic-gate 
9307c478bd9Sstevel@tonic-gate 	ASSERT(error == 0);
9317c478bd9Sstevel@tonic-gate 	return (error);
9327c478bd9Sstevel@tonic-gate 
9337c478bd9Sstevel@tonic-gate out:
9347c478bd9Sstevel@tonic-gate 	ASSERT(error != 0);
9357c478bd9Sstevel@tonic-gate 	if (sp != NULL) {
9367c478bd9Sstevel@tonic-gate 		mutex_enter(&sp->s_lock);
9377c478bd9Sstevel@tonic-gate 		sp->s_otw_call_count--;
9387c478bd9Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
9397c478bd9Sstevel@tonic-gate 		nfs4_server_rele(sp);
9407c478bd9Sstevel@tonic-gate 		rsp->rs_sp = NULL;
9417c478bd9Sstevel@tonic-gate 	}
9427c478bd9Sstevel@tonic-gate 	nfs4_end_op_recall(vp1, vp2, rsp);
9437c478bd9Sstevel@tonic-gate 
9447c478bd9Sstevel@tonic-gate #ifdef	DEBUG
9457c478bd9Sstevel@tonic-gate 	(void) tsd_set(nfs4_tsd_key, NULL);
9467c478bd9Sstevel@tonic-gate #endif
9477c478bd9Sstevel@tonic-gate 	return (error);
9487c478bd9Sstevel@tonic-gate }
9497c478bd9Sstevel@tonic-gate 
9507c478bd9Sstevel@tonic-gate /*
9517c478bd9Sstevel@tonic-gate  * It is up to the caller to determine if rsp->rs_sp being NULL
9527c478bd9Sstevel@tonic-gate  * is detrimental or not.
9537c478bd9Sstevel@tonic-gate  */
9547c478bd9Sstevel@tonic-gate int
nfs4_start_op(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_recov_state_t * rsp)9557c478bd9Sstevel@tonic-gate nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
956b9238976Sth     nfs4_recov_state_t *rsp)
9577c478bd9Sstevel@tonic-gate {
9587c478bd9Sstevel@tonic-gate 	ASSERT(rsp->rs_num_retry_despite_err == 0);
9597c478bd9Sstevel@tonic-gate 	rsp->rs_num_retry_despite_err = 0;
9607c478bd9Sstevel@tonic-gate 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
9617c478bd9Sstevel@tonic-gate }
9627c478bd9Sstevel@tonic-gate 
9637c478bd9Sstevel@tonic-gate /*
9647c478bd9Sstevel@tonic-gate  * Release any resources acquired by nfs4_start_op().
9657c478bd9Sstevel@tonic-gate  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
9667c478bd9Sstevel@tonic-gate  *
9677c478bd9Sstevel@tonic-gate  * The operation hint is used to avoid a deadlock by bypassing delegation
9687c478bd9Sstevel@tonic-gate  * return logic for writes, which are done while returning a delegation.
9697c478bd9Sstevel@tonic-gate  */
9707c478bd9Sstevel@tonic-gate 
9717c478bd9Sstevel@tonic-gate void
nfs4_end_fop(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_op_hint_t op,nfs4_recov_state_t * rsp,bool_t needs_recov)9727c478bd9Sstevel@tonic-gate nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
973b9238976Sth     nfs4_recov_state_t *rsp, bool_t needs_recov)
9747c478bd9Sstevel@tonic-gate {
9757c478bd9Sstevel@tonic-gate 	nfs4_server_t *sp = rsp->rs_sp;
9767c478bd9Sstevel@tonic-gate 	rnode4_t *rp = NULL;
9777c478bd9Sstevel@tonic-gate 
9787c478bd9Sstevel@tonic-gate #ifdef	lint
9797c478bd9Sstevel@tonic-gate 	/*
9807c478bd9Sstevel@tonic-gate 	 * The op hint isn't used any more, but might be in
9817c478bd9Sstevel@tonic-gate 	 * the future.
9827c478bd9Sstevel@tonic-gate 	 */
9837c478bd9Sstevel@tonic-gate 	op = op;
9847c478bd9Sstevel@tonic-gate #endif
9857c478bd9Sstevel@tonic-gate 
9867c478bd9Sstevel@tonic-gate #ifdef	DEBUG
9877c478bd9Sstevel@tonic-gate 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
9887c478bd9Sstevel@tonic-gate 	(void) tsd_set(nfs4_tsd_key, NULL);
9897c478bd9Sstevel@tonic-gate #endif
9907c478bd9Sstevel@tonic-gate 
9917c478bd9Sstevel@tonic-gate 	nfs4_end_op_recall(vp1, vp2, rsp);
9927c478bd9Sstevel@tonic-gate 
9937c478bd9Sstevel@tonic-gate 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
9947c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_rename_lock);
9957c478bd9Sstevel@tonic-gate 
9967c478bd9Sstevel@tonic-gate 	if (!needs_recov) {
9977c478bd9Sstevel@tonic-gate 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
9987c478bd9Sstevel@tonic-gate 			/* may need to clear the delay interval */
9997c478bd9Sstevel@tonic-gate 			if (vp1 != NULL) {
10007c478bd9Sstevel@tonic-gate 				rp = VTOR4(vp1);
10017c478bd9Sstevel@tonic-gate 				mutex_enter(&rp->r_statelock);
10027c478bd9Sstevel@tonic-gate 				rp->r_delay_interval = 0;
10037c478bd9Sstevel@tonic-gate 				mutex_exit(&rp->r_statelock);
10047c478bd9Sstevel@tonic-gate 			}
10057c478bd9Sstevel@tonic-gate 		}
10067c478bd9Sstevel@tonic-gate 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
10077c478bd9Sstevel@tonic-gate 	}
10087c478bd9Sstevel@tonic-gate 
10097c478bd9Sstevel@tonic-gate 	/*
10107c478bd9Sstevel@tonic-gate 	 * If the corresponding nfs4_start_op() found a sp,
10117c478bd9Sstevel@tonic-gate 	 * then there must still be a sp.
10127c478bd9Sstevel@tonic-gate 	 */
10137c478bd9Sstevel@tonic-gate 	if (sp != NULL) {
10147c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_recovlock);
10157c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&sp->s_recovlock);
10167c478bd9Sstevel@tonic-gate 		mutex_enter(&sp->s_lock);
10177c478bd9Sstevel@tonic-gate 		sp->s_otw_call_count--;
10187c478bd9Sstevel@tonic-gate 		cv_broadcast(&sp->s_cv_otw_count);
10197c478bd9Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
10207c478bd9Sstevel@tonic-gate 		nfs4_server_rele(sp);
10217c478bd9Sstevel@tonic-gate 	} else {
10227c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_recovlock);
10237c478bd9Sstevel@tonic-gate 	}
10247c478bd9Sstevel@tonic-gate }
10257c478bd9Sstevel@tonic-gate 
10267c478bd9Sstevel@tonic-gate void
nfs4_end_op(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_recov_state_t * rsp,bool_t needrecov)10277c478bd9Sstevel@tonic-gate nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1028b9238976Sth     nfs4_recov_state_t *rsp, bool_t needrecov)
10297c478bd9Sstevel@tonic-gate {
10307c478bd9Sstevel@tonic-gate 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
10317c478bd9Sstevel@tonic-gate }
10327c478bd9Sstevel@tonic-gate 
10337c478bd9Sstevel@tonic-gate /*
10347c478bd9Sstevel@tonic-gate  * If the filesystem is going through client recovery, block until
10357c478bd9Sstevel@tonic-gate  * finished.
10367c478bd9Sstevel@tonic-gate  * Exceptions:
10377c478bd9Sstevel@tonic-gate  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
10387c478bd9Sstevel@tonic-gate  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
10397c478bd9Sstevel@tonic-gate  *
10407c478bd9Sstevel@tonic-gate  * Return value:
10417c478bd9Sstevel@tonic-gate  * - 0 if no errors
10427c478bd9Sstevel@tonic-gate  * - EINTR if the call was interrupted
10437c478bd9Sstevel@tonic-gate  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
10447c478bd9Sstevel@tonic-gate  *   op)
10457c478bd9Sstevel@tonic-gate  * - the errno value from the recovery thread, if recovery failed
10467c478bd9Sstevel@tonic-gate  */
10477c478bd9Sstevel@tonic-gate 
10487c478bd9Sstevel@tonic-gate static int
wait_for_recovery(mntinfo4_t * mi,nfs4_op_hint_t op_hint)10497c478bd9Sstevel@tonic-gate wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
10507c478bd9Sstevel@tonic-gate {
10517c478bd9Sstevel@tonic-gate 	int error = 0;
10527c478bd9Sstevel@tonic-gate 
10537c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
10547c478bd9Sstevel@tonic-gate 
10557c478bd9Sstevel@tonic-gate 	while (mi->mi_recovflags != 0) {
10567c478bd9Sstevel@tonic-gate 		klwp_t *lwp = ttolwp(curthread);
10577c478bd9Sstevel@tonic-gate 
1058ffa198efSvv 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
1059ffa198efSvv 		    (mi->mi_flags & MI4_RECOV_FAIL))
10607c478bd9Sstevel@tonic-gate 			break;
10617c478bd9Sstevel@tonic-gate 		if (OH_IS_STATE_RELE(op_hint) &&
10627c478bd9Sstevel@tonic-gate 		    (curthread->t_proc_flag & TP_LWPEXIT))
10637c478bd9Sstevel@tonic-gate 			break;
10647c478bd9Sstevel@tonic-gate 
10657c478bd9Sstevel@tonic-gate 		if (lwp != NULL)
10667c478bd9Sstevel@tonic-gate 			lwp->lwp_nostop++;
10677c478bd9Sstevel@tonic-gate 		/* XXX - use different cv? */
10687c478bd9Sstevel@tonic-gate 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
10697c478bd9Sstevel@tonic-gate 			error = EINTR;
10707c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
10717c478bd9Sstevel@tonic-gate 				lwp->lwp_nostop--;
10727c478bd9Sstevel@tonic-gate 			break;
10737c478bd9Sstevel@tonic-gate 		}
10747c478bd9Sstevel@tonic-gate 		if (lwp != NULL)
10757c478bd9Sstevel@tonic-gate 			lwp->lwp_nostop--;
10767c478bd9Sstevel@tonic-gate 	}
10777c478bd9Sstevel@tonic-gate 
1078ffa198efSvv 	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
10797c478bd9Sstevel@tonic-gate 	    !OH_IS_STATE_RELE(op_hint)) {
10807c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1081b9238976Sth 		    "wait_for_recovery: forced unmount"));
10827c478bd9Sstevel@tonic-gate 		error = EIO;
1083ffa198efSvv 	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
1084ffa198efSvv 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1085ffa198efSvv 		    "wait_for_recovery: fail since RECOV FAIL"));
1086ffa198efSvv 		error = mi->mi_error;
10877c478bd9Sstevel@tonic-gate 	}
10887c478bd9Sstevel@tonic-gate 
10897c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
10907c478bd9Sstevel@tonic-gate 
10917c478bd9Sstevel@tonic-gate 	return (error);
10927c478bd9Sstevel@tonic-gate }
10937c478bd9Sstevel@tonic-gate 
10947c478bd9Sstevel@tonic-gate /*
10957c478bd9Sstevel@tonic-gate  * If the client received NFS4ERR_GRACE for this particular mount,
10967c478bd9Sstevel@tonic-gate  * the client blocks here until it is time to try again.
10977c478bd9Sstevel@tonic-gate  *
10987c478bd9Sstevel@tonic-gate  * Return value:
10997c478bd9Sstevel@tonic-gate  * - 0 if wait was successful
 * - EINTR if the call was interrupted (NOTE: the code below never assigns
 *   a non-zero error, so as written this function always returns 0)
11017c478bd9Sstevel@tonic-gate  */
11027c478bd9Sstevel@tonic-gate 
11037c478bd9Sstevel@tonic-gate int
nfs4_wait_for_grace(mntinfo4_t * mi,nfs4_recov_state_t * rsp)11047c478bd9Sstevel@tonic-gate nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
11057c478bd9Sstevel@tonic-gate {
11067c478bd9Sstevel@tonic-gate 	int error = 0;
11077c478bd9Sstevel@tonic-gate 	time_t curtime, time_to_wait;
11087c478bd9Sstevel@tonic-gate 
11097c478bd9Sstevel@tonic-gate 	/* do a unprotected check to reduce mi_lock contention */
11107c478bd9Sstevel@tonic-gate 	if (mi->mi_grace_wait != 0) {
11117c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
11127c478bd9Sstevel@tonic-gate 
11137c478bd9Sstevel@tonic-gate 		if (mi->mi_grace_wait != 0) {
11147c478bd9Sstevel@tonic-gate 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
11157c478bd9Sstevel@tonic-gate 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
11167c478bd9Sstevel@tonic-gate 
11177c478bd9Sstevel@tonic-gate 			curtime = gethrestime_sec();
11187c478bd9Sstevel@tonic-gate 
11197c478bd9Sstevel@tonic-gate 			if (curtime < mi->mi_grace_wait) {
11207c478bd9Sstevel@tonic-gate 
11217c478bd9Sstevel@tonic-gate 				time_to_wait = mi->mi_grace_wait - curtime;
11227c478bd9Sstevel@tonic-gate 
11237c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
11247c478bd9Sstevel@tonic-gate 
11251988a130Sdm 				delay(SEC_TO_TICK(time_to_wait));
11267c478bd9Sstevel@tonic-gate 
11277c478bd9Sstevel@tonic-gate 				curtime = gethrestime_sec();
11287c478bd9Sstevel@tonic-gate 
11297c478bd9Sstevel@tonic-gate 				mutex_enter(&mi->mi_lock);
11307c478bd9Sstevel@tonic-gate 
11317c478bd9Sstevel@tonic-gate 				if (curtime >= mi->mi_grace_wait)
11327c478bd9Sstevel@tonic-gate 					mi->mi_grace_wait = 0;
11337c478bd9Sstevel@tonic-gate 			} else {
11347c478bd9Sstevel@tonic-gate 				mi->mi_grace_wait = 0;
11357c478bd9Sstevel@tonic-gate 			}
11367c478bd9Sstevel@tonic-gate 		}
11377c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
11387c478bd9Sstevel@tonic-gate 	}
11397c478bd9Sstevel@tonic-gate 
11407c478bd9Sstevel@tonic-gate 	return (error);
11417c478bd9Sstevel@tonic-gate }
11427c478bd9Sstevel@tonic-gate 
11437c478bd9Sstevel@tonic-gate /*
11447c478bd9Sstevel@tonic-gate  * If the client received NFS4ERR_DELAY for an operation on a vnode,
11457c478bd9Sstevel@tonic-gate  * the client blocks here until it is time to try again.
11467c478bd9Sstevel@tonic-gate  *
11477c478bd9Sstevel@tonic-gate  * Return value:
11487c478bd9Sstevel@tonic-gate  * - 0 if wait was successful
 * - EINTR if the call was interrupted (NOTE: the code below never assigns
 *   a non-zero error, so as written this function always returns 0)
11507c478bd9Sstevel@tonic-gate  */
11517c478bd9Sstevel@tonic-gate 
11527c478bd9Sstevel@tonic-gate int
nfs4_wait_for_delay(vnode_t * vp,nfs4_recov_state_t * rsp)11537c478bd9Sstevel@tonic-gate nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
11547c478bd9Sstevel@tonic-gate {
11557c478bd9Sstevel@tonic-gate 	int error = 0;
11567c478bd9Sstevel@tonic-gate 	time_t curtime, time_to_wait;
11577c478bd9Sstevel@tonic-gate 	rnode4_t *rp;
11587c478bd9Sstevel@tonic-gate 
11597c478bd9Sstevel@tonic-gate 	ASSERT(vp != NULL);
11607c478bd9Sstevel@tonic-gate 
11617c478bd9Sstevel@tonic-gate 	rp = VTOR4(vp);
11627c478bd9Sstevel@tonic-gate 
11637c478bd9Sstevel@tonic-gate 	/* do a unprotected check to reduce r_statelock contention */
11647c478bd9Sstevel@tonic-gate 	if (rp->r_delay_wait != 0) {
11657c478bd9Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
11667c478bd9Sstevel@tonic-gate 
11677c478bd9Sstevel@tonic-gate 		if (rp->r_delay_wait != 0) {
11687c478bd9Sstevel@tonic-gate 
11697c478bd9Sstevel@tonic-gate 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
11707c478bd9Sstevel@tonic-gate 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
11717c478bd9Sstevel@tonic-gate 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
11727c478bd9Sstevel@tonic-gate 			}
11737c478bd9Sstevel@tonic-gate 
11747c478bd9Sstevel@tonic-gate 			curtime = gethrestime_sec();
11757c478bd9Sstevel@tonic-gate 
11767c478bd9Sstevel@tonic-gate 			if (curtime < rp->r_delay_wait) {
11777c478bd9Sstevel@tonic-gate 
11787c478bd9Sstevel@tonic-gate 				time_to_wait = rp->r_delay_wait - curtime;
11797c478bd9Sstevel@tonic-gate 
11807c478bd9Sstevel@tonic-gate 				mutex_exit(&rp->r_statelock);
11817c478bd9Sstevel@tonic-gate 
11821988a130Sdm 				delay(SEC_TO_TICK(time_to_wait));
11837c478bd9Sstevel@tonic-gate 
11847c478bd9Sstevel@tonic-gate 				curtime = gethrestime_sec();
11857c478bd9Sstevel@tonic-gate 
11867c478bd9Sstevel@tonic-gate 				mutex_enter(&rp->r_statelock);
11877c478bd9Sstevel@tonic-gate 
11887c478bd9Sstevel@tonic-gate 				if (curtime >= rp->r_delay_wait)
11897c478bd9Sstevel@tonic-gate 					rp->r_delay_wait = 0;
11907c478bd9Sstevel@tonic-gate 			} else {
11917c478bd9Sstevel@tonic-gate 				rp->r_delay_wait = 0;
11927c478bd9Sstevel@tonic-gate 			}
11937c478bd9Sstevel@tonic-gate 		}
11947c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
11957c478bd9Sstevel@tonic-gate 	}
11967c478bd9Sstevel@tonic-gate 
11977c478bd9Sstevel@tonic-gate 	return (error);
11987c478bd9Sstevel@tonic-gate }
11997c478bd9Sstevel@tonic-gate 
12007c478bd9Sstevel@tonic-gate /*
12017c478bd9Sstevel@tonic-gate  * The recovery thread.
12027c478bd9Sstevel@tonic-gate  */
12037c478bd9Sstevel@tonic-gate 
12047c478bd9Sstevel@tonic-gate static void
nfs4_recov_thread(recov_info_t * recovp)12057c478bd9Sstevel@tonic-gate nfs4_recov_thread(recov_info_t *recovp)
12067c478bd9Sstevel@tonic-gate {
12077c478bd9Sstevel@tonic-gate 	mntinfo4_t *mi = recovp->rc_mi;
12087c478bd9Sstevel@tonic-gate 	nfs4_server_t *sp;
12097c478bd9Sstevel@tonic-gate 	int done = 0, error = 0;
12107c478bd9Sstevel@tonic-gate 	bool_t recov_fail = FALSE;
12117c478bd9Sstevel@tonic-gate 	callb_cpr_t cpr_info;
12127c478bd9Sstevel@tonic-gate 	kmutex_t cpr_lock;
12137c478bd9Sstevel@tonic-gate 
12147c478bd9Sstevel@tonic-gate 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
12157c478bd9Sstevel@tonic-gate 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
12167c478bd9Sstevel@tonic-gate 	    0, 0);
12177c478bd9Sstevel@tonic-gate 
12187c478bd9Sstevel@tonic-gate 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
12197c478bd9Sstevel@tonic-gate 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
12207c478bd9Sstevel@tonic-gate 
12217c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
12227c478bd9Sstevel@tonic-gate 	mi->mi_recovthread = curthread;
12237c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
12247c478bd9Sstevel@tonic-gate 
12257c478bd9Sstevel@tonic-gate 	/*
12267c478bd9Sstevel@tonic-gate 	 * We don't really need protection here against failover or
12277c478bd9Sstevel@tonic-gate 	 * migration, since the current thread is the one that would make
12287c478bd9Sstevel@tonic-gate 	 * any changes, but hold mi_recovlock anyway for completeness (and
12297c478bd9Sstevel@tonic-gate 	 * to satisfy any ASSERTs).
12307c478bd9Sstevel@tonic-gate 	 */
12317c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
12327c478bd9Sstevel@tonic-gate 	sp = find_nfs4_server(mi);
12337c478bd9Sstevel@tonic-gate 	if (sp != NULL)
12347c478bd9Sstevel@tonic-gate 		mutex_exit(&sp->s_lock);
12357c478bd9Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_recovlock);
12367c478bd9Sstevel@tonic-gate 
12377c478bd9Sstevel@tonic-gate 	/*
12387c478bd9Sstevel@tonic-gate 	 * Do any necessary recovery, based on the information in recovp
12397c478bd9Sstevel@tonic-gate 	 * and any recovery flags.
12407c478bd9Sstevel@tonic-gate 	 */
12417c478bd9Sstevel@tonic-gate 
12427c478bd9Sstevel@tonic-gate 	do {
12437c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
12447c478bd9Sstevel@tonic-gate 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
12457c478bd9Sstevel@tonic-gate 			bool_t activesrv;
12467c478bd9Sstevel@tonic-gate 
12477c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug &&
12487c478bd9Sstevel@tonic-gate 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
1249b9238976Sth 			    "nfs4_recov_thread: file system has been "
1250b9238976Sth 			    "unmounted"));
12517c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug &&
12527c478bd9Sstevel@tonic-gate 			    zone_status_get(curproc->p_zone) >=
12537c478bd9Sstevel@tonic-gate 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
1254b9238976Sth 			    "nfs4_recov_thread: zone shutting down"));
12557c478bd9Sstevel@tonic-gate 			/*
12567c478bd9Sstevel@tonic-gate 			 * If the server has lost its state for us and
12577c478bd9Sstevel@tonic-gate 			 * the filesystem is unmounted, then the filesystem
12587c478bd9Sstevel@tonic-gate 			 * can be tossed, even if there are lost lock or
12597c478bd9Sstevel@tonic-gate 			 * lost state calls in the recovery queue.
12607c478bd9Sstevel@tonic-gate 			 */
12617c478bd9Sstevel@tonic-gate 			if (mi->mi_recovflags &
12627c478bd9Sstevel@tonic-gate 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
12637c478bd9Sstevel@tonic-gate 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
12647c478bd9Sstevel@tonic-gate 				"nfs4_recov_thread: bailing out"));
12657c478bd9Sstevel@tonic-gate 				mi->mi_flags |= MI4_RECOV_FAIL;
12667c478bd9Sstevel@tonic-gate 				mi->mi_error = recovp->rc_error;
12677c478bd9Sstevel@tonic-gate 				recov_fail = TRUE;
12687c478bd9Sstevel@tonic-gate 			}
12697c478bd9Sstevel@tonic-gate 			/*
12707c478bd9Sstevel@tonic-gate 			 * We don't know if the server has any state for
12717c478bd9Sstevel@tonic-gate 			 * us, and the filesystem has been unmounted.  If
12727c478bd9Sstevel@tonic-gate 			 * there are "lost state" recovery items, keep
12737c478bd9Sstevel@tonic-gate 			 * trying to process them until there are no more
12747c478bd9Sstevel@tonic-gate 			 * mounted filesystems for the server.  Otherwise,
12757c478bd9Sstevel@tonic-gate 			 * bail out.  The reason we don't mark the
12767c478bd9Sstevel@tonic-gate 			 * filesystem as failing recovery is in case we
12777c478bd9Sstevel@tonic-gate 			 * have to do "lost state" recovery later (e.g., a
12787c478bd9Sstevel@tonic-gate 			 * user process exits).
12797c478bd9Sstevel@tonic-gate 			 */
12807c478bd9Sstevel@tonic-gate 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
1281e749d04dSjwahlig 				done = 1;
12827c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
12837c478bd9Sstevel@tonic-gate 				break;
12847c478bd9Sstevel@tonic-gate 			}
12857c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
12867c478bd9Sstevel@tonic-gate 
12877c478bd9Sstevel@tonic-gate 			if (sp == NULL)
12887c478bd9Sstevel@tonic-gate 				activesrv = FALSE;
12897c478bd9Sstevel@tonic-gate 			else {
12907c478bd9Sstevel@tonic-gate 				mutex_enter(&sp->s_lock);
12917c478bd9Sstevel@tonic-gate 				activesrv = nfs4_fs_active(sp);
12927c478bd9Sstevel@tonic-gate 			}
12937c478bd9Sstevel@tonic-gate 			if (!activesrv) {
12947c478bd9Sstevel@tonic-gate 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1295b9238976Sth 				    "no active fs for server %p",
1296b9238976Sth 				    (void *)sp));
12977c478bd9Sstevel@tonic-gate 				mutex_enter(&mi->mi_lock);
12987c478bd9Sstevel@tonic-gate 				mi->mi_flags |= MI4_RECOV_FAIL;
12997c478bd9Sstevel@tonic-gate 				mi->mi_error = recovp->rc_error;
13007c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
13017c478bd9Sstevel@tonic-gate 				recov_fail = TRUE;
13027c478bd9Sstevel@tonic-gate 				if (sp != NULL) {
13037c478bd9Sstevel@tonic-gate 					/*
13047c478bd9Sstevel@tonic-gate 					 * Mark the server instance as
13057c478bd9Sstevel@tonic-gate 					 * dead, so that nobody will attach
13067c478bd9Sstevel@tonic-gate 					 * a new filesystem.
13077c478bd9Sstevel@tonic-gate 					 */
13087c478bd9Sstevel@tonic-gate 					nfs4_mark_srv_dead(sp);
13097c478bd9Sstevel@tonic-gate 				}
13107c478bd9Sstevel@tonic-gate 			}
13117c478bd9Sstevel@tonic-gate 			if (sp != NULL)
13127c478bd9Sstevel@tonic-gate 				mutex_exit(&sp->s_lock);
13137c478bd9Sstevel@tonic-gate 		} else {
13147c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13157c478bd9Sstevel@tonic-gate 		}
13167c478bd9Sstevel@tonic-gate 
13177c478bd9Sstevel@tonic-gate 		/*
13187c478bd9Sstevel@tonic-gate 		 * Check if we need to select a new server for a
13197c478bd9Sstevel@tonic-gate 		 * failover.  Choosing a new server will force at
13207c478bd9Sstevel@tonic-gate 		 * least a check of the clientid.
13217c478bd9Sstevel@tonic-gate 		 */
13227c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
13237c478bd9Sstevel@tonic-gate 		if (!recov_fail &&
13247c478bd9Sstevel@tonic-gate 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
13257c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13267c478bd9Sstevel@tonic-gate 			recov_newserver(recovp, &sp, &recov_fail);
13277c478bd9Sstevel@tonic-gate 		} else
13287c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13297c478bd9Sstevel@tonic-gate 
13307c478bd9Sstevel@tonic-gate 		/*
13317c478bd9Sstevel@tonic-gate 		 * Check if we need to recover the clientid.  This
13327c478bd9Sstevel@tonic-gate 		 * must be done before file and lock recovery, and it
13337c478bd9Sstevel@tonic-gate 		 * potentially affects the recovery threads for other
13347c478bd9Sstevel@tonic-gate 		 * filesystems, so it gets special treatment.
13357c478bd9Sstevel@tonic-gate 		 */
13367c478bd9Sstevel@tonic-gate 		if (sp != NULL && recov_fail == FALSE) {
13377c478bd9Sstevel@tonic-gate 			mutex_enter(&sp->s_lock);
13387c478bd9Sstevel@tonic-gate 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
13397c478bd9Sstevel@tonic-gate 				mutex_exit(&sp->s_lock);
13407c478bd9Sstevel@tonic-gate 				recov_clientid(recovp, sp);
13417c478bd9Sstevel@tonic-gate 			} else {
13427c478bd9Sstevel@tonic-gate 				/*
13437c478bd9Sstevel@tonic-gate 				 * Unset this flag in case another recovery
13447c478bd9Sstevel@tonic-gate 				 * thread successfully recovered the clientid
13457c478bd9Sstevel@tonic-gate 				 * for us already.
13467c478bd9Sstevel@tonic-gate 				 */
13477c478bd9Sstevel@tonic-gate 				mutex_enter(&mi->mi_lock);
13487c478bd9Sstevel@tonic-gate 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
13497c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
13507c478bd9Sstevel@tonic-gate 				mutex_exit(&sp->s_lock);
13517c478bd9Sstevel@tonic-gate 			}
13527c478bd9Sstevel@tonic-gate 		}
13537c478bd9Sstevel@tonic-gate 
13547c478bd9Sstevel@tonic-gate 		/*
13557c478bd9Sstevel@tonic-gate 		 * Check if we need to get the security information.
13567c478bd9Sstevel@tonic-gate 		 */
13577c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
13587c478bd9Sstevel@tonic-gate 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
13597c478bd9Sstevel@tonic-gate 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
13607c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13617c478bd9Sstevel@tonic-gate 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1362b9238976Sth 			    RW_WRITER, 0);
13637c478bd9Sstevel@tonic-gate 			error = nfs4_secinfo_recov(recovp->rc_mi,
1364b9238976Sth 			    recovp->rc_vp1, recovp->rc_vp2);
13657c478bd9Sstevel@tonic-gate 			/*
13667c478bd9Sstevel@tonic-gate 			 * If error, nothing more can be done, stop
13677c478bd9Sstevel@tonic-gate 			 * the recovery.
13687c478bd9Sstevel@tonic-gate 			 */
13697c478bd9Sstevel@tonic-gate 			if (error) {
13707c478bd9Sstevel@tonic-gate 				mutex_enter(&mi->mi_lock);
13717c478bd9Sstevel@tonic-gate 				mi->mi_flags |= MI4_RECOV_FAIL;
13727c478bd9Sstevel@tonic-gate 				mi->mi_error = recovp->rc_error;
13737c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
13747c478bd9Sstevel@tonic-gate 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
13757c478bd9Sstevel@tonic-gate 				    error, recovp->rc_vp1, recovp->rc_vp2,
13767c478bd9Sstevel@tonic-gate 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
13777c478bd9Sstevel@tonic-gate 			}
13787c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&mi->mi_recovlock);
13797c478bd9Sstevel@tonic-gate 		} else
13807c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13817c478bd9Sstevel@tonic-gate 
13827c478bd9Sstevel@tonic-gate 		/*
13837c478bd9Sstevel@tonic-gate 		 * Check if there's a bad seqid to recover.
13847c478bd9Sstevel@tonic-gate 		 */
13857c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
13867c478bd9Sstevel@tonic-gate 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
13877c478bd9Sstevel@tonic-gate 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
13887c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13897c478bd9Sstevel@tonic-gate 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1390b9238976Sth 			    RW_WRITER, 0);
13917c478bd9Sstevel@tonic-gate 			recov_bad_seqid(recovp);
13927c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&mi->mi_recovlock);
13937c478bd9Sstevel@tonic-gate 		} else
13947c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
13957c478bd9Sstevel@tonic-gate 
13967c478bd9Sstevel@tonic-gate 		/*
13977c478bd9Sstevel@tonic-gate 		 * Next check for recovery that affects the entire
13987c478bd9Sstevel@tonic-gate 		 * filesystem.
13997c478bd9Sstevel@tonic-gate 		 */
14007c478bd9Sstevel@tonic-gate 		if (sp != NULL) {
14017c478bd9Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
14027c478bd9Sstevel@tonic-gate 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
14037c478bd9Sstevel@tonic-gate 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
14047c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
14057c478bd9Sstevel@tonic-gate 				recov_openfiles(recovp, sp);
14067c478bd9Sstevel@tonic-gate 			} else
14077c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
14087c478bd9Sstevel@tonic-gate 		}
14097c478bd9Sstevel@tonic-gate 
14107c478bd9Sstevel@tonic-gate 		/*
14117c478bd9Sstevel@tonic-gate 		 * Send any queued state recovery requests.
14127c478bd9Sstevel@tonic-gate 		 */
14137c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
14147c478bd9Sstevel@tonic-gate 		if (sp != NULL &&
14157c478bd9Sstevel@tonic-gate 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
14167c478bd9Sstevel@tonic-gate 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
14177c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
14187c478bd9Sstevel@tonic-gate 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1419b9238976Sth 			    RW_WRITER, 0);
14207c478bd9Sstevel@tonic-gate 			nfs4_resend_lost_rqsts(recovp, sp);
14217c478bd9Sstevel@tonic-gate 			if (list_head(&mi->mi_lost_state) == NULL) {
14227c478bd9Sstevel@tonic-gate 				/* done */
14237c478bd9Sstevel@tonic-gate 				mutex_enter(&mi->mi_lock);
14247c478bd9Sstevel@tonic-gate 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
14257c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
14267c478bd9Sstevel@tonic-gate 			}
14277c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&mi->mi_recovlock);
14287c478bd9Sstevel@tonic-gate 		} else {
14297c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
14307c478bd9Sstevel@tonic-gate 		}
14317c478bd9Sstevel@tonic-gate 
14327c478bd9Sstevel@tonic-gate 		/*
14337c478bd9Sstevel@tonic-gate 		 * See if there is anything more to do.  If not, announce
14347c478bd9Sstevel@tonic-gate 		 * that we are done and exit.
14357c478bd9Sstevel@tonic-gate 		 *
14367c478bd9Sstevel@tonic-gate 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
14377c478bd9Sstevel@tonic-gate 		 * mi_recovlock before mi_lock to preserve lock ordering.
14387c478bd9Sstevel@tonic-gate 		 */
14397c478bd9Sstevel@tonic-gate 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
14407c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
14417c478bd9Sstevel@tonic-gate 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
14427c478bd9Sstevel@tonic-gate 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
14437c478bd9Sstevel@tonic-gate 			list_t local_lost_state;
14447c478bd9Sstevel@tonic-gate 			nfs4_lost_rqst_t *lrp;
14457c478bd9Sstevel@tonic-gate 
14467c478bd9Sstevel@tonic-gate 			/*
14477c478bd9Sstevel@tonic-gate 			 * We need to remove the lost requests before we
14487c478bd9Sstevel@tonic-gate 			 * unmark the mi as no longer doing recovery to
14497c478bd9Sstevel@tonic-gate 			 * avoid a race with a new thread putting new lost
14507c478bd9Sstevel@tonic-gate 			 * requests on the same mi (and the going away
14517c478bd9Sstevel@tonic-gate 			 * thread would remove the new lost requests).
14527c478bd9Sstevel@tonic-gate 			 *
14537c478bd9Sstevel@tonic-gate 			 * Move the lost requests to a local list since
14547c478bd9Sstevel@tonic-gate 			 * nfs4_remove_lost_rqst() drops mi_lock, and
14557c478bd9Sstevel@tonic-gate 			 * dropping the mi_lock would make our check to
14567c478bd9Sstevel@tonic-gate 			 * see if recovery is done no longer valid.
14577c478bd9Sstevel@tonic-gate 			 */
14587c478bd9Sstevel@tonic-gate 			list_create(&local_lost_state,
14597c478bd9Sstevel@tonic-gate 			    sizeof (nfs4_lost_rqst_t),
14607c478bd9Sstevel@tonic-gate 			    offsetof(nfs4_lost_rqst_t, lr_node));
14617c478bd9Sstevel@tonic-gate 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
14627c478bd9Sstevel@tonic-gate 
14637c478bd9Sstevel@tonic-gate 			done = 1;
14647c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
14657c478bd9Sstevel@tonic-gate 			/*
14667c478bd9Sstevel@tonic-gate 			 * Now officially free the "moved"
14677c478bd9Sstevel@tonic-gate 			 * lost requests.
14687c478bd9Sstevel@tonic-gate 			 */
14697c478bd9Sstevel@tonic-gate 			while ((lrp = list_head(&local_lost_state)) != NULL) {
14707c478bd9Sstevel@tonic-gate 				list_remove(&local_lost_state, lrp);
14717c478bd9Sstevel@tonic-gate 				nfs4_free_lost_rqst(lrp, sp);
14727c478bd9Sstevel@tonic-gate 			}
14737c478bd9Sstevel@tonic-gate 			list_destroy(&local_lost_state);
14747c478bd9Sstevel@tonic-gate 		} else
14757c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
14767c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&mi->mi_recovlock);
14777c478bd9Sstevel@tonic-gate 
14787c478bd9Sstevel@tonic-gate 		/*
14797c478bd9Sstevel@tonic-gate 		 * If the filesystem has been forcibly unmounted, there is
14807c478bd9Sstevel@tonic-gate 		 * probably no point in retrying immediately.  Furthermore,
14817c478bd9Sstevel@tonic-gate 		 * there might be user processes waiting for a chance to
14827c478bd9Sstevel@tonic-gate 		 * queue up "lost state" requests, so that they can exit.
14837c478bd9Sstevel@tonic-gate 		 * So pause here for a moment.  Same logic for zone shutdown.
14847c478bd9Sstevel@tonic-gate 		 */
14857c478bd9Sstevel@tonic-gate 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
14867c478bd9Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
14877c478bd9Sstevel@tonic-gate 			cv_broadcast(&mi->mi_failover_cv);
14887c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
14897c478bd9Sstevel@tonic-gate 			delay(SEC_TO_TICK(nfs4_unmount_delay));
14907c478bd9Sstevel@tonic-gate 		}
14917c478bd9Sstevel@tonic-gate 
14927c478bd9Sstevel@tonic-gate 	} while (!done);
14937c478bd9Sstevel@tonic-gate 
14947c478bd9Sstevel@tonic-gate 	if (sp != NULL)
14957c478bd9Sstevel@tonic-gate 		nfs4_server_rele(sp);
14967c478bd9Sstevel@tonic-gate 
14977c478bd9Sstevel@tonic-gate 	/*
14987c478bd9Sstevel@tonic-gate 	 * Return all recalled delegations
14997c478bd9Sstevel@tonic-gate 	 */
15007c478bd9Sstevel@tonic-gate 	nfs4_dlistclean();
15017c478bd9Sstevel@tonic-gate 
1502e749d04dSjwahlig 	mutex_enter(&mi->mi_lock);
1503e749d04dSjwahlig 	recov_done(mi, recovp);
1504e749d04dSjwahlig 	mutex_exit(&mi->mi_lock);
1505e749d04dSjwahlig 
15067c478bd9Sstevel@tonic-gate 	/*
15077c478bd9Sstevel@tonic-gate 	 * Free up resources that were allocated for us.
15087c478bd9Sstevel@tonic-gate 	 */
15097c478bd9Sstevel@tonic-gate 	if (recovp->rc_vp1 != NULL)
15107c478bd9Sstevel@tonic-gate 		VN_RELE(recovp->rc_vp1);
15117c478bd9Sstevel@tonic-gate 	if (recovp->rc_vp2 != NULL)
15127c478bd9Sstevel@tonic-gate 		VN_RELE(recovp->rc_vp2);
1513d7d95b9aSjwahlig 
1514e749d04dSjwahlig 	/* now we are done using the mi struct, signal the waiters */
1515e749d04dSjwahlig 	mutex_enter(&mi->mi_lock);
1516e749d04dSjwahlig 	mi->mi_in_recovery--;
1517e749d04dSjwahlig 	if (mi->mi_in_recovery == 0)
1518e749d04dSjwahlig 		cv_broadcast(&mi->mi_cv_in_recov);
1519e749d04dSjwahlig 	mutex_exit(&mi->mi_lock);
1520e749d04dSjwahlig 
152150a83466Sjwahlig 	VFS_RELE(mi->mi_vfsp);
152250a83466Sjwahlig 	MI4_RELE(mi);
15237c478bd9Sstevel@tonic-gate 	kmem_free(recovp, sizeof (recov_info_t));
15247c478bd9Sstevel@tonic-gate 	mutex_enter(&cpr_lock);
15257c478bd9Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cpr_info);
15267c478bd9Sstevel@tonic-gate 	mutex_destroy(&cpr_lock);
15277c478bd9Sstevel@tonic-gate 	zthread_exit();
15287c478bd9Sstevel@tonic-gate }
15297c478bd9Sstevel@tonic-gate 
15307c478bd9Sstevel@tonic-gate /*
15317c478bd9Sstevel@tonic-gate  * Log the end of recovery and notify any waiting threads.
15327c478bd9Sstevel@tonic-gate  */
15337c478bd9Sstevel@tonic-gate 
15347c478bd9Sstevel@tonic-gate static void
recov_done(mntinfo4_t * mi,recov_info_t * recovp)15357c478bd9Sstevel@tonic-gate recov_done(mntinfo4_t *mi, recov_info_t *recovp)
15367c478bd9Sstevel@tonic-gate {
15377c478bd9Sstevel@tonic-gate 
15387c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&mi->mi_lock));
15397c478bd9Sstevel@tonic-gate 
15407c478bd9Sstevel@tonic-gate 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1541b9238976Sth 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
15427c478bd9Sstevel@tonic-gate 	mi->mi_recovthread = NULL;
15437c478bd9Sstevel@tonic-gate 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
15447c478bd9Sstevel@tonic-gate 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
15457c478bd9Sstevel@tonic-gate 	cv_broadcast(&mi->mi_failover_cv);
15467c478bd9Sstevel@tonic-gate }
15477c478bd9Sstevel@tonic-gate 
15487c478bd9Sstevel@tonic-gate /*
15497c478bd9Sstevel@tonic-gate  * State-specific recovery routines, by state.
15507c478bd9Sstevel@tonic-gate  */
15517c478bd9Sstevel@tonic-gate 
15527c478bd9Sstevel@tonic-gate /*
15537c478bd9Sstevel@tonic-gate  * Failover.
15547c478bd9Sstevel@tonic-gate  *
15557c478bd9Sstevel@tonic-gate  * Replaces *spp with a reference to the new server, which must
15567c478bd9Sstevel@tonic-gate  * eventually be freed.
15577c478bd9Sstevel@tonic-gate  */
15587c478bd9Sstevel@tonic-gate 
15597c478bd9Sstevel@tonic-gate static void
recov_newserver(recov_info_t * recovp,nfs4_server_t ** spp,bool_t * recov_fail)15607c478bd9Sstevel@tonic-gate recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
15617c478bd9Sstevel@tonic-gate {
15627c478bd9Sstevel@tonic-gate 	mntinfo4_t *mi = recovp->rc_mi;
15637c478bd9Sstevel@tonic-gate 	servinfo4_t *svp = NULL;
15647c478bd9Sstevel@tonic-gate 	nfs4_server_t *osp = *spp;
15657c478bd9Sstevel@tonic-gate 	CLIENT *cl;
15667c478bd9Sstevel@tonic-gate 	enum clnt_stat status;
15677c478bd9Sstevel@tonic-gate 	struct timeval tv;
15687c478bd9Sstevel@tonic-gate 	int error;
15697c478bd9Sstevel@tonic-gate 	int oncethru = 0;
15707c478bd9Sstevel@tonic-gate 	rnode4_t *rp;
15717c478bd9Sstevel@tonic-gate 	int index;
15727c478bd9Sstevel@tonic-gate 	nfs_fh4 fh;
15737c478bd9Sstevel@tonic-gate 	char *snames;
15747c478bd9Sstevel@tonic-gate 	size_t len;
15757c478bd9Sstevel@tonic-gate 
15767c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
15777c478bd9Sstevel@tonic-gate 
15787c478bd9Sstevel@tonic-gate 	tv.tv_sec = 2;
15797c478bd9Sstevel@tonic-gate 	tv.tv_usec = 0;
15807c478bd9Sstevel@tonic-gate 
15817c478bd9Sstevel@tonic-gate #ifdef lint
15827c478bd9Sstevel@tonic-gate 	/*
15837c478bd9Sstevel@tonic-gate 	 * Lint can't follow the logic, so thinks that snames and len
15847c478bd9Sstevel@tonic-gate 	 * can be used before being set.  They can't, but lint can't
15857c478bd9Sstevel@tonic-gate 	 * figure it out.  To address the lint warning, initialize
15867c478bd9Sstevel@tonic-gate 	 * snames and len for lint.
15877c478bd9Sstevel@tonic-gate 	 */
15887c478bd9Sstevel@tonic-gate 	snames = NULL;
15897c478bd9Sstevel@tonic-gate 	len = 0;
15907c478bd9Sstevel@tonic-gate #endif
15917c478bd9Sstevel@tonic-gate 
15927c478bd9Sstevel@tonic-gate 	/*
15937c478bd9Sstevel@tonic-gate 	 * Ping the null NFS procedure of every server in
15947c478bd9Sstevel@tonic-gate 	 * the list until one responds.  We always start
15957c478bd9Sstevel@tonic-gate 	 * at the head of the list and always skip the one
15967c478bd9Sstevel@tonic-gate 	 * that is current, since it's caused us a problem.
15977c478bd9Sstevel@tonic-gate 	 */
15987c478bd9Sstevel@tonic-gate 	while (svp == NULL) {
15997c478bd9Sstevel@tonic-gate 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
16007c478bd9Sstevel@tonic-gate 
16017c478bd9Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
16027c478bd9Sstevel@tonic-gate 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
16037c478bd9Sstevel@tonic-gate 				mi->mi_flags |= MI4_RECOV_FAIL;
16047c478bd9Sstevel@tonic-gate 				mutex_exit(&mi->mi_lock);
16057c478bd9Sstevel@tonic-gate 				(void) nfs_rw_exit(&mi->mi_recovlock);
16067c478bd9Sstevel@tonic-gate 				*recov_fail = TRUE;
16077c478bd9Sstevel@tonic-gate 				if (oncethru)
16087c478bd9Sstevel@tonic-gate 					kmem_free(snames, len);
16097c478bd9Sstevel@tonic-gate 				return;
16107c478bd9Sstevel@tonic-gate 			}
16117c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
16127c478bd9Sstevel@tonic-gate 
16137c478bd9Sstevel@tonic-gate 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
16147c478bd9Sstevel@tonic-gate 			if (svp->sv_flags & SV4_NOTINUSE) {
16157c478bd9Sstevel@tonic-gate 				nfs_rw_exit(&svp->sv_lock);
16167c478bd9Sstevel@tonic-gate 				continue;
16177c478bd9Sstevel@tonic-gate 			}
16187c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&svp->sv_lock);
16197c478bd9Sstevel@tonic-gate 
16207c478bd9Sstevel@tonic-gate 			if (!oncethru && svp == mi->mi_curr_serv)
16217c478bd9Sstevel@tonic-gate 				continue;
16227c478bd9Sstevel@tonic-gate 
16237c478bd9Sstevel@tonic-gate 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
16247c478bd9Sstevel@tonic-gate 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
16257c478bd9Sstevel@tonic-gate 			if (error)
16267c478bd9Sstevel@tonic-gate 				continue;
16277c478bd9Sstevel@tonic-gate 
16287c478bd9Sstevel@tonic-gate 			if (!(mi->mi_flags & MI4_INT))
16297c478bd9Sstevel@tonic-gate 				cl->cl_nosignal = TRUE;
16307c478bd9Sstevel@tonic-gate 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
16317c478bd9Sstevel@tonic-gate 			    xdr_void, NULL, tv);
16327c478bd9Sstevel@tonic-gate 			if (!(mi->mi_flags & MI4_INT))
16337c478bd9Sstevel@tonic-gate 				cl->cl_nosignal = FALSE;
16347c478bd9Sstevel@tonic-gate 			AUTH_DESTROY(cl->cl_auth);
16357c478bd9Sstevel@tonic-gate 			CLNT_DESTROY(cl);
16367c478bd9Sstevel@tonic-gate 			if (status == RPC_SUCCESS) {
16377c478bd9Sstevel@tonic-gate 				nfs4_queue_event(RE_FAILOVER, mi,
16387c478bd9Sstevel@tonic-gate 				    svp == mi->mi_curr_serv ? NULL :
16397c478bd9Sstevel@tonic-gate 				    svp->sv_hostname, 0, NULL, NULL, 0,
16407c478bd9Sstevel@tonic-gate 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
16417c478bd9Sstevel@tonic-gate 				break;
16427c478bd9Sstevel@tonic-gate 			}
16437c478bd9Sstevel@tonic-gate 		}
16447c478bd9Sstevel@tonic-gate 
16457c478bd9Sstevel@tonic-gate 		if (svp == NULL) {
16467c478bd9Sstevel@tonic-gate 			if (!oncethru) {
16477c478bd9Sstevel@tonic-gate 				snames = nfs4_getsrvnames(mi, &len);
16487c478bd9Sstevel@tonic-gate 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
16497c478bd9Sstevel@tonic-gate 				    0, 0, 0, FALSE, snames, 0, NULL);
16507c478bd9Sstevel@tonic-gate 				oncethru = 1;
16517c478bd9Sstevel@tonic-gate 			}
16527c478bd9Sstevel@tonic-gate 			delay(hz);
16537c478bd9Sstevel@tonic-gate 		}
16547c478bd9Sstevel@tonic-gate 	}
16557c478bd9Sstevel@tonic-gate 
16567c478bd9Sstevel@tonic-gate 	if (oncethru) {
16577c478bd9Sstevel@tonic-gate 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
16587c478bd9Sstevel@tonic-gate 		    0, NULL);
16597c478bd9Sstevel@tonic-gate 		kmem_free(snames, len);
16607c478bd9Sstevel@tonic-gate 	}
16617c478bd9Sstevel@tonic-gate 
16627c478bd9Sstevel@tonic-gate #if DEBUG
16637c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
16647c478bd9Sstevel@tonic-gate 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
16657c478bd9Sstevel@tonic-gate 	nfs_rw_exit(&svp->sv_lock);
16667c478bd9Sstevel@tonic-gate #endif
16677c478bd9Sstevel@tonic-gate 
16687c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
16697c478bd9Sstevel@tonic-gate 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
16707c478bd9Sstevel@tonic-gate 	if (svp != mi->mi_curr_serv) {
16717c478bd9Sstevel@tonic-gate 		servinfo4_t *osvp = mi->mi_curr_serv;
16727c478bd9Sstevel@tonic-gate 
16737c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
16747c478bd9Sstevel@tonic-gate 
16757c478bd9Sstevel@tonic-gate 		/*
16767c478bd9Sstevel@tonic-gate 		 * Update server-dependent fields in the root vnode.
16777c478bd9Sstevel@tonic-gate 		 */
16787c478bd9Sstevel@tonic-gate 		index = rtable4hash(mi->mi_rootfh);
16797c478bd9Sstevel@tonic-gate 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
16807c478bd9Sstevel@tonic-gate 
16817c478bd9Sstevel@tonic-gate 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
16827c478bd9Sstevel@tonic-gate 		if (rp != NULL) {
16837c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
16847c478bd9Sstevel@tonic-gate 			    "recov_newserver: remapping %s", rnode4info(rp)));
16857c478bd9Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
16867c478bd9Sstevel@tonic-gate 			rp->r_server = svp;
16877c478bd9Sstevel@tonic-gate 			PURGE_ATTRCACHE4_LOCKED(rp);
16887c478bd9Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
16897c478bd9Sstevel@tonic-gate 			(void) nfs4_free_data_reclaim(rp);
16907c478bd9Sstevel@tonic-gate 			nfs4_purge_rddir_cache(RTOV4(rp));
16917c478bd9Sstevel@tonic-gate 			rw_exit(&rtable4[index].r_lock);
16927c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
16937c478bd9Sstevel@tonic-gate 			    "recov_newserver: done with %s",
16947c478bd9Sstevel@tonic-gate 			    rnode4info(rp)));
16957c478bd9Sstevel@tonic-gate 			VN_RELE(RTOV4(rp));
16967c478bd9Sstevel@tonic-gate 		} else
16977c478bd9Sstevel@tonic-gate 			rw_exit(&rtable4[index].r_lock);
16987c478bd9Sstevel@tonic-gate 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
16997c478bd9Sstevel@tonic-gate 
17007c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
17017c478bd9Sstevel@tonic-gate 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
17027c478bd9Sstevel@tonic-gate 		if (recovp->rc_srv_reboot)
17037c478bd9Sstevel@tonic-gate 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
17047c478bd9Sstevel@tonic-gate 		mi->mi_curr_serv = svp;
17057c478bd9Sstevel@tonic-gate 		mi->mi_failover++;
17067c478bd9Sstevel@tonic-gate 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
17077c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
17087c478bd9Sstevel@tonic-gate 
17097c478bd9Sstevel@tonic-gate 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
17107c478bd9Sstevel@tonic-gate 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
17117c478bd9Sstevel@tonic-gate 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
17127c478bd9Sstevel@tonic-gate 		sfh4_update(mi->mi_rootfh, &fh);
17137c478bd9Sstevel@tonic-gate 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
17147c478bd9Sstevel@tonic-gate 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
17157c478bd9Sstevel@tonic-gate 		sfh4_update(mi->mi_srvparentfh, &fh);
17167c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&svp->sv_lock);
17177c478bd9Sstevel@tonic-gate 
17187c478bd9Sstevel@tonic-gate 		*spp = nfs4_move_mi(mi, osvp, svp);
17197c478bd9Sstevel@tonic-gate 		if (osp != NULL)
17207c478bd9Sstevel@tonic-gate 			nfs4_server_rele(osp);
17217c478bd9Sstevel@tonic-gate 	} else
17227c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
17237c478bd9Sstevel@tonic-gate 	(void) nfs_rw_exit(&mi->mi_recovlock);
17247c478bd9Sstevel@tonic-gate }
17257c478bd9Sstevel@tonic-gate 
17267c478bd9Sstevel@tonic-gate /*
17277c478bd9Sstevel@tonic-gate  * Clientid.
17287c478bd9Sstevel@tonic-gate  */
17297c478bd9Sstevel@tonic-gate 
17307c478bd9Sstevel@tonic-gate static void
recov_clientid(recov_info_t * recovp,nfs4_server_t * sp)17317c478bd9Sstevel@tonic-gate recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
17327c478bd9Sstevel@tonic-gate {
17337c478bd9Sstevel@tonic-gate 	mntinfo4_t *mi = recovp->rc_mi;
17347c478bd9Sstevel@tonic-gate 	int error = 0;
17357c478bd9Sstevel@tonic-gate 	int still_stale;
17367c478bd9Sstevel@tonic-gate 	int need_new_s;
17377c478bd9Sstevel@tonic-gate 
17387c478bd9Sstevel@tonic-gate 	ASSERT(sp != NULL);
17397c478bd9Sstevel@tonic-gate 
17407c478bd9Sstevel@tonic-gate 	/*
17417c478bd9Sstevel@tonic-gate 	 * Acquire the recovery lock and then verify that the clientid
17427c478bd9Sstevel@tonic-gate 	 * still needs to be recovered.  (Note that s_recovlock is supposed
17437c478bd9Sstevel@tonic-gate 	 * to be acquired before s_lock.)  Since the thread holds the
17447c478bd9Sstevel@tonic-gate 	 * recovery lock, no other thread will recover the clientid.
17457c478bd9Sstevel@tonic-gate 	 */
17467c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
17477c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
17487c478bd9Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
17497c478bd9Sstevel@tonic-gate 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
17507c478bd9Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
17517c478bd9Sstevel@tonic-gate 
17527c478bd9Sstevel@tonic-gate 	if (still_stale) {
17537c478bd9Sstevel@tonic-gate 		nfs4_error_t n4e;
17547c478bd9Sstevel@tonic-gate 
17557c478bd9Sstevel@tonic-gate 		nfs4_error_zinit(&n4e);
17567c478bd9Sstevel@tonic-gate 		nfs4setclientid(mi, kcred, TRUE, &n4e);
17577c478bd9Sstevel@tonic-gate 		error = n4e.error;
17587c478bd9Sstevel@tonic-gate 		if (error != 0) {
17597c478bd9Sstevel@tonic-gate 
17607c478bd9Sstevel@tonic-gate 			/*
17617c478bd9Sstevel@tonic-gate 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
17627c478bd9Sstevel@tonic-gate 			 * if so, just return and let recov_thread drive
17637c478bd9Sstevel@tonic-gate 			 * failover.
17647c478bd9Sstevel@tonic-gate 			 */
17657c478bd9Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
17667c478bd9Sstevel@tonic-gate 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
17677c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
17687c478bd9Sstevel@tonic-gate 
17697c478bd9Sstevel@tonic-gate 			if (need_new_s) {
17707c478bd9Sstevel@tonic-gate 				nfs_rw_exit(&mi->mi_recovlock);
17717c478bd9Sstevel@tonic-gate 				nfs_rw_exit(&sp->s_recovlock);
17727c478bd9Sstevel@tonic-gate 				return;
17737c478bd9Sstevel@tonic-gate 			}
17747c478bd9Sstevel@tonic-gate 
17757c478bd9Sstevel@tonic-gate 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
17767c478bd9Sstevel@tonic-gate 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
17777c478bd9Sstevel@tonic-gate 			mutex_enter(&mi->mi_lock);
17787c478bd9Sstevel@tonic-gate 			mi->mi_flags |= MI4_RECOV_FAIL;
17797c478bd9Sstevel@tonic-gate 			mi->mi_error = recovp->rc_error;
17807c478bd9Sstevel@tonic-gate 			mutex_exit(&mi->mi_lock);
17817c478bd9Sstevel@tonic-gate 			/* don't destroy the nfs4_server, let umount do it */
17827c478bd9Sstevel@tonic-gate 		}
17837c478bd9Sstevel@tonic-gate 	}
17847c478bd9Sstevel@tonic-gate 
17857c478bd9Sstevel@tonic-gate 	if (error == 0) {
17867c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
17877c478bd9Sstevel@tonic-gate 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
17887c478bd9Sstevel@tonic-gate 		/*
17897c478bd9Sstevel@tonic-gate 		 * If still_stale isn't true, then another thread already
17907c478bd9Sstevel@tonic-gate 		 * recovered the clientid.  And that thread that set the
17917c478bd9Sstevel@tonic-gate 		 * clientid will have initiated reopening files on all the
17927c478bd9Sstevel@tonic-gate 		 * filesystems for the server, so we should not initiate
17937c478bd9Sstevel@tonic-gate 		 * reopening for this filesystem here.
17947c478bd9Sstevel@tonic-gate 		 */
17957c478bd9Sstevel@tonic-gate 		if (still_stale) {
17967c478bd9Sstevel@tonic-gate 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
17977c478bd9Sstevel@tonic-gate 			if (recovp->rc_srv_reboot)
17987c478bd9Sstevel@tonic-gate 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
17997c478bd9Sstevel@tonic-gate 		}
18007c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
18017c478bd9Sstevel@tonic-gate 	}
18027c478bd9Sstevel@tonic-gate 
18037c478bd9Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_recovlock);
18047c478bd9Sstevel@tonic-gate 
18057c478bd9Sstevel@tonic-gate 	if (error != 0) {
18067c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&sp->s_recovlock);
18077c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
18087c478bd9Sstevel@tonic-gate 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
18097c478bd9Sstevel@tonic-gate 			delay(SEC_TO_TICK(recov_err_delay));
18107c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
18117c478bd9Sstevel@tonic-gate 	} else {
18127c478bd9Sstevel@tonic-gate 		mntinfo4_t **milist;
18137c478bd9Sstevel@tonic-gate 		mntinfo4_t *tmi;
18147c478bd9Sstevel@tonic-gate 		int nummi, i;
18157c478bd9Sstevel@tonic-gate 
18167c478bd9Sstevel@tonic-gate 		/*
18177c478bd9Sstevel@tonic-gate 		 * Initiate recovery of open files for other filesystems.
18187c478bd9Sstevel@tonic-gate 		 * We create an array of filesystems, rather than just
18197c478bd9Sstevel@tonic-gate 		 * walking the filesystem list, to avoid deadlock issues
18207c478bd9Sstevel@tonic-gate 		 * with s_lock and mi_recovlock.
18217c478bd9Sstevel@tonic-gate 		 */
18227c478bd9Sstevel@tonic-gate 		milist = make_milist(sp, &nummi);
18237c478bd9Sstevel@tonic-gate 		for (i = 0; i < nummi; i++) {
18247c478bd9Sstevel@tonic-gate 			tmi = milist[i];
18257c478bd9Sstevel@tonic-gate 			if (tmi != mi) {
18267c478bd9Sstevel@tonic-gate 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1827b9238976Sth 				    RW_READER, 0);
18287c478bd9Sstevel@tonic-gate 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
1829b9238976Sth 				    NULL, NULL);
18307c478bd9Sstevel@tonic-gate 				nfs_rw_exit(&tmi->mi_recovlock);
18317c478bd9Sstevel@tonic-gate 			}
18327c478bd9Sstevel@tonic-gate 		}
18337c478bd9Sstevel@tonic-gate 		free_milist(milist, nummi);
18347c478bd9Sstevel@tonic-gate 
18357c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&sp->s_recovlock);
18367c478bd9Sstevel@tonic-gate 	}
18377c478bd9Sstevel@tonic-gate }
18387c478bd9Sstevel@tonic-gate 
18397c478bd9Sstevel@tonic-gate /*
18407c478bd9Sstevel@tonic-gate  * Return an array of filesystems associated with the given server.  The
18417c478bd9Sstevel@tonic-gate  * caller should call free_milist() to free the references and memory.
18427c478bd9Sstevel@tonic-gate  */
18437c478bd9Sstevel@tonic-gate 
18447c478bd9Sstevel@tonic-gate static mntinfo4_t **
make_milist(nfs4_server_t * sp,int * nummip)18457c478bd9Sstevel@tonic-gate make_milist(nfs4_server_t *sp, int *nummip)
18467c478bd9Sstevel@tonic-gate {
18477c478bd9Sstevel@tonic-gate 	int nummi, i;
18487c478bd9Sstevel@tonic-gate 	mntinfo4_t **milist;
18497c478bd9Sstevel@tonic-gate 	mntinfo4_t *tmi;
18507c478bd9Sstevel@tonic-gate 
18517c478bd9Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
18527c478bd9Sstevel@tonic-gate 	nummi = 0;
18537c478bd9Sstevel@tonic-gate 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
18547c478bd9Sstevel@tonic-gate 		nummi++;
18557c478bd9Sstevel@tonic-gate 
185638e20649Sjwahlig 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
18577c478bd9Sstevel@tonic-gate 
18587c478bd9Sstevel@tonic-gate 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
18597c478bd9Sstevel@tonic-gate 	    tmi = tmi->mi_clientid_next) {
18607c478bd9Sstevel@tonic-gate 		milist[i] = tmi;
18617c478bd9Sstevel@tonic-gate 		VFS_HOLD(tmi->mi_vfsp);
18627c478bd9Sstevel@tonic-gate 	}
18637c478bd9Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
18647c478bd9Sstevel@tonic-gate 
18657c478bd9Sstevel@tonic-gate 	*nummip = nummi;
18667c478bd9Sstevel@tonic-gate 	return (milist);
18677c478bd9Sstevel@tonic-gate }
18687c478bd9Sstevel@tonic-gate 
18697c478bd9Sstevel@tonic-gate /*
18707c478bd9Sstevel@tonic-gate  * Free the filesystem list created by make_milist().
18717c478bd9Sstevel@tonic-gate  */
18727c478bd9Sstevel@tonic-gate 
18737c478bd9Sstevel@tonic-gate static void
free_milist(mntinfo4_t ** milist,int nummi)18747c478bd9Sstevel@tonic-gate free_milist(mntinfo4_t **milist, int nummi)
18757c478bd9Sstevel@tonic-gate {
18767c478bd9Sstevel@tonic-gate 	mntinfo4_t *tmi;
18777c478bd9Sstevel@tonic-gate 	int i;
18787c478bd9Sstevel@tonic-gate 
18797c478bd9Sstevel@tonic-gate 	for (i = 0; i < nummi; i++) {
18807c478bd9Sstevel@tonic-gate 		tmi = milist[i];
18817c478bd9Sstevel@tonic-gate 		VFS_RELE(tmi->mi_vfsp);
18827c478bd9Sstevel@tonic-gate 	}
18837c478bd9Sstevel@tonic-gate 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
18847c478bd9Sstevel@tonic-gate }
18857c478bd9Sstevel@tonic-gate 
18867c478bd9Sstevel@tonic-gate /*
18877c478bd9Sstevel@tonic-gate  * Filehandle
18887c478bd9Sstevel@tonic-gate  */
18897c478bd9Sstevel@tonic-gate 
18907c478bd9Sstevel@tonic-gate /*
18917c478bd9Sstevel@tonic-gate  * Lookup the filehandle for the given vnode and update the rnode if it has
18927c478bd9Sstevel@tonic-gate  * changed.
18937c478bd9Sstevel@tonic-gate  *
18947c478bd9Sstevel@tonic-gate  * Errors:
18957c478bd9Sstevel@tonic-gate  * - if the filehandle could not be updated because of an error that
18967c478bd9Sstevel@tonic-gate  *   requires further recovery, initiate that recovery and return.
18977c478bd9Sstevel@tonic-gate  * - if the filehandle could not be updated because of a signal, pretend we
18987c478bd9Sstevel@tonic-gate  *   succeeded and let someone else deal with it.
18997c478bd9Sstevel@tonic-gate  * - if the filehandle could not be updated and the filesystem has been
19007c478bd9Sstevel@tonic-gate  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
19017c478bd9Sstevel@tonic-gate  *   the forced unmount (to retry or not to retry, that is the question).
19027c478bd9Sstevel@tonic-gate  * - if the filehandle could not be updated because of some other error,
19037c478bd9Sstevel@tonic-gate  *   mark the rnode bad and return.
19047c478bd9Sstevel@tonic-gate  */
19057c478bd9Sstevel@tonic-gate static void
recov_filehandle(nfs4_recov_t action,mntinfo4_t * mi,vnode_t * vp)19067c478bd9Sstevel@tonic-gate recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
19077c478bd9Sstevel@tonic-gate {
19087c478bd9Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
19097c478bd9Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
19107c478bd9Sstevel@tonic-gate 	bool_t needrecov;
19117c478bd9Sstevel@tonic-gate 
19127c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
19137c478bd9Sstevel@tonic-gate 
19147c478bd9Sstevel@tonic-gate 	if (rp->r_flags & R4RECOVERR) {
19157c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
19167c478bd9Sstevel@tonic-gate 		return;
19177c478bd9Sstevel@tonic-gate 	}
19187c478bd9Sstevel@tonic-gate 
19197c478bd9Sstevel@tonic-gate 	/*
19207c478bd9Sstevel@tonic-gate 	 * If someone else is updating the filehandle, wait for them to
19217c478bd9Sstevel@tonic-gate 	 * finish and then let our caller retry.
19227c478bd9Sstevel@tonic-gate 	 */
19237c478bd9Sstevel@tonic-gate 	if (rp->r_flags & R4RECEXPFH) {
19247c478bd9Sstevel@tonic-gate 		while (rp->r_flags & R4RECEXPFH) {
19257c478bd9Sstevel@tonic-gate 			cv_wait(&rp->r_cv, &rp->r_statelock);
19267c478bd9Sstevel@tonic-gate 		}
19277c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
19287c478bd9Sstevel@tonic-gate 		return;
19297c478bd9Sstevel@tonic-gate 	}
19307c478bd9Sstevel@tonic-gate 	rp->r_flags |= R4RECEXPFH;
19317c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
19327c478bd9Sstevel@tonic-gate 
19337c478bd9Sstevel@tonic-gate 	if (action == NR_BADHANDLE) {
19347c478bd9Sstevel@tonic-gate 		/* shouldn't happen */
19357c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
19367c478bd9Sstevel@tonic-gate 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
19377c478bd9Sstevel@tonic-gate 	}
19387c478bd9Sstevel@tonic-gate 
19397c478bd9Sstevel@tonic-gate 	nfs4_remap_file(mi, vp, 0, &e);
19407c478bd9Sstevel@tonic-gate 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
19417c478bd9Sstevel@tonic-gate 
19427c478bd9Sstevel@tonic-gate 	/*
1943ddbc368aSRick Mesta 	 * If we get BADHANDLE, FHEXPIRED or STALE in their handler,
1944ddbc368aSRick Mesta 	 * something is broken. Don't try to recover, just mark the
1945ddbc368aSRick Mesta 	 * file dead.
19467c478bd9Sstevel@tonic-gate 	 */
1947ddbc368aSRick Mesta 	DTRACE_PROBE2(recov__filehandle, nfs4_error_t, &e, vnode_t, vp);
19487c478bd9Sstevel@tonic-gate 	if (needrecov) {
1949ddbc368aSRick Mesta 		if (e.error == 0) {
1950ddbc368aSRick Mesta 			switch (e.stat) {
1951ddbc368aSRick Mesta 			case NFS4ERR_BADHANDLE:
1952ddbc368aSRick Mesta 			case NFS4ERR_FHEXPIRED:
1953ddbc368aSRick Mesta 			case NFS4ERR_STALE:
1954ddbc368aSRick Mesta 				goto norec;	/* Unrecoverable errors */
1955ddbc368aSRick Mesta 			default:
1956ddbc368aSRick Mesta 				break;
1957ddbc368aSRick Mesta 			}
1958ddbc368aSRick Mesta 		}
1959ddbc368aSRick Mesta 		(void) nfs4_start_recovery(&e, mi, vp, NULL,
1960ddbc368aSRick Mesta 		    NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
1961ddbc368aSRick Mesta 
19627c478bd9Sstevel@tonic-gate 	} else if (e.error != EINTR &&
19637c478bd9Sstevel@tonic-gate 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
19647c478bd9Sstevel@tonic-gate 	    (e.error != 0 || e.stat != NFS4_OK)) {
19657c478bd9Sstevel@tonic-gate 		nfs4_recov_fh_fail(vp, e.error, e.stat);
19667c478bd9Sstevel@tonic-gate 		/*
1967ddbc368aSRick Mesta 		 * Don't set r_error to ESTALE. Higher-level code (e.g.,
19687c478bd9Sstevel@tonic-gate 		 * cstatat_getvp()) retries on ESTALE, which would cause
19697c478bd9Sstevel@tonic-gate 		 * an infinite loop.
19707c478bd9Sstevel@tonic-gate 		 */
19717c478bd9Sstevel@tonic-gate 	}
1972ddbc368aSRick Mesta norec:
19737c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
19747c478bd9Sstevel@tonic-gate 	rp->r_flags &= ~R4RECEXPFH;
19757c478bd9Sstevel@tonic-gate 	cv_broadcast(&rp->r_cv);
19767c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
19777c478bd9Sstevel@tonic-gate }
19787c478bd9Sstevel@tonic-gate 
19797c478bd9Sstevel@tonic-gate /*
19807c478bd9Sstevel@tonic-gate  * Stale Filehandle
19817c478bd9Sstevel@tonic-gate  */
19827c478bd9Sstevel@tonic-gate 
19837c478bd9Sstevel@tonic-gate /*
19847c478bd9Sstevel@tonic-gate  * A stale filehandle can happen when an individual file has
19857c478bd9Sstevel@tonic-gate  * been removed, or when an entire filesystem has been taken
19867c478bd9Sstevel@tonic-gate  * offline.  To distinguish these cases, we do this:
19877c478bd9Sstevel@tonic-gate  * - if a GETATTR with the current filehandle is okay, we do
19887c478bd9Sstevel@tonic-gate  *   nothing (this can happen with two-filehandle ops)
19897c478bd9Sstevel@tonic-gate  * - if the GETATTR fails, but a GETATTR of the root filehandle
19907c478bd9Sstevel@tonic-gate  *   succeeds, mark the rnode with R4STALE, which will stop use
19917c478bd9Sstevel@tonic-gate  * - if the GETATTR fails, and a GETATTR of the root filehandle
19927c478bd9Sstevel@tonic-gate  *   also fails, we consider the problem filesystem-wide, so:
19937c478bd9Sstevel@tonic-gate  *   - if we can failover, we should
19947c478bd9Sstevel@tonic-gate  *   - if we can't failover, we should mark both the original
19957c478bd9Sstevel@tonic-gate  *     vnode and the root bad
19967c478bd9Sstevel@tonic-gate  */
19977c478bd9Sstevel@tonic-gate static void
recov_stale(mntinfo4_t * mi,vnode_t * vp)19987c478bd9Sstevel@tonic-gate recov_stale(mntinfo4_t *mi, vnode_t *vp)
19997c478bd9Sstevel@tonic-gate {
20007c478bd9Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
20017c478bd9Sstevel@tonic-gate 	vnode_t *rootvp = NULL;
20027c478bd9Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
20037c478bd9Sstevel@tonic-gate 	nfs4_ga_res_t gar;
20047c478bd9Sstevel@tonic-gate 	char *fail_msg = "failed to recover from NFS4ERR_STALE";
20057c478bd9Sstevel@tonic-gate 	bool_t needrecov;
20067c478bd9Sstevel@tonic-gate 
20077c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
20087c478bd9Sstevel@tonic-gate 
20097c478bd9Sstevel@tonic-gate 	if (rp->r_flags & R4RECOVERR) {
20107c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
20117c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
20127c478bd9Sstevel@tonic-gate 		    "recov_stale: already marked dead, rp %s",
20137c478bd9Sstevel@tonic-gate 		    rnode4info(rp)));
20147c478bd9Sstevel@tonic-gate 		return;
20157c478bd9Sstevel@tonic-gate 	}
20167c478bd9Sstevel@tonic-gate 
20177c478bd9Sstevel@tonic-gate 	if (rp->r_flags & R4STALE) {
20187c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
20197c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
20207c478bd9Sstevel@tonic-gate 		    "recov_stale: already marked stale, rp %s",
20217c478bd9Sstevel@tonic-gate 		    rnode4info(rp)));
20227c478bd9Sstevel@tonic-gate 		return;
20237c478bd9Sstevel@tonic-gate 	}
20247c478bd9Sstevel@tonic-gate 
20257c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
20267c478bd9Sstevel@tonic-gate 
20277c478bd9Sstevel@tonic-gate 	/* Try a GETATTR on this vnode */
20287c478bd9Sstevel@tonic-gate 	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
20297c478bd9Sstevel@tonic-gate 
20307c478bd9Sstevel@tonic-gate 	/*
20317c478bd9Sstevel@tonic-gate 	 * Handle non-STALE recoverable errors
20327c478bd9Sstevel@tonic-gate 	 */
20337c478bd9Sstevel@tonic-gate 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2034ddbc368aSRick Mesta 	if (needrecov) {
2035ddbc368aSRick Mesta 		if (e.error == 0) {
2036ddbc368aSRick Mesta 			switch (e.stat) {
2037ddbc368aSRick Mesta 			case NFS4ERR_STALE:
2038ddbc368aSRick Mesta 			case NFS4ERR_BADHANDLE:
2039ddbc368aSRick Mesta 				goto norec;	/* Unrecoverable */
2040ddbc368aSRick Mesta 			default:
2041ddbc368aSRick Mesta 				break;
2042ddbc368aSRick Mesta 			}
2043ddbc368aSRick Mesta 		}
2044ddbc368aSRick Mesta 		(void) nfs4_start_recovery(&e, mi, vp, NULL,
2045ddbc368aSRick Mesta 		    NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
20467c478bd9Sstevel@tonic-gate 		goto out;
20477c478bd9Sstevel@tonic-gate 	}
2048ddbc368aSRick Mesta norec:
20497c478bd9Sstevel@tonic-gate 	/* Are things OK for this vnode? */
20507c478bd9Sstevel@tonic-gate 	if (!e.error && e.stat == NFS4_OK) {
20517c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
20527c478bd9Sstevel@tonic-gate 		    "recov_stale: file appears fine, rp %s",
20537c478bd9Sstevel@tonic-gate 		    rnode4info(rp)));
20547c478bd9Sstevel@tonic-gate 		goto out;
20557c478bd9Sstevel@tonic-gate 	}
20567c478bd9Sstevel@tonic-gate 
20577c478bd9Sstevel@tonic-gate 	/* Did we get an unrelated non-recoverable error? */
20587c478bd9Sstevel@tonic-gate 	if (e.error || e.stat != NFS4ERR_STALE) {
20597c478bd9Sstevel@tonic-gate 		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
20607c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
20617c478bd9Sstevel@tonic-gate 		    "recov_stale: unrelated fatal error, rp %s",
20627c478bd9Sstevel@tonic-gate 		    rnode4info(rp)));
20637c478bd9Sstevel@tonic-gate 		goto out;
20647c478bd9Sstevel@tonic-gate 	}
20657c478bd9Sstevel@tonic-gate 
20667c478bd9Sstevel@tonic-gate 	/*
20677c478bd9Sstevel@tonic-gate 	 * If we don't appear to be dealing with the root node, find it.
20687c478bd9Sstevel@tonic-gate 	 */
20697c478bd9Sstevel@tonic-gate 	if ((vp->v_flag & VROOT) == 0) {
20707c478bd9Sstevel@tonic-gate 		nfs4_error_zinit(&e);
20717c478bd9Sstevel@tonic-gate 		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
20727c478bd9Sstevel@tonic-gate 		if (e.error) {
20737c478bd9Sstevel@tonic-gate 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
20747c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
20757c478bd9Sstevel@tonic-gate 			    "recov_stale: can't find root node for rp %s",
20767c478bd9Sstevel@tonic-gate 			    rnode4info(rp)));
20777c478bd9Sstevel@tonic-gate 			goto out;
20787c478bd9Sstevel@tonic-gate 		}
20797c478bd9Sstevel@tonic-gate 	}
20807c478bd9Sstevel@tonic-gate 
20817c478bd9Sstevel@tonic-gate 	/* Try a GETATTR on the root vnode */
20827c478bd9Sstevel@tonic-gate 	if (rootvp != NULL) {
20837c478bd9Sstevel@tonic-gate 		nfs4_error_zinit(&e);
20847c478bd9Sstevel@tonic-gate 		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
20857c478bd9Sstevel@tonic-gate 
2086ddbc368aSRick Mesta 		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2087ddbc368aSRick Mesta 		if (needrecov) {
2088ddbc368aSRick Mesta 			if (e.error == 0) {
2089ddbc368aSRick Mesta 				switch (e.stat) {
2090ddbc368aSRick Mesta 				case NFS4ERR_STALE:
2091ddbc368aSRick Mesta 				case NFS4ERR_BADHANDLE:
2092ddbc368aSRick Mesta 					goto unrec;	/* Unrecoverable */
2093ddbc368aSRick Mesta 				default:
2094ddbc368aSRick Mesta 					break;
2095ddbc368aSRick Mesta 				}
20967c478bd9Sstevel@tonic-gate 			}
2097ddbc368aSRick Mesta 			(void) nfs4_start_recovery(&e, mi, rootvp, NULL,
2098ddbc368aSRick Mesta 			    NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
20997c478bd9Sstevel@tonic-gate 		}
2100ddbc368aSRick Mesta unrec:
21017c478bd9Sstevel@tonic-gate 		/*
21027c478bd9Sstevel@tonic-gate 		 * Check to see if a failover attempt is warranted
21037c478bd9Sstevel@tonic-gate 		 * NB: nfs4_try_failover doesn't check for STALE
21047c478bd9Sstevel@tonic-gate 		 * because recov_stale gets a shot first.  Now that
21057c478bd9Sstevel@tonic-gate 		 * recov_stale has failed, go ahead and try failover.
21067c478bd9Sstevel@tonic-gate 		 *
21077c478bd9Sstevel@tonic-gate 		 * If the getattr on the root filehandle was successful,
21087c478bd9Sstevel@tonic-gate 		 * then mark recovery as failed for 'vp' and exit.
21097c478bd9Sstevel@tonic-gate 		 */
21107c478bd9Sstevel@tonic-gate 		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
21117c478bd9Sstevel@tonic-gate 			/*
21127c478bd9Sstevel@tonic-gate 			 * pass the original error to fail_recov, not
21137c478bd9Sstevel@tonic-gate 			 * the one from trying the root vnode.
21147c478bd9Sstevel@tonic-gate 			 */
21157c478bd9Sstevel@tonic-gate 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
21167c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
21177c478bd9Sstevel@tonic-gate 			    "recov_stale: root node OK, marking "
21187c478bd9Sstevel@tonic-gate 			    "dead rp %s", rnode4info(rp)));
21197c478bd9Sstevel@tonic-gate 			goto out;
21207c478bd9Sstevel@tonic-gate 		}
21217c478bd9Sstevel@tonic-gate 	}
21227c478bd9Sstevel@tonic-gate 
21237c478bd9Sstevel@tonic-gate 	/*
21247c478bd9Sstevel@tonic-gate 	 * Here, we know that both the original file and the
21257c478bd9Sstevel@tonic-gate 	 * root filehandle (which may be the same) are stale.
21267c478bd9Sstevel@tonic-gate 	 * We want to fail over if we can, and if we can't, we
21277c478bd9Sstevel@tonic-gate 	 * want to mark everything in sight bad.
21287c478bd9Sstevel@tonic-gate 	 */
21297c478bd9Sstevel@tonic-gate 	if (FAILOVER_MOUNT4(mi)) {
21307c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
21317c478bd9Sstevel@tonic-gate 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
21327c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
21337c478bd9Sstevel@tonic-gate 		    "recov_stale: failing over due to rp %s",
21347c478bd9Sstevel@tonic-gate 		    rnode4info(rp)));
21357c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
21367c478bd9Sstevel@tonic-gate 	} else {
21377c478bd9Sstevel@tonic-gate 		rnode4_t *rootrp;
21387c478bd9Sstevel@tonic-gate 		servinfo4_t *svp;
21397c478bd9Sstevel@tonic-gate 
21407c478bd9Sstevel@tonic-gate 		/*
21417c478bd9Sstevel@tonic-gate 		 * Can't fail over, so mark things dead.
21427c478bd9Sstevel@tonic-gate 		 *
21437c478bd9Sstevel@tonic-gate 		 * If rootvp is set, we know we have a distinct
21447c478bd9Sstevel@tonic-gate 		 * non-root vnode which can be marked dead in
21457c478bd9Sstevel@tonic-gate 		 * the usual way.
21467c478bd9Sstevel@tonic-gate 		 *
21477c478bd9Sstevel@tonic-gate 		 * Then we want to mark the root vnode dead.
21487c478bd9Sstevel@tonic-gate 		 * Note that if rootvp wasn't set, our vp is
21497c478bd9Sstevel@tonic-gate 		 * actually the root vnode.
21507c478bd9Sstevel@tonic-gate 		 */
21517c478bd9Sstevel@tonic-gate 		if (rootvp != NULL) {
21527c478bd9Sstevel@tonic-gate 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
21537c478bd9Sstevel@tonic-gate 			    "recov_stale: can't fail over, marking dead rp %s",
21547c478bd9Sstevel@tonic-gate 			    rnode4info(rp)));
21557c478bd9Sstevel@tonic-gate 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
21567c478bd9Sstevel@tonic-gate 		} else {
21577c478bd9Sstevel@tonic-gate 			rootvp = vp;
21587c478bd9Sstevel@tonic-gate 			VN_HOLD(rootvp);
21597c478bd9Sstevel@tonic-gate 		}
21607c478bd9Sstevel@tonic-gate 
21617c478bd9Sstevel@tonic-gate 		/*
21627c478bd9Sstevel@tonic-gate 		 * Mark root dead, but quietly - since
21637c478bd9Sstevel@tonic-gate 		 * the root rnode is frequently recreated,
21647c478bd9Sstevel@tonic-gate 		 * we can encounter this at every access.
21657c478bd9Sstevel@tonic-gate 		 * Also mark recovery as failed on this VFS.
21667c478bd9Sstevel@tonic-gate 		 */
21677c478bd9Sstevel@tonic-gate 		rootrp = VTOR4(rootvp);
21687c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
21697c478bd9Sstevel@tonic-gate 		    "recov_stale: marking dead root rp %s",
21707c478bd9Sstevel@tonic-gate 		    rnode4info(rootrp)));
21717c478bd9Sstevel@tonic-gate 		mutex_enter(&rootrp->r_statelock);
21727c478bd9Sstevel@tonic-gate 		rootrp->r_flags |= (R4RECOVERR | R4STALE);
21737c478bd9Sstevel@tonic-gate 		rootrp->r_error = ESTALE;
21747c478bd9Sstevel@tonic-gate 		mutex_exit(&rootrp->r_statelock);
21757c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
21767c478bd9Sstevel@tonic-gate 		mi->mi_error = ESTALE;
21777c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
21787c478bd9Sstevel@tonic-gate 
21797c478bd9Sstevel@tonic-gate 		svp = mi->mi_curr_serv;
21807c478bd9Sstevel@tonic-gate 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
21817c478bd9Sstevel@tonic-gate 		svp->sv_flags |= SV4_ROOT_STALE;
21827c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&svp->sv_lock);
21837c478bd9Sstevel@tonic-gate 	}
21847c478bd9Sstevel@tonic-gate 
21857c478bd9Sstevel@tonic-gate out:
21867c478bd9Sstevel@tonic-gate 	if (rootvp)
21877c478bd9Sstevel@tonic-gate 		VN_RELE(rootvp);
21887c478bd9Sstevel@tonic-gate }
21897c478bd9Sstevel@tonic-gate 
21907c478bd9Sstevel@tonic-gate /*
21917c478bd9Sstevel@tonic-gate  * Locks.
21927c478bd9Sstevel@tonic-gate  */
21937c478bd9Sstevel@tonic-gate 
21947c478bd9Sstevel@tonic-gate /*
21957c478bd9Sstevel@tonic-gate  * Reclaim all the active (acquired) locks for the given file.
21967c478bd9Sstevel@tonic-gate  * If a process lost a lock, the process is sent a SIGLOST.  This is not
21977c478bd9Sstevel@tonic-gate  * considered an error.
21987c478bd9Sstevel@tonic-gate  *
21997c478bd9Sstevel@tonic-gate  * Return values:
22007c478bd9Sstevel@tonic-gate  * Errors and status are returned via the nfs4_error_t parameter
22017c478bd9Sstevel@tonic-gate  * If an error indicates that recovery is needed, the caller is responsible
22027c478bd9Sstevel@tonic-gate  * for dealing with it.
22037c478bd9Sstevel@tonic-gate  */
22047c478bd9Sstevel@tonic-gate 
22057c478bd9Sstevel@tonic-gate static void
relock_file(vnode_t * vp,mntinfo4_t * mi,nfs4_error_t * ep,fattr4_change pre_change)22067c478bd9Sstevel@tonic-gate relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
22077c478bd9Sstevel@tonic-gate     fattr4_change pre_change)
22087c478bd9Sstevel@tonic-gate {
22097c478bd9Sstevel@tonic-gate 	locklist_t *locks, *llp;
22107c478bd9Sstevel@tonic-gate 	rnode4_t *rp;
22117c478bd9Sstevel@tonic-gate 
22127c478bd9Sstevel@tonic-gate 	ASSERT(ep != NULL);
22137c478bd9Sstevel@tonic-gate 	nfs4_error_zinit(ep);
22147c478bd9Sstevel@tonic-gate 
22157c478bd9Sstevel@tonic-gate 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
22167c478bd9Sstevel@tonic-gate 		return;
22177c478bd9Sstevel@tonic-gate 
22187c478bd9Sstevel@tonic-gate 	nfs4_flush_lock_owners(VTOR4(vp));
22197c478bd9Sstevel@tonic-gate 
22207c478bd9Sstevel@tonic-gate 	/*
22217c478bd9Sstevel@tonic-gate 	 * If we get an error that requires recovery actions, just bail out
22227c478bd9Sstevel@tonic-gate 	 * and let the top-level recovery code handle it.
22237c478bd9Sstevel@tonic-gate 	 *
22247c478bd9Sstevel@tonic-gate 	 * If we get some other error, kill the process that owned the lock
22257c478bd9Sstevel@tonic-gate 	 * and mark its remaining locks (if any) as belonging to NOPID, so
22267c478bd9Sstevel@tonic-gate 	 * that we don't make any more reclaim requests for that process.
22277c478bd9Sstevel@tonic-gate 	 */
22287c478bd9Sstevel@tonic-gate 
22297c478bd9Sstevel@tonic-gate 	rp = VTOR4(vp);
22307c478bd9Sstevel@tonic-gate 	locks = flk_active_locks_for_vp(vp);
22317c478bd9Sstevel@tonic-gate 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
22327c478bd9Sstevel@tonic-gate 		int did_reclaim = 1;
22337c478bd9Sstevel@tonic-gate 
22347c478bd9Sstevel@tonic-gate 		ASSERT(llp->ll_vp == vp);
22357c478bd9Sstevel@tonic-gate 		if (llp->ll_flock.l_pid == NOPID)
22367c478bd9Sstevel@tonic-gate 			continue;
22377c478bd9Sstevel@tonic-gate 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
22387c478bd9Sstevel@tonic-gate 		/*
22397c478bd9Sstevel@tonic-gate 		 * If we need to restart recovery, stop processing the
22407c478bd9Sstevel@tonic-gate 		 * list.  Some errors would be recoverable under other
22417c478bd9Sstevel@tonic-gate 		 * circumstances, but if they happen here we just give up
22427c478bd9Sstevel@tonic-gate 		 * on the lock.
22437c478bd9Sstevel@tonic-gate 		 */
22447c478bd9Sstevel@tonic-gate 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
22457c478bd9Sstevel@tonic-gate 			if (ep->error != 0)
22467c478bd9Sstevel@tonic-gate 				break;
22477c478bd9Sstevel@tonic-gate 			if (!nfs4_recov_marks_dead(ep->stat))
22487c478bd9Sstevel@tonic-gate 				break;
22497c478bd9Sstevel@tonic-gate 		}
22507c478bd9Sstevel@tonic-gate 		/*
22517c478bd9Sstevel@tonic-gate 		 *   In case the server isn't offering us a grace period, or
22527c478bd9Sstevel@tonic-gate 		 * if we missed it, we might have opened & locked from scratch,
22537c478bd9Sstevel@tonic-gate 		 * rather than reopened/reclaimed.
22547c478bd9Sstevel@tonic-gate 		 *   We need to ensure that the object hadn't been otherwise
22557c478bd9Sstevel@tonic-gate 		 * changed during this time, by comparing the changeinfo.
22567c478bd9Sstevel@tonic-gate 		 *   We get passed the changeinfo from before the reopen by our
22577c478bd9Sstevel@tonic-gate 		 * caller, in pre_change.
22587c478bd9Sstevel@tonic-gate 		 *   The changeinfo from after the reopen is in rp->r_change,
22597c478bd9Sstevel@tonic-gate 		 * courtesy of the GETATTR in the reopen.
22607c478bd9Sstevel@tonic-gate 		 *   If they're different, then the file has changed, and we
22617c478bd9Sstevel@tonic-gate 		 * have to SIGLOST the app.
22627c478bd9Sstevel@tonic-gate 		 */
22637c478bd9Sstevel@tonic-gate 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
22647c478bd9Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
22657c478bd9Sstevel@tonic-gate 			if (pre_change != rp->r_change)
22667c478bd9Sstevel@tonic-gate 				ep->stat = NFS4ERR_NO_GRACE;
22677c478bd9Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
22687c478bd9Sstevel@tonic-gate 		}
22697c478bd9Sstevel@tonic-gate 		if (ep->error != 0 || ep->stat != NFS4_OK) {
22707c478bd9Sstevel@tonic-gate 			if (ep->error != 0)
22717c478bd9Sstevel@tonic-gate 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
22727c478bd9Sstevel@tonic-gate 				    NULL, ep->error, vp, NULL, 0, NULL,
22737c478bd9Sstevel@tonic-gate 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
22747c478bd9Sstevel@tonic-gate 				    0, 0);
22757c478bd9Sstevel@tonic-gate 			else
22767c478bd9Sstevel@tonic-gate 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
22777c478bd9Sstevel@tonic-gate 				    NULL, 0, vp, NULL, ep->stat, NULL,
22787c478bd9Sstevel@tonic-gate 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
22797c478bd9Sstevel@tonic-gate 				    0, 0);
22807c478bd9Sstevel@tonic-gate 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
22817c478bd9Sstevel@tonic-gate 			    ep->error, ep->stat);
2282*faf39f17SMarcel Telka 			relock_skip_pid(vp, llp, llp->ll_flock.l_pid);
22837c478bd9Sstevel@tonic-gate 
22847c478bd9Sstevel@tonic-gate 			/* Reinitialize the nfs4_error and continue */
22857c478bd9Sstevel@tonic-gate 			nfs4_error_zinit(ep);
22867c478bd9Sstevel@tonic-gate 		}
22877c478bd9Sstevel@tonic-gate 	}
22887c478bd9Sstevel@tonic-gate 
22897c478bd9Sstevel@tonic-gate 	if (locks != NULL)
22907c478bd9Sstevel@tonic-gate 		flk_free_locklist(locks);
22917c478bd9Sstevel@tonic-gate }
22927c478bd9Sstevel@tonic-gate 
22937c478bd9Sstevel@tonic-gate /*
22947c478bd9Sstevel@tonic-gate  * Reclaim the given lock.
22957c478bd9Sstevel@tonic-gate  *
22967c478bd9Sstevel@tonic-gate  * Errors are returned via the nfs4_error_t parameter.
22977c478bd9Sstevel@tonic-gate  */
22987c478bd9Sstevel@tonic-gate static void
reclaim_one_lock(vnode_t * vp,flock64_t * flk,nfs4_error_t * ep,int * did_reclaimp)22997c478bd9Sstevel@tonic-gate reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2300b9238976Sth     int *did_reclaimp)
23017c478bd9Sstevel@tonic-gate {
23027c478bd9Sstevel@tonic-gate 	cred_t *cr;
23037c478bd9Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
23047c478bd9Sstevel@tonic-gate 
23057c478bd9Sstevel@tonic-gate 	cr = pid_to_cr(flk->l_pid);
23067c478bd9Sstevel@tonic-gate 	if (cr == NULL) {
2307*faf39f17SMarcel Telka 		nfs4_error_init(ep, ESRCH);
23087c478bd9Sstevel@tonic-gate 		return;
23097c478bd9Sstevel@tonic-gate 	}
23107c478bd9Sstevel@tonic-gate 
23117c478bd9Sstevel@tonic-gate 	do {
23127c478bd9Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
23137c478bd9Sstevel@tonic-gate 		if (rp->r_flags & R4RECOVERR) {
23147c478bd9Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
2315*faf39f17SMarcel Telka 			nfs4_error_init(ep, ESTALE);
23167c478bd9Sstevel@tonic-gate 			break;
23177c478bd9Sstevel@tonic-gate 		}
23187c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
23197c478bd9Sstevel@tonic-gate 
23207c478bd9Sstevel@tonic-gate 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2321b9238976Sth 		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
23227c478bd9Sstevel@tonic-gate 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
23237c478bd9Sstevel@tonic-gate 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2324b9238976Sth 			    vp, NULL);
23257c478bd9Sstevel@tonic-gate 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
23267c478bd9Sstevel@tonic-gate 
23277c478bd9Sstevel@tonic-gate 	crfree(cr);
23287c478bd9Sstevel@tonic-gate }
23297c478bd9Sstevel@tonic-gate 
23307c478bd9Sstevel@tonic-gate /*
23317c478bd9Sstevel@tonic-gate  * Open files.
23327c478bd9Sstevel@tonic-gate  */
23337c478bd9Sstevel@tonic-gate 
23347c478bd9Sstevel@tonic-gate /*
23357c478bd9Sstevel@tonic-gate  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
23367c478bd9Sstevel@tonic-gate  * Returns 1 if the error is valid; 0 otherwise.
23377c478bd9Sstevel@tonic-gate  */
23387c478bd9Sstevel@tonic-gate static int
nfs4_valid_recov_err_for_vp(vnode_t * vp,nfsstat4 stat)23397c478bd9Sstevel@tonic-gate nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
23407c478bd9Sstevel@tonic-gate {
23417c478bd9Sstevel@tonic-gate 	/*
23427c478bd9Sstevel@tonic-gate 	 * We should not be marking non-regular files as dead,
23437c478bd9Sstevel@tonic-gate 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
23447c478bd9Sstevel@tonic-gate 	 */
23457c478bd9Sstevel@tonic-gate 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
23467c478bd9Sstevel@tonic-gate 	    stat != NFS4ERR_BADNAME)
23477c478bd9Sstevel@tonic-gate 		return (0);
23487c478bd9Sstevel@tonic-gate 
23497c478bd9Sstevel@tonic-gate 	return (1);
23507c478bd9Sstevel@tonic-gate }
23517c478bd9Sstevel@tonic-gate 
23527c478bd9Sstevel@tonic-gate /*
23537c478bd9Sstevel@tonic-gate  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
23547c478bd9Sstevel@tonic-gate  * then mark the object dead.  Since we've had to do a lookup for
23557c478bd9Sstevel@tonic-gate  * filehandle recovery, we will mark the object dead if we got NOENT.
23567c478bd9Sstevel@tonic-gate  */
23577c478bd9Sstevel@tonic-gate static void
nfs4_recov_fh_fail(vnode_t * vp,int error,nfsstat4 stat)23587c478bd9Sstevel@tonic-gate nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
23597c478bd9Sstevel@tonic-gate {
23607c478bd9Sstevel@tonic-gate 	ASSERT(vp != NULL);
23617c478bd9Sstevel@tonic-gate 
23627c478bd9Sstevel@tonic-gate 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
23637c478bd9Sstevel@tonic-gate 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
23647c478bd9Sstevel@tonic-gate 		return;
23657c478bd9Sstevel@tonic-gate 
23667c478bd9Sstevel@tonic-gate 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
23677c478bd9Sstevel@tonic-gate }
23687c478bd9Sstevel@tonic-gate 
23697c478bd9Sstevel@tonic-gate /*
23707c478bd9Sstevel@tonic-gate  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
23717c478bd9Sstevel@tonic-gate  * to mark only the data structure(s) that provided the bad value as being
23727c478bd9Sstevel@tonic-gate  * bad.  But for now we'll just mark the entire file.
23737c478bd9Sstevel@tonic-gate  */
23747c478bd9Sstevel@tonic-gate 
23757c478bd9Sstevel@tonic-gate static void
recov_badstate(recov_info_t * recovp,vnode_t * vp,nfsstat4 stat)23767c478bd9Sstevel@tonic-gate recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
23777c478bd9Sstevel@tonic-gate {
23787c478bd9Sstevel@tonic-gate 	ASSERT(vp != NULL);
23797c478bd9Sstevel@tonic-gate 	recov_throttle(recovp, vp);
23807c478bd9Sstevel@tonic-gate 
23817c478bd9Sstevel@tonic-gate 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
23827c478bd9Sstevel@tonic-gate 		return;
23837c478bd9Sstevel@tonic-gate 
23847c478bd9Sstevel@tonic-gate 	nfs4_fail_recov(vp, "", 0, stat);
23857c478bd9Sstevel@tonic-gate }
23867c478bd9Sstevel@tonic-gate 
23877c478bd9Sstevel@tonic-gate /*
23887c478bd9Sstevel@tonic-gate  * Free up the information saved for a lost state request.
23897c478bd9Sstevel@tonic-gate  */
23907c478bd9Sstevel@tonic-gate static void
nfs4_free_lost_rqst(nfs4_lost_rqst_t * lrp,nfs4_server_t * sp)23917c478bd9Sstevel@tonic-gate nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
23927c478bd9Sstevel@tonic-gate {
23937c478bd9Sstevel@tonic-gate 	component4 *filep;
23947c478bd9Sstevel@tonic-gate 	nfs4_open_stream_t *osp;
23957c478bd9Sstevel@tonic-gate 	int have_sync_lock;
23967c478bd9Sstevel@tonic-gate 
23977c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug,
2398b9238976Sth 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
23997c478bd9Sstevel@tonic-gate 
24007c478bd9Sstevel@tonic-gate 	switch (lrp->lr_op) {
24017c478bd9Sstevel@tonic-gate 	case OP_OPEN:
24027c478bd9Sstevel@tonic-gate 		filep = &lrp->lr_ofile;
24037c478bd9Sstevel@tonic-gate 		if (filep->utf8string_val) {
24047c478bd9Sstevel@tonic-gate 			kmem_free(filep->utf8string_val, filep->utf8string_len);
24057c478bd9Sstevel@tonic-gate 			filep->utf8string_val = NULL;
24067c478bd9Sstevel@tonic-gate 		}
24077c478bd9Sstevel@tonic-gate 		break;
24087c478bd9Sstevel@tonic-gate 	case OP_DELEGRETURN:
24097c478bd9Sstevel@tonic-gate 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
24107c478bd9Sstevel@tonic-gate 		break;
24117c478bd9Sstevel@tonic-gate 	case OP_CLOSE:
24127c478bd9Sstevel@tonic-gate 		osp = lrp->lr_osp;
24137c478bd9Sstevel@tonic-gate 		ASSERT(osp != NULL);
24147c478bd9Sstevel@tonic-gate 		mutex_enter(&osp->os_sync_lock);
24157c478bd9Sstevel@tonic-gate 		have_sync_lock = 1;
24167c478bd9Sstevel@tonic-gate 		if (osp->os_pending_close) {
24177c478bd9Sstevel@tonic-gate 			/* clean up the open file state. */
24187c478bd9Sstevel@tonic-gate 			osp->os_pending_close = 0;
24197c478bd9Sstevel@tonic-gate 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
24207c478bd9Sstevel@tonic-gate 		}
24217c478bd9Sstevel@tonic-gate 		if (have_sync_lock)
24227c478bd9Sstevel@tonic-gate 			mutex_exit(&osp->os_sync_lock);
24237c478bd9Sstevel@tonic-gate 		break;
24247c478bd9Sstevel@tonic-gate 	}
24257c478bd9Sstevel@tonic-gate 
24267c478bd9Sstevel@tonic-gate 	lrp->lr_op = 0;
24277c478bd9Sstevel@tonic-gate 	if (lrp->lr_oop != NULL) {
24287c478bd9Sstevel@tonic-gate 		open_owner_rele(lrp->lr_oop);
24297c478bd9Sstevel@tonic-gate 		lrp->lr_oop = NULL;
24307c478bd9Sstevel@tonic-gate 	}
24317c478bd9Sstevel@tonic-gate 	if (lrp->lr_osp != NULL) {
24327c478bd9Sstevel@tonic-gate 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
24337c478bd9Sstevel@tonic-gate 		lrp->lr_osp = NULL;
24347c478bd9Sstevel@tonic-gate 	}
24357c478bd9Sstevel@tonic-gate 	if (lrp->lr_lop != NULL) {
24367c478bd9Sstevel@tonic-gate 		lock_owner_rele(lrp->lr_lop);
24377c478bd9Sstevel@tonic-gate 		lrp->lr_lop = NULL;
24387c478bd9Sstevel@tonic-gate 	}
24397c478bd9Sstevel@tonic-gate 	if (lrp->lr_flk != NULL) {
24407c478bd9Sstevel@tonic-gate 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
24417c478bd9Sstevel@tonic-gate 		lrp->lr_flk = NULL;
24427c478bd9Sstevel@tonic-gate 	}
24437c478bd9Sstevel@tonic-gate 	if (lrp->lr_vp != NULL) {
24447c478bd9Sstevel@tonic-gate 		VN_RELE(lrp->lr_vp);
24457c478bd9Sstevel@tonic-gate 		lrp->lr_vp = NULL;
24467c478bd9Sstevel@tonic-gate 	}
24477c478bd9Sstevel@tonic-gate 	if (lrp->lr_dvp != NULL) {
24487c478bd9Sstevel@tonic-gate 		VN_RELE(lrp->lr_dvp);
24497c478bd9Sstevel@tonic-gate 		lrp->lr_dvp = NULL;
24507c478bd9Sstevel@tonic-gate 	}
24517c478bd9Sstevel@tonic-gate 	if (lrp->lr_cr != NULL) {
24527c478bd9Sstevel@tonic-gate 		crfree(lrp->lr_cr);
24537c478bd9Sstevel@tonic-gate 		lrp->lr_cr = NULL;
24547c478bd9Sstevel@tonic-gate 	}
24557c478bd9Sstevel@tonic-gate 
24567c478bd9Sstevel@tonic-gate 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
24577c478bd9Sstevel@tonic-gate }
24587c478bd9Sstevel@tonic-gate 
24597c478bd9Sstevel@tonic-gate /*
24607c478bd9Sstevel@tonic-gate  * Remove any lost state requests and free them.
24617c478bd9Sstevel@tonic-gate  */
24627c478bd9Sstevel@tonic-gate static void
nfs4_remove_lost_rqsts(mntinfo4_t * mi,nfs4_server_t * sp)24637c478bd9Sstevel@tonic-gate nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
24647c478bd9Sstevel@tonic-gate {
24657c478bd9Sstevel@tonic-gate 	nfs4_lost_rqst_t *lrp;
24667c478bd9Sstevel@tonic-gate 
24677c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
24687c478bd9Sstevel@tonic-gate 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
24697c478bd9Sstevel@tonic-gate 		list_remove(&mi->mi_lost_state, lrp);
24707c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
24717c478bd9Sstevel@tonic-gate 		nfs4_free_lost_rqst(lrp, sp);
24727c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
24737c478bd9Sstevel@tonic-gate 	}
24747c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
24757c478bd9Sstevel@tonic-gate }
24767c478bd9Sstevel@tonic-gate 
24777c478bd9Sstevel@tonic-gate /*
24787c478bd9Sstevel@tonic-gate  * Reopen all the files for the given filesystem and reclaim any locks.
24797c478bd9Sstevel@tonic-gate  */
24807c478bd9Sstevel@tonic-gate 
24817c478bd9Sstevel@tonic-gate static void
recov_openfiles(recov_info_t * recovp,nfs4_server_t * sp)24827c478bd9Sstevel@tonic-gate recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
24837c478bd9Sstevel@tonic-gate {
24847c478bd9Sstevel@tonic-gate 	mntinfo4_t *mi = recovp->rc_mi;
24857c478bd9Sstevel@tonic-gate 	nfs4_opinst_t *reopenlist = NULL, *rep;
24867c478bd9Sstevel@tonic-gate 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
24877c478bd9Sstevel@tonic-gate 	open_claim_type4 claim;
24887c478bd9Sstevel@tonic-gate 	int remap;
24897c478bd9Sstevel@tonic-gate 	char *fail_msg = "No such file or directory on replica";
24907c478bd9Sstevel@tonic-gate 	rnode4_t *rp;
24917c478bd9Sstevel@tonic-gate 	fattr4_change pre_change;
24927c478bd9Sstevel@tonic-gate 
24937c478bd9Sstevel@tonic-gate 	ASSERT(sp != NULL);
24947c478bd9Sstevel@tonic-gate 
24957c478bd9Sstevel@tonic-gate 	/*
24967c478bd9Sstevel@tonic-gate 	 * This check is to allow a 10ms pause before we reopen files
24977c478bd9Sstevel@tonic-gate 	 * it should allow the server time to have received the CB_NULL
24987c478bd9Sstevel@tonic-gate 	 * reply and update its internal structures such that (if
24997c478bd9Sstevel@tonic-gate 	 * applicable) we are granted a delegation on reopened files.
25007c478bd9Sstevel@tonic-gate 	 */
25017c478bd9Sstevel@tonic-gate 	mutex_enter(&sp->s_lock);
25027c478bd9Sstevel@tonic-gate 	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
25037c478bd9Sstevel@tonic-gate 		sp->s_flags |= N4S_CB_WAITER;
2504d3d50737SRafael Vanoni 		(void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
2505d3d50737SRafael Vanoni 		    drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
25067c478bd9Sstevel@tonic-gate 	}
25077c478bd9Sstevel@tonic-gate 	mutex_exit(&sp->s_lock);
25087c478bd9Sstevel@tonic-gate 
25097c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
25107c478bd9Sstevel@tonic-gate 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
25117c478bd9Sstevel@tonic-gate 
25127c478bd9Sstevel@tonic-gate 	if (NFS4_VOLATILE_FH(mi)) {
25137c478bd9Sstevel@tonic-gate 		nfs4_remap_root(mi, &e, 0);
25147c478bd9Sstevel@tonic-gate 		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
25157c478bd9Sstevel@tonic-gate 			(void) nfs4_start_recovery(&e, mi, NULL,
25162f172c55SRobert Thurlow 			    NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
25177c478bd9Sstevel@tonic-gate 		}
25187c478bd9Sstevel@tonic-gate 	}
25197c478bd9Sstevel@tonic-gate 
25207c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
25217c478bd9Sstevel@tonic-gate 	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
25227c478bd9Sstevel@tonic-gate 		claim = CLAIM_PREVIOUS;
25237c478bd9Sstevel@tonic-gate 	else
25247c478bd9Sstevel@tonic-gate 		claim = CLAIM_NULL;
25257c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
25267c478bd9Sstevel@tonic-gate 
25277c478bd9Sstevel@tonic-gate 	if (e.error == 0 && e.stat == NFS4_OK) {
25287c478bd9Sstevel@tonic-gate 		/*
25297c478bd9Sstevel@tonic-gate 		 * Get a snapshot of open files in the filesystem.  Note
25307c478bd9Sstevel@tonic-gate 		 * that new opens will stall until the server's grace
25317c478bd9Sstevel@tonic-gate 		 * period is done.
25327c478bd9Sstevel@tonic-gate 		 */
25337c478bd9Sstevel@tonic-gate 		reopenlist = r4mkopenlist(mi);
25347c478bd9Sstevel@tonic-gate 
25357c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
25367c478bd9Sstevel@tonic-gate 		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
25377c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
25387c478bd9Sstevel@tonic-gate 		/*
25397c478bd9Sstevel@tonic-gate 		 * Since we are re-establishing state on the
25407c478bd9Sstevel@tonic-gate 		 * server, its ok to blow away the saved lost
25417c478bd9Sstevel@tonic-gate 		 * requests since we don't need to reissue it.
25427c478bd9Sstevel@tonic-gate 		 */
25437c478bd9Sstevel@tonic-gate 		nfs4_remove_lost_rqsts(mi, sp);
25447c478bd9Sstevel@tonic-gate 
25457c478bd9Sstevel@tonic-gate 		for (rep = reopenlist; rep; rep = rep->re_next) {
25467c478bd9Sstevel@tonic-gate 
25477c478bd9Sstevel@tonic-gate 			if (remap) {
25487c478bd9Sstevel@tonic-gate 				nfs4_remap_file(mi, rep->re_vp,
2549b9238976Sth 				    NFS4_REMAP_CKATTRS, &e);
25507c478bd9Sstevel@tonic-gate 			}
2551ddbc368aSRick Mesta 			DTRACE_PROBE2(recov__openfiles, nfs4_error_t, &e,
2552ddbc368aSRick Mesta 			    vnode_t, rep->re_vp);
25537c478bd9Sstevel@tonic-gate 			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
25547c478bd9Sstevel@tonic-gate 				/*
25557c478bd9Sstevel@tonic-gate 				 * The current server does not have the file
25567c478bd9Sstevel@tonic-gate 				 * that is to be remapped.  This is most
25577c478bd9Sstevel@tonic-gate 				 * likely due to an improperly maintained
25587c478bd9Sstevel@tonic-gate 				 * replica.   The files that are missing from
25597c478bd9Sstevel@tonic-gate 				 * the server will be marked dead and logged
25607c478bd9Sstevel@tonic-gate 				 * in order to make sys admins aware of the
25617c478bd9Sstevel@tonic-gate 				 * problem.
25627c478bd9Sstevel@tonic-gate 				 */
25637c478bd9Sstevel@tonic-gate 				nfs4_fail_recov(rep->re_vp,
2564b9238976Sth 				    fail_msg, e.error, e.stat);
25657c478bd9Sstevel@tonic-gate 				/*
25667c478bd9Sstevel@tonic-gate 				 * We've already handled the error so clear it.
25677c478bd9Sstevel@tonic-gate 				 */
25687c478bd9Sstevel@tonic-gate 				nfs4_error_zinit(&e);
25697c478bd9Sstevel@tonic-gate 				continue;
25707c478bd9Sstevel@tonic-gate 			} else if (e.error == 0 && e.stat == NFS4_OK) {
25717c478bd9Sstevel@tonic-gate 				int j;
25727c478bd9Sstevel@tonic-gate 
25737c478bd9Sstevel@tonic-gate 				rp = VTOR4(rep->re_vp);
25747c478bd9Sstevel@tonic-gate 				mutex_enter(&rp->r_statelock);
25757c478bd9Sstevel@tonic-gate 				pre_change = rp->r_change;
25767c478bd9Sstevel@tonic-gate 				mutex_exit(&rp->r_statelock);
25777c478bd9Sstevel@tonic-gate 
25787c478bd9Sstevel@tonic-gate 				for (j = 0; j < rep->re_numosp; j++) {
25797c478bd9Sstevel@tonic-gate 					nfs4_reopen(rep->re_vp, rep->re_osp[j],
2580b9238976Sth 					    &e, claim, FALSE, TRUE);
25817c478bd9Sstevel@tonic-gate 					if (e.error != 0 || e.stat != NFS4_OK)
25827c478bd9Sstevel@tonic-gate 						break;
25837c478bd9Sstevel@tonic-gate 				}
25847c478bd9Sstevel@tonic-gate 				if (nfs4_needs_recovery(&e, TRUE,
25857c478bd9Sstevel@tonic-gate 				    mi->mi_vfsp)) {
25867c478bd9Sstevel@tonic-gate 					(void) nfs4_start_recovery(&e, mi,
2587b9238976Sth 					    rep->re_vp, NULL, NULL, NULL,
25882f172c55SRobert Thurlow 					    OP_OPEN, NULL, NULL, NULL);
25897c478bd9Sstevel@tonic-gate 					break;
25907c478bd9Sstevel@tonic-gate 				}
25917c478bd9Sstevel@tonic-gate 			}
25927c478bd9Sstevel@tonic-gate #ifdef DEBUG
25937c478bd9Sstevel@tonic-gate 			if (nfs4_recovdelay > 0)
25947c478bd9Sstevel@tonic-gate 				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
25957c478bd9Sstevel@tonic-gate #endif
2596ddbc368aSRick Mesta 			if (e.error == 0 && e.stat == NFS4_OK) {
25977c478bd9Sstevel@tonic-gate 				relock_file(rep->re_vp, mi, &e, pre_change);
25987c478bd9Sstevel@tonic-gate 
2599ddbc368aSRick Mesta 				if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2600ddbc368aSRick Mesta 					(void) nfs4_start_recovery(&e, mi,
2601ddbc368aSRick Mesta 					    rep->re_vp, NULL, NULL, NULL,
2602ddbc368aSRick Mesta 					    OP_LOCK, NULL, NULL, NULL);
2603ddbc368aSRick Mesta 			}
2604ddbc368aSRick Mesta 
26057c478bd9Sstevel@tonic-gate 			if (e.error != 0 || e.stat != NFS4_OK)
26067c478bd9Sstevel@tonic-gate 				break;
26077c478bd9Sstevel@tonic-gate 		}
26087c478bd9Sstevel@tonic-gate 
26097c478bd9Sstevel@tonic-gate 		/*
26107c478bd9Sstevel@tonic-gate 		 * Check to see if we need to remap files passed in
26117c478bd9Sstevel@tonic-gate 		 * via the recovery arguments; this will have been
26127c478bd9Sstevel@tonic-gate 		 * done for open files.  A failure here is not fatal.
26137c478bd9Sstevel@tonic-gate 		 */
26147c478bd9Sstevel@tonic-gate 		if (remap) {
26157c478bd9Sstevel@tonic-gate 			nfs4_error_t ignore;
26167c478bd9Sstevel@tonic-gate 			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2617b9238976Sth 			    &ignore);
26187c478bd9Sstevel@tonic-gate 			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2619b9238976Sth 			    &ignore);
26207c478bd9Sstevel@tonic-gate 		}
26217c478bd9Sstevel@tonic-gate 	}
26227c478bd9Sstevel@tonic-gate 
26237c478bd9Sstevel@tonic-gate 	if (e.error == 0 && e.stat == NFS4_OK) {
26247c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
26257c478bd9Sstevel@tonic-gate 		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
26267c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
26277c478bd9Sstevel@tonic-gate 	}
26287c478bd9Sstevel@tonic-gate 
26297c478bd9Sstevel@tonic-gate 	nfs_rw_exit(&mi->mi_recovlock);
26307c478bd9Sstevel@tonic-gate 	nfs_rw_exit(&sp->s_recovlock);
26317c478bd9Sstevel@tonic-gate 
26327c478bd9Sstevel@tonic-gate 	if (reopenlist != NULL)
26337c478bd9Sstevel@tonic-gate 		r4releopenlist(reopenlist);
26347c478bd9Sstevel@tonic-gate }
26357c478bd9Sstevel@tonic-gate 
26367c478bd9Sstevel@tonic-gate /*
26377c478bd9Sstevel@tonic-gate  * Resend the queued state recovery requests in "rqsts".
26387c478bd9Sstevel@tonic-gate  */
26397c478bd9Sstevel@tonic-gate 
26407c478bd9Sstevel@tonic-gate static void
nfs4_resend_lost_rqsts(recov_info_t * recovp,nfs4_server_t * sp)26417c478bd9Sstevel@tonic-gate nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
26427c478bd9Sstevel@tonic-gate {
26437c478bd9Sstevel@tonic-gate 	nfs4_lost_rqst_t	*lrp, *tlrp;
26447c478bd9Sstevel@tonic-gate 	mntinfo4_t		*mi = recovp->rc_mi;
2645ba8fdb6fSek 	nfs4_error_t		n4e;
26467c478bd9Sstevel@tonic-gate #ifdef NOTYET
26477c478bd9Sstevel@tonic-gate 	uint32_t		deny_bits = 0;
26487c478bd9Sstevel@tonic-gate #endif
26497c478bd9Sstevel@tonic-gate 
26507c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
26517c478bd9Sstevel@tonic-gate 
26527c478bd9Sstevel@tonic-gate 	ASSERT(mi != NULL);
26537c478bd9Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
26547c478bd9Sstevel@tonic-gate 
26557c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
26567c478bd9Sstevel@tonic-gate 	lrp = list_head(&mi->mi_lost_state);
26577c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
26587c478bd9Sstevel@tonic-gate 	while (lrp != NULL) {
2659ba8fdb6fSek 		nfs4_error_zinit(&n4e);
2660ba8fdb6fSek 		resend_one_op(lrp, &n4e, mi, sp);
26617c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
26627c478bd9Sstevel@tonic-gate 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2663ba8fdb6fSek 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2664ba8fdb6fSek 		    n4e.stat));
26657c478bd9Sstevel@tonic-gate 
26667c478bd9Sstevel@tonic-gate 		/*
26677c478bd9Sstevel@tonic-gate 		 * If we get a recovery error that we can actually
26687c478bd9Sstevel@tonic-gate 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
26697c478bd9Sstevel@tonic-gate 		 * return and let the recovery thread redrive the call.
26707c478bd9Sstevel@tonic-gate 		 * Don't requeue unless the zone is still healthy.
26717c478bd9Sstevel@tonic-gate 		 */
26727c478bd9Sstevel@tonic-gate 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2673ba8fdb6fSek 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2674ba8fdb6fSek 		    (nfs4_try_failover(&n4e) ||
2675ba8fdb6fSek 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2676ba8fdb6fSek 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2677ba8fdb6fSek 		    !nfs4_recov_marks_dead(n4e.stat)))) {
26787c478bd9Sstevel@tonic-gate 			/*
26797c478bd9Sstevel@tonic-gate 			 * For these three errors, we want to delay a bit
26807c478bd9Sstevel@tonic-gate 			 * instead of pounding the server into submission.
26817c478bd9Sstevel@tonic-gate 			 * We have to do this manually; the normal
26827c478bd9Sstevel@tonic-gate 			 * processing for these errors only works for
26837c478bd9Sstevel@tonic-gate 			 * non-recovery requests.
26847c478bd9Sstevel@tonic-gate 			 */
2685ba8fdb6fSek 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2686ba8fdb6fSek 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2687ba8fdb6fSek 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2688ba8fdb6fSek 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
26897c478bd9Sstevel@tonic-gate 				delay(SEC_TO_TICK(nfs4err_delay_time));
26907c478bd9Sstevel@tonic-gate 			} else {
2691ba8fdb6fSek 				(void) nfs4_start_recovery(&n4e,
2692b9238976Sth 				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
26932f172c55SRobert Thurlow 				    lrp->lr_op, NULL, NULL, NULL);
26947c478bd9Sstevel@tonic-gate 			}
26957c478bd9Sstevel@tonic-gate 			return;
26967c478bd9Sstevel@tonic-gate 		}
26977c478bd9Sstevel@tonic-gate 
26987c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
26997c478bd9Sstevel@tonic-gate 		list_remove(&mi->mi_lost_state, lrp);
27007c478bd9Sstevel@tonic-gate 		tlrp = lrp;
27017c478bd9Sstevel@tonic-gate 		lrp = list_head(&mi->mi_lost_state);
27027c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
27037c478bd9Sstevel@tonic-gate 		nfs4_free_lost_rqst(tlrp, sp);
27047c478bd9Sstevel@tonic-gate 	}
27057c478bd9Sstevel@tonic-gate }
27067c478bd9Sstevel@tonic-gate 
27077c478bd9Sstevel@tonic-gate /*
27087c478bd9Sstevel@tonic-gate  * Resend the given op, and issue any necessary undo call.
27097c478bd9Sstevel@tonic-gate  * errors are returned via the nfs4_error_t parameter.
27107c478bd9Sstevel@tonic-gate  */
27117c478bd9Sstevel@tonic-gate 
27127c478bd9Sstevel@tonic-gate static void
resend_one_op(nfs4_lost_rqst_t * lrp,nfs4_error_t * ep,mntinfo4_t * mi,nfs4_server_t * sp)27137c478bd9Sstevel@tonic-gate resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2714b9238976Sth     mntinfo4_t *mi, nfs4_server_t *sp)
27157c478bd9Sstevel@tonic-gate {
27167c478bd9Sstevel@tonic-gate 	vnode_t *vp;
27177c478bd9Sstevel@tonic-gate 	nfs4_open_stream_t *osp;
27187c478bd9Sstevel@tonic-gate 	cred_t *cr;
27197c478bd9Sstevel@tonic-gate 	uint32_t acc_bits;
27207c478bd9Sstevel@tonic-gate 
27217c478bd9Sstevel@tonic-gate 	vp = lrp->lr_vp;
27227c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
27237c478bd9Sstevel@tonic-gate 	    "have a lost open/close request for vp %p", (void *)vp));
27247c478bd9Sstevel@tonic-gate 
27257c478bd9Sstevel@tonic-gate 	switch (lrp->lr_op) {
27267c478bd9Sstevel@tonic-gate 	case OP_OPEN:
27277c478bd9Sstevel@tonic-gate 		nfs4_resend_open_otw(&vp, lrp, ep);
27287c478bd9Sstevel@tonic-gate 		break;
27297c478bd9Sstevel@tonic-gate 	case OP_OPEN_DOWNGRADE:
27307c478bd9Sstevel@tonic-gate 		ASSERT(lrp->lr_oop != NULL);
27317c478bd9Sstevel@tonic-gate 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
27327c478bd9Sstevel@tonic-gate 		ASSERT(!ep->error);	/* recov thread always succeeds */
27337c478bd9Sstevel@tonic-gate 		ASSERT(lrp->lr_osp != NULL);
27347c478bd9Sstevel@tonic-gate 		mutex_enter(&lrp->lr_osp->os_sync_lock);
27357c478bd9Sstevel@tonic-gate 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2736b9238976Sth 		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2737b9238976Sth 		    ep, NULL, NULL);
27387c478bd9Sstevel@tonic-gate 		mutex_exit(&lrp->lr_osp->os_sync_lock);
27397c478bd9Sstevel@tonic-gate 		nfs4_end_open_seqid_sync(lrp->lr_oop);
27407c478bd9Sstevel@tonic-gate 		break;
27417c478bd9Sstevel@tonic-gate 	case OP_CLOSE:
27427c478bd9Sstevel@tonic-gate 		osp = lrp->lr_osp;
27437c478bd9Sstevel@tonic-gate 		cr = lrp->lr_cr;
27447c478bd9Sstevel@tonic-gate 		acc_bits = 0;
27457c478bd9Sstevel@tonic-gate 		mutex_enter(&osp->os_sync_lock);
27467c478bd9Sstevel@tonic-gate 		if (osp->os_share_acc_read)
27477c478bd9Sstevel@tonic-gate 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
27487c478bd9Sstevel@tonic-gate 		if (osp->os_share_acc_write)
27497c478bd9Sstevel@tonic-gate 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
27507c478bd9Sstevel@tonic-gate 		mutex_exit(&osp->os_sync_lock);
27517c478bd9Sstevel@tonic-gate 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2752b9238976Sth 		    CLOSE_RESEND, 0, 0, 0);
27537c478bd9Sstevel@tonic-gate 		break;
27547c478bd9Sstevel@tonic-gate 	case OP_LOCK:
27557c478bd9Sstevel@tonic-gate 	case OP_LOCKU:
27567c478bd9Sstevel@tonic-gate 		resend_lock(lrp, ep);
27577c478bd9Sstevel@tonic-gate 		goto done;
27587c478bd9Sstevel@tonic-gate 	case OP_DELEGRETURN:
27597c478bd9Sstevel@tonic-gate 		nfs4_resend_delegreturn(lrp, ep, sp);
27607c478bd9Sstevel@tonic-gate 		goto done;
27617c478bd9Sstevel@tonic-gate 	default:
27627c478bd9Sstevel@tonic-gate #ifdef DEBUG
27637c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2764b9238976Sth 		    lrp->lr_op);
27657c478bd9Sstevel@tonic-gate #endif
27667c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
27677c478bd9Sstevel@tonic-gate 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
27687c478bd9Sstevel@tonic-gate 		    TAG_NONE, TAG_NONE, 0, 0);
27697c478bd9Sstevel@tonic-gate 		nfs4_error_init(ep, EINVAL);
27707c478bd9Sstevel@tonic-gate 		return;
27717c478bd9Sstevel@tonic-gate 	}
27727c478bd9Sstevel@tonic-gate 
27737c478bd9Sstevel@tonic-gate 	/*
27747c478bd9Sstevel@tonic-gate 	 * No need to retry nor send an "undo" CLOSE in the
27757c478bd9Sstevel@tonic-gate 	 * event the server rebooted.
27767c478bd9Sstevel@tonic-gate 	 */
27777c478bd9Sstevel@tonic-gate 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
27787c478bd9Sstevel@tonic-gate 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
27797c478bd9Sstevel@tonic-gate 		goto done;
27807c478bd9Sstevel@tonic-gate 
27817c478bd9Sstevel@tonic-gate 	/*
27827c478bd9Sstevel@tonic-gate 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
27837c478bd9Sstevel@tonic-gate 	 * to undo.  Undoing locking operations was handled by
27847c478bd9Sstevel@tonic-gate 	 * resend_lock().
27857c478bd9Sstevel@tonic-gate 	 */
27867c478bd9Sstevel@tonic-gate 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
27877c478bd9Sstevel@tonic-gate 		goto done;
27887c478bd9Sstevel@tonic-gate 
27897c478bd9Sstevel@tonic-gate 	/*
27907c478bd9Sstevel@tonic-gate 	 * If we get any other error for OPEN, then don't attempt
27917c478bd9Sstevel@tonic-gate 	 * to undo the resend of the open (since it was never
27927c478bd9Sstevel@tonic-gate 	 * successful!).
27937c478bd9Sstevel@tonic-gate 	 */
27947c478bd9Sstevel@tonic-gate 	ASSERT(lrp->lr_op == OP_OPEN);
27957c478bd9Sstevel@tonic-gate 	if (ep->error || ep->stat != NFS4_OK)
27967c478bd9Sstevel@tonic-gate 		goto done;
27977c478bd9Sstevel@tonic-gate 
27987c478bd9Sstevel@tonic-gate 	/*
27997c478bd9Sstevel@tonic-gate 	 * Now let's undo our OPEN.
28007c478bd9Sstevel@tonic-gate 	 */
28017c478bd9Sstevel@tonic-gate 	nfs4_error_zinit(ep);
28027c478bd9Sstevel@tonic-gate 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
28037c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
28047c478bd9Sstevel@tonic-gate 	    "nfs4close_one: for vp %p got error %d stat %d",
28057c478bd9Sstevel@tonic-gate 	    (void *)vp, ep->error, ep->stat));
28067c478bd9Sstevel@tonic-gate 
28077c478bd9Sstevel@tonic-gate done:
28087c478bd9Sstevel@tonic-gate 	if (vp != lrp->lr_vp)
28097c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
28107c478bd9Sstevel@tonic-gate }
28117c478bd9Sstevel@tonic-gate 
28127c478bd9Sstevel@tonic-gate /*
28137c478bd9Sstevel@tonic-gate  * Close a file that was opened via a resent OPEN.
28147c478bd9Sstevel@tonic-gate  * Most errors are passed back to the caller (via the return value and
28157c478bd9Sstevel@tonic-gate  * *statp), except for FHEXPIRED, which is retried.
28167c478bd9Sstevel@tonic-gate  *
28177c478bd9Sstevel@tonic-gate  * It might be conceptually cleaner to push the CLOSE request onto the
28187c478bd9Sstevel@tonic-gate  * front of the resend queue, rather than sending it here.  That would
28197c478bd9Sstevel@tonic-gate  * match the way we undo lost lock requests.  On the other
28207c478bd9Sstevel@tonic-gate  * hand, we've already got something that works, and there's no reason to
28217c478bd9Sstevel@tonic-gate  * change it at this time.
28227c478bd9Sstevel@tonic-gate  */
28237c478bd9Sstevel@tonic-gate 
28247c478bd9Sstevel@tonic-gate static void
close_after_open_resend(vnode_t * vp,cred_t * cr,uint32_t acc_bits,nfs4_error_t * ep)28257c478bd9Sstevel@tonic-gate close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2826b9238976Sth     nfs4_error_t *ep)
28277c478bd9Sstevel@tonic-gate {
28287c478bd9Sstevel@tonic-gate 
28297c478bd9Sstevel@tonic-gate 	for (;;) {
28307c478bd9Sstevel@tonic-gate 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2831b9238976Sth 		    CLOSE_AFTER_RESEND, 0, 0, 0);
28327c478bd9Sstevel@tonic-gate 		if (ep->error == 0 && ep->stat == NFS4_OK)
28337c478bd9Sstevel@tonic-gate 			break;		/* success; done */
28347c478bd9Sstevel@tonic-gate 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
28357c478bd9Sstevel@tonic-gate 			break;
28367c478bd9Sstevel@tonic-gate 		/* else retry FHEXPIRED */
28377c478bd9Sstevel@tonic-gate 	}
28387c478bd9Sstevel@tonic-gate 
28397c478bd9Sstevel@tonic-gate }
28407c478bd9Sstevel@tonic-gate 
28417c478bd9Sstevel@tonic-gate /*
28427c478bd9Sstevel@tonic-gate  * Resend the given lost lock request.  Return an errno value.  If zero,
28437c478bd9Sstevel@tonic-gate  * *statp is set to the NFS status code for the call.
28447c478bd9Sstevel@tonic-gate  *
28457c478bd9Sstevel@tonic-gate  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
28467c478bd9Sstevel@tonic-gate  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
28477c478bd9Sstevel@tonic-gate  * Let the recovery thread redrive the call if we get a recovery error that
28487c478bd9Sstevel@tonic-gate  * we can actually recover from.
28497c478bd9Sstevel@tonic-gate  */
28507c478bd9Sstevel@tonic-gate static void
resend_lock(nfs4_lost_rqst_t * lrp,nfs4_error_t * ep)28517c478bd9Sstevel@tonic-gate resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
28527c478bd9Sstevel@tonic-gate {
28537c478bd9Sstevel@tonic-gate 	bool_t		send_siglost = FALSE;
28547c478bd9Sstevel@tonic-gate 	vnode_t		*vp = lrp->lr_vp;
28557c478bd9Sstevel@tonic-gate 
28567c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
28577c478bd9Sstevel@tonic-gate 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
28587c478bd9Sstevel@tonic-gate 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
28597c478bd9Sstevel@tonic-gate 
28607c478bd9Sstevel@tonic-gate 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2861b9238976Sth 	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
28627c478bd9Sstevel@tonic-gate 
28637c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
28647c478bd9Sstevel@tonic-gate 	    "nfs4frlock for vp %p returned error %d, stat %d",
28657c478bd9Sstevel@tonic-gate 	    (void *)vp, ep->error, ep->stat));
28667c478bd9Sstevel@tonic-gate 
28677c478bd9Sstevel@tonic-gate 	if (ep->error == 0 && ep->stat == 0)
28687c478bd9Sstevel@tonic-gate 		goto done;
28697c478bd9Sstevel@tonic-gate 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
28707c478bd9Sstevel@tonic-gate 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
28717c478bd9Sstevel@tonic-gate 		goto done;
28727c478bd9Sstevel@tonic-gate 
28737c478bd9Sstevel@tonic-gate 	/*
28747c478bd9Sstevel@tonic-gate 	 * If we failed with a non-recovery error, send SIGLOST and
28757c478bd9Sstevel@tonic-gate 	 * mark the file dead.
28767c478bd9Sstevel@tonic-gate 	 */
28777c478bd9Sstevel@tonic-gate 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
28787c478bd9Sstevel@tonic-gate 		send_siglost = TRUE;
28797c478bd9Sstevel@tonic-gate 	else {
28807c478bd9Sstevel@tonic-gate 		/*
28817c478bd9Sstevel@tonic-gate 		 * Done with recovering LOST LOCK in the event the
28827c478bd9Sstevel@tonic-gate 		 * server rebooted or we've lost the lease.
28837c478bd9Sstevel@tonic-gate 		 */
28847c478bd9Sstevel@tonic-gate 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
28857c478bd9Sstevel@tonic-gate 		    ep->stat == NFS4ERR_STALE_STATEID ||
28867c478bd9Sstevel@tonic-gate 		    ep->stat == NFS4ERR_EXPIRED)) {
28877c478bd9Sstevel@tonic-gate 			goto done;
28887c478bd9Sstevel@tonic-gate 		}
28897c478bd9Sstevel@tonic-gate 
28907c478bd9Sstevel@tonic-gate 		/*
28917c478bd9Sstevel@tonic-gate 		 * BAD_STATEID on an unlock indicates that the server has
28927c478bd9Sstevel@tonic-gate 		 * forgotten about the lock anyway, so act like the call
28937c478bd9Sstevel@tonic-gate 		 * was successful.
28947c478bd9Sstevel@tonic-gate 		 */
28957c478bd9Sstevel@tonic-gate 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
28967c478bd9Sstevel@tonic-gate 		    lrp->lr_op == OP_LOCKU)
28977c478bd9Sstevel@tonic-gate 			goto done;
28987c478bd9Sstevel@tonic-gate 
28997c478bd9Sstevel@tonic-gate 		/*
29007c478bd9Sstevel@tonic-gate 		 * If we got a recovery error that we don't actually
29017c478bd9Sstevel@tonic-gate 		 * recover from, send SIGLOST.  If the filesystem was
29027c478bd9Sstevel@tonic-gate 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
29037c478bd9Sstevel@tonic-gate 		 * unnecessary noise, and (b) there could be a new process
29047c478bd9Sstevel@tonic-gate 		 * with the same pid as the one that had generated the lost
29057c478bd9Sstevel@tonic-gate 		 * state request.
29067c478bd9Sstevel@tonic-gate 		 */
29077c478bd9Sstevel@tonic-gate 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
29087c478bd9Sstevel@tonic-gate 		    nfs4_recov_marks_dead(ep->stat))) {
29097c478bd9Sstevel@tonic-gate 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
29107c478bd9Sstevel@tonic-gate 				send_siglost = TRUE;
29117c478bd9Sstevel@tonic-gate 			goto done;
29127c478bd9Sstevel@tonic-gate 		}
29137c478bd9Sstevel@tonic-gate 
29147c478bd9Sstevel@tonic-gate 		/*
29157c478bd9Sstevel@tonic-gate 		 * If the filesystem was forcibly unmounted, we
29167c478bd9Sstevel@tonic-gate 		 * still need to synchronize with the server and
29177c478bd9Sstevel@tonic-gate 		 * release state.  Try again later.
29187c478bd9Sstevel@tonic-gate 		 */
29197c478bd9Sstevel@tonic-gate 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
29207c478bd9Sstevel@tonic-gate 			goto done;
29217c478bd9Sstevel@tonic-gate 
29227c478bd9Sstevel@tonic-gate 		/*
29237c478bd9Sstevel@tonic-gate 		 * If we get a recovery error that we can actually
29247c478bd9Sstevel@tonic-gate 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
29257c478bd9Sstevel@tonic-gate 		 * return and let the recovery thread redrive the call.
29267c478bd9Sstevel@tonic-gate 		 *
29277c478bd9Sstevel@tonic-gate 		 * For the three errors below, we want to delay a bit
29287c478bd9Sstevel@tonic-gate 		 * instead of pounding the server into submission.
29297c478bd9Sstevel@tonic-gate 		 */
29307c478bd9Sstevel@tonic-gate 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
29317c478bd9Sstevel@tonic-gate 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
29327c478bd9Sstevel@tonic-gate 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
29337c478bd9Sstevel@tonic-gate 			delay(SEC_TO_TICK(recov_err_delay));
29347c478bd9Sstevel@tonic-gate 		goto done;
29357c478bd9Sstevel@tonic-gate 	}
29367c478bd9Sstevel@tonic-gate 
29377c478bd9Sstevel@tonic-gate done:
29387c478bd9Sstevel@tonic-gate 	if (send_siglost) {
29397c478bd9Sstevel@tonic-gate 		cred_t *sv_cred;
29407c478bd9Sstevel@tonic-gate 
29417c478bd9Sstevel@tonic-gate 		/*
29427c478bd9Sstevel@tonic-gate 		 * Must be root or the actual thread being issued the
29437c478bd9Sstevel@tonic-gate 		 * SIGLOST for this to work, so just become root.
29447c478bd9Sstevel@tonic-gate 		 */
29457c478bd9Sstevel@tonic-gate 		sv_cred = curthread->t_cred;
29467c478bd9Sstevel@tonic-gate 		curthread->t_cred = kcred;
29477c478bd9Sstevel@tonic-gate 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
29487c478bd9Sstevel@tonic-gate 		    ep->error, ep->stat);
29497c478bd9Sstevel@tonic-gate 		curthread->t_cred = sv_cred;
29507c478bd9Sstevel@tonic-gate 
29517c478bd9Sstevel@tonic-gate 		/*
29527c478bd9Sstevel@tonic-gate 		 * Flush any additional reinstantiation requests for
29537c478bd9Sstevel@tonic-gate 		 * this operation.  Sending multiple SIGLOSTs to the user
29547c478bd9Sstevel@tonic-gate 		 * process is unlikely to help and may cause trouble.
29557c478bd9Sstevel@tonic-gate 		 */
29567c478bd9Sstevel@tonic-gate 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
29577c478bd9Sstevel@tonic-gate 			flush_reinstate(lrp);
29587c478bd9Sstevel@tonic-gate 	}
29597c478bd9Sstevel@tonic-gate }
29607c478bd9Sstevel@tonic-gate 
29617c478bd9Sstevel@tonic-gate /*
29627c478bd9Sstevel@tonic-gate  * Remove any lock reinstantiation requests that correspond to the given
29637c478bd9Sstevel@tonic-gate  * lost request.  We only remove items that follow lrp in the queue,
29647c478bd9Sstevel@tonic-gate  * assuming that lrp will be removed by the generic lost state code.
29657c478bd9Sstevel@tonic-gate  */
29667c478bd9Sstevel@tonic-gate 
29677c478bd9Sstevel@tonic-gate static void
flush_reinstate(nfs4_lost_rqst_t * lrp)29687c478bd9Sstevel@tonic-gate flush_reinstate(nfs4_lost_rqst_t *lrp)
29697c478bd9Sstevel@tonic-gate {
29707c478bd9Sstevel@tonic-gate 	vnode_t *vp;
29717c478bd9Sstevel@tonic-gate 	pid_t pid;
29727c478bd9Sstevel@tonic-gate 	mntinfo4_t *mi;
29737c478bd9Sstevel@tonic-gate 	nfs4_lost_rqst_t *nlrp;
29747c478bd9Sstevel@tonic-gate 
29757c478bd9Sstevel@tonic-gate 	vp = lrp->lr_vp;
29767c478bd9Sstevel@tonic-gate 	mi = VTOMI4(vp);
29777c478bd9Sstevel@tonic-gate 	pid = lrp->lr_flk->l_pid;
29787c478bd9Sstevel@tonic-gate 
29797c478bd9Sstevel@tonic-gate 	/*
29807c478bd9Sstevel@tonic-gate 	 * If there are any more reinstantation requests to get rid of,
29817c478bd9Sstevel@tonic-gate 	 * they should all be clustered at the front of the lost state
29827c478bd9Sstevel@tonic-gate 	 * queue.
29837c478bd9Sstevel@tonic-gate 	 */
29847c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
29857c478bd9Sstevel@tonic-gate 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
29867c478bd9Sstevel@tonic-gate 	    lrp = nlrp) {
29877c478bd9Sstevel@tonic-gate 		nlrp = list_next(&mi->mi_lost_state, lrp);
29887c478bd9Sstevel@tonic-gate 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
29897c478bd9Sstevel@tonic-gate 			break;
29907c478bd9Sstevel@tonic-gate 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
29917c478bd9Sstevel@tonic-gate 			break;
29927c478bd9Sstevel@tonic-gate 		ASSERT(lrp->lr_vp == vp);
29937c478bd9Sstevel@tonic-gate 		ASSERT(lrp->lr_flk->l_pid == pid);
29947c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2995b9238976Sth 		    "remove reinstantiation %p", (void *)lrp));
29967c478bd9Sstevel@tonic-gate 		list_remove(&mi->mi_lost_state, lrp);
29977c478bd9Sstevel@tonic-gate 		nfs4_free_lost_rqst(lrp, NULL);
29987c478bd9Sstevel@tonic-gate 	}
29997c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
30007c478bd9Sstevel@tonic-gate }
30017c478bd9Sstevel@tonic-gate 
30027c478bd9Sstevel@tonic-gate /*
30037c478bd9Sstevel@tonic-gate  * End of state-specific recovery routines.
30047c478bd9Sstevel@tonic-gate  */
30057c478bd9Sstevel@tonic-gate 
30067c478bd9Sstevel@tonic-gate /*
30077c478bd9Sstevel@tonic-gate  * Allocate a lost request struct, initialize it from lost_rqstp (including
30087c478bd9Sstevel@tonic-gate  * bumping the reference counts for the referenced vnode, etc.), and hang
30097c478bd9Sstevel@tonic-gate  * it off of recovp.
30107c478bd9Sstevel@tonic-gate  */
30117c478bd9Sstevel@tonic-gate 
30127c478bd9Sstevel@tonic-gate static void
nfs4_save_lost_rqst(nfs4_lost_rqst_t * lost_rqstp,recov_info_t * recovp,nfs4_recov_t * action,mntinfo4_t * mi)30137c478bd9Sstevel@tonic-gate nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
3014b9238976Sth     nfs4_recov_t *action, mntinfo4_t *mi)
30157c478bd9Sstevel@tonic-gate {
30167c478bd9Sstevel@tonic-gate 	nfs4_lost_rqst_t *destp;
30177c478bd9Sstevel@tonic-gate 
30187c478bd9Sstevel@tonic-gate 	ASSERT(recovp->rc_lost_rqst == NULL);
30197c478bd9Sstevel@tonic-gate 
30207c478bd9Sstevel@tonic-gate 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
30217c478bd9Sstevel@tonic-gate 	recovp->rc_lost_rqst = destp;
30227c478bd9Sstevel@tonic-gate 
30237c478bd9Sstevel@tonic-gate 	if (lost_rqstp->lr_op == OP_LOCK ||
30247c478bd9Sstevel@tonic-gate 	    lost_rqstp->lr_op == OP_LOCKU) {
30257c478bd9Sstevel@tonic-gate 		ASSERT(lost_rqstp->lr_lop);
30267c478bd9Sstevel@tonic-gate 		*action = NR_LOST_LOCK;
30277c478bd9Sstevel@tonic-gate 		destp->lr_ctype = lost_rqstp->lr_ctype;
30287c478bd9Sstevel@tonic-gate 		destp->lr_locktype = lost_rqstp->lr_locktype;
30297c478bd9Sstevel@tonic-gate 	} else if (lost_rqstp->lr_op == OP_OPEN) {
30307c478bd9Sstevel@tonic-gate 		component4 *srcfp, *destfp;
30317c478bd9Sstevel@tonic-gate 
30327c478bd9Sstevel@tonic-gate 		destp->lr_oacc = lost_rqstp->lr_oacc;
30337c478bd9Sstevel@tonic-gate 		destp->lr_odeny = lost_rqstp->lr_odeny;
30347c478bd9Sstevel@tonic-gate 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
30357c478bd9Sstevel@tonic-gate 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
30367c478bd9Sstevel@tonic-gate 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
30377c478bd9Sstevel@tonic-gate 
30387c478bd9Sstevel@tonic-gate 		srcfp = &lost_rqstp->lr_ofile;
30397c478bd9Sstevel@tonic-gate 		destfp = &destp->lr_ofile;
30407c478bd9Sstevel@tonic-gate 		/*
30417c478bd9Sstevel@tonic-gate 		 * Consume caller's utf8string
30427c478bd9Sstevel@tonic-gate 		 */
30437c478bd9Sstevel@tonic-gate 		destfp->utf8string_len = srcfp->utf8string_len;
30447c478bd9Sstevel@tonic-gate 		destfp->utf8string_val = srcfp->utf8string_val;
30457c478bd9Sstevel@tonic-gate 		srcfp->utf8string_len = 0;
30467c478bd9Sstevel@tonic-gate 		srcfp->utf8string_val = NULL;	/* make sure not reused */
30477c478bd9Sstevel@tonic-gate 
30487c478bd9Sstevel@tonic-gate 		*action = NR_LOST_STATE_RQST;
30497c478bd9Sstevel@tonic-gate 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
30507c478bd9Sstevel@tonic-gate 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
30517c478bd9Sstevel@tonic-gate 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
30527c478bd9Sstevel@tonic-gate 
30537c478bd9Sstevel@tonic-gate 		*action = NR_LOST_STATE_RQST;
30547c478bd9Sstevel@tonic-gate 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
30557c478bd9Sstevel@tonic-gate 		ASSERT(lost_rqstp->lr_oop);
30567c478bd9Sstevel@tonic-gate 		*action = NR_LOST_STATE_RQST;
30577c478bd9Sstevel@tonic-gate 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
30587c478bd9Sstevel@tonic-gate 		*action = NR_LOST_STATE_RQST;
30597c478bd9Sstevel@tonic-gate 	} else {
30607c478bd9Sstevel@tonic-gate #ifdef DEBUG
30617c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3062b9238976Sth 		    lost_rqstp->lr_op);
30637c478bd9Sstevel@tonic-gate #endif
30647c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
30657c478bd9Sstevel@tonic-gate 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
30667c478bd9Sstevel@tonic-gate 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
30677c478bd9Sstevel@tonic-gate 		*action = NR_UNUSED;
30687c478bd9Sstevel@tonic-gate 		recovp->rc_lost_rqst = NULL;
30697c478bd9Sstevel@tonic-gate 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
30707c478bd9Sstevel@tonic-gate 		return;
30717c478bd9Sstevel@tonic-gate 	}
30727c478bd9Sstevel@tonic-gate 
30737c478bd9Sstevel@tonic-gate 	destp->lr_op = lost_rqstp->lr_op;
30747c478bd9Sstevel@tonic-gate 	destp->lr_vp = lost_rqstp->lr_vp;
30757c478bd9Sstevel@tonic-gate 	if (destp->lr_vp)
30767c478bd9Sstevel@tonic-gate 		VN_HOLD(destp->lr_vp);
30777c478bd9Sstevel@tonic-gate 	destp->lr_dvp = lost_rqstp->lr_dvp;
30787c478bd9Sstevel@tonic-gate 	if (destp->lr_dvp)
30797c478bd9Sstevel@tonic-gate 		VN_HOLD(destp->lr_dvp);
30807c478bd9Sstevel@tonic-gate 	destp->lr_oop = lost_rqstp->lr_oop;
30817c478bd9Sstevel@tonic-gate 	if (destp->lr_oop)
30827c478bd9Sstevel@tonic-gate 		open_owner_hold(destp->lr_oop);
30837c478bd9Sstevel@tonic-gate 	destp->lr_osp = lost_rqstp->lr_osp;
30847c478bd9Sstevel@tonic-gate 	if (destp->lr_osp)
30857c478bd9Sstevel@tonic-gate 		open_stream_hold(destp->lr_osp);
30867c478bd9Sstevel@tonic-gate 	destp->lr_lop = lost_rqstp->lr_lop;
30877c478bd9Sstevel@tonic-gate 	if (destp->lr_lop)
30887c478bd9Sstevel@tonic-gate 		lock_owner_hold(destp->lr_lop);
30897c478bd9Sstevel@tonic-gate 	destp->lr_cr = lost_rqstp->lr_cr;
30907c478bd9Sstevel@tonic-gate 	if (destp->lr_cr)
30917c478bd9Sstevel@tonic-gate 		crhold(destp->lr_cr);
30927c478bd9Sstevel@tonic-gate 	if (lost_rqstp->lr_flk == NULL)
30937c478bd9Sstevel@tonic-gate 		destp->lr_flk = NULL;
30947c478bd9Sstevel@tonic-gate 	else {
30957c478bd9Sstevel@tonic-gate 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
30967c478bd9Sstevel@tonic-gate 		*destp->lr_flk = *lost_rqstp->lr_flk;
30977c478bd9Sstevel@tonic-gate 	}
30987c478bd9Sstevel@tonic-gate 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
30997c478bd9Sstevel@tonic-gate }
31007c478bd9Sstevel@tonic-gate 
31017c478bd9Sstevel@tonic-gate /*
31027c478bd9Sstevel@tonic-gate  * Map the given return values (errno and nfs4 status code) to a recovery
31037c478bd9Sstevel@tonic-gate  * action and fill in the following fields of recovp: rc_action,
31047c478bd9Sstevel@tonic-gate  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
31057c478bd9Sstevel@tonic-gate  */
31067c478bd9Sstevel@tonic-gate 
31077c478bd9Sstevel@tonic-gate void
errs_to_action(recov_info_t * recovp,nfs4_server_t * sp,mntinfo4_t * mi,stateid4 * sidp,nfs4_lost_rqst_t * lost_rqstp,int unmounted,nfs_opnum4 op,nfs4_bseqid_entry_t * bsep)31087c478bd9Sstevel@tonic-gate errs_to_action(recov_info_t *recovp,
3109b9238976Sth     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3110b9238976Sth     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3111b9238976Sth     nfs4_bseqid_entry_t *bsep)
31127c478bd9Sstevel@tonic-gate {
31137c478bd9Sstevel@tonic-gate 	nfs4_recov_t action = NR_UNUSED;
31147c478bd9Sstevel@tonic-gate 	bool_t reboot = FALSE;
31157c478bd9Sstevel@tonic-gate 	int try_f;
31167c478bd9Sstevel@tonic-gate 	int error = recovp->rc_orig_errors.error;
31177c478bd9Sstevel@tonic-gate 	nfsstat4 stat = recovp->rc_orig_errors.stat;
31187c478bd9Sstevel@tonic-gate 
31197c478bd9Sstevel@tonic-gate 	bzero(&recovp->rc_stateid, sizeof (stateid4));
31207c478bd9Sstevel@tonic-gate 	recovp->rc_lost_rqst = NULL;
31217c478bd9Sstevel@tonic-gate 	recovp->rc_bseqid_rqst = NULL;
31227c478bd9Sstevel@tonic-gate 
31237c478bd9Sstevel@tonic-gate 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3124b9238976Sth 	    FAILOVER_MOUNT4(mi);
31257c478bd9Sstevel@tonic-gate 
31267c478bd9Sstevel@tonic-gate 	/*
31277c478bd9Sstevel@tonic-gate 	 * We start recovery for EINTR only in the lost lock
31287c478bd9Sstevel@tonic-gate 	 * or lost open/close case.
31297c478bd9Sstevel@tonic-gate 	 */
31307c478bd9Sstevel@tonic-gate 
31317c478bd9Sstevel@tonic-gate 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
31327c478bd9Sstevel@tonic-gate 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
31337c478bd9Sstevel@tonic-gate 		if (lost_rqstp) {
31347c478bd9Sstevel@tonic-gate 			ASSERT(lost_rqstp->lr_op != 0);
31357c478bd9Sstevel@tonic-gate 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
31367c478bd9Sstevel@tonic-gate 		}
31377c478bd9Sstevel@tonic-gate 		if (try_f)
31387c478bd9Sstevel@tonic-gate 			action = NR_FAILOVER;
31397c478bd9Sstevel@tonic-gate 	} else if (error != 0) {
31407c478bd9Sstevel@tonic-gate 		recovp->rc_error = error;
31417c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
31427c478bd9Sstevel@tonic-gate 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
31437c478bd9Sstevel@tonic-gate 		action = NR_CLIENTID;
31447c478bd9Sstevel@tonic-gate 	} else {
31457c478bd9Sstevel@tonic-gate 		recovp->rc_error = geterrno4(stat);
31467c478bd9Sstevel@tonic-gate 		switch (stat) {
31477c478bd9Sstevel@tonic-gate #ifdef notyet
31487c478bd9Sstevel@tonic-gate 		case NFS4ERR_LEASE_MOVED:
31497c478bd9Sstevel@tonic-gate 			action = xxx;
31507c478bd9Sstevel@tonic-gate 			break;
31512f172c55SRobert Thurlow #endif
31527c478bd9Sstevel@tonic-gate 		case NFS4ERR_MOVED:
31532f172c55SRobert Thurlow 			action = NR_MOVED;
31547c478bd9Sstevel@tonic-gate 			break;
31557c478bd9Sstevel@tonic-gate 		case NFS4ERR_BADHANDLE:
31567c478bd9Sstevel@tonic-gate 			action = NR_BADHANDLE;
31577c478bd9Sstevel@tonic-gate 			break;
31587c478bd9Sstevel@tonic-gate 		case NFS4ERR_BAD_SEQID:
31597c478bd9Sstevel@tonic-gate 			if (bsep)
31607c478bd9Sstevel@tonic-gate 				save_bseqid_rqst(bsep, recovp);
31617c478bd9Sstevel@tonic-gate 			action = NR_BAD_SEQID;
31627c478bd9Sstevel@tonic-gate 			break;
31637c478bd9Sstevel@tonic-gate 		case NFS4ERR_OLD_STATEID:
31647c478bd9Sstevel@tonic-gate 			action = NR_OLDSTATEID;
31657c478bd9Sstevel@tonic-gate 			break;
31667c478bd9Sstevel@tonic-gate 		case NFS4ERR_WRONGSEC:
31677c478bd9Sstevel@tonic-gate 			action = NR_WRONGSEC;
31687c478bd9Sstevel@tonic-gate 			break;
31697c478bd9Sstevel@tonic-gate 		case NFS4ERR_FHEXPIRED:
31707c478bd9Sstevel@tonic-gate 			action = NR_FHEXPIRED;
31717c478bd9Sstevel@tonic-gate 			break;
31727c478bd9Sstevel@tonic-gate 		case NFS4ERR_BAD_STATEID:
31737c478bd9Sstevel@tonic-gate 			if (sp == NULL || (sp != NULL && inlease(sp))) {
31747c478bd9Sstevel@tonic-gate 
31757c478bd9Sstevel@tonic-gate 				action = NR_BAD_STATEID;
31767c478bd9Sstevel@tonic-gate 				if (sidp)
31777c478bd9Sstevel@tonic-gate 					recovp->rc_stateid = *sidp;
31787c478bd9Sstevel@tonic-gate 			} else
31797c478bd9Sstevel@tonic-gate 				action = NR_CLIENTID;
31807c478bd9Sstevel@tonic-gate 			break;
31817c478bd9Sstevel@tonic-gate 		case NFS4ERR_EXPIRED:
31827c478bd9Sstevel@tonic-gate 			/*
31837c478bd9Sstevel@tonic-gate 			 * The client's lease has expired, either due
31847c478bd9Sstevel@tonic-gate 			 * to a network partition or perhaps a client
31857c478bd9Sstevel@tonic-gate 			 * error.  In either case, try an NR_CLIENTID
31867c478bd9Sstevel@tonic-gate 			 * style recovery.  reboot remains false, since
31877c478bd9Sstevel@tonic-gate 			 * there is no evidence the server has rebooted.
31887c478bd9Sstevel@tonic-gate 			 * This will cause CLAIM_NULL opens and lock
31897c478bd9Sstevel@tonic-gate 			 * requests without the reclaim bit.
31907c478bd9Sstevel@tonic-gate 			 */
31917c478bd9Sstevel@tonic-gate 			action = NR_CLIENTID;
31927c478bd9Sstevel@tonic-gate 
31937c478bd9Sstevel@tonic-gate 			DTRACE_PROBE4(nfs4__expired,
3194b9238976Sth 			    nfs4_server_t *, sp,
3195b9238976Sth 			    mntinfo4_t *, mi,
3196b9238976Sth 			    stateid4 *, sidp, int, op);
31977c478bd9Sstevel@tonic-gate 
31987c478bd9Sstevel@tonic-gate 			break;
31997c478bd9Sstevel@tonic-gate 		case NFS4ERR_STALE_CLIENTID:
32007c478bd9Sstevel@tonic-gate 		case NFS4ERR_STALE_STATEID:
32017c478bd9Sstevel@tonic-gate 			action = NR_CLIENTID;
32027c478bd9Sstevel@tonic-gate 			reboot = TRUE;
32037c478bd9Sstevel@tonic-gate 			break;
32047c478bd9Sstevel@tonic-gate 		case NFS4ERR_RESOURCE:
32057c478bd9Sstevel@tonic-gate 			/*
32067c478bd9Sstevel@tonic-gate 			 * If this had been a FAILOVER mount, then
32077c478bd9Sstevel@tonic-gate 			 * we'd have tried failover.  Since it's not,
32087c478bd9Sstevel@tonic-gate 			 * just delay a while and retry.
32097c478bd9Sstevel@tonic-gate 			 */
32107c478bd9Sstevel@tonic-gate 			action = NR_DELAY;
32117c478bd9Sstevel@tonic-gate 			break;
32127c478bd9Sstevel@tonic-gate 		case NFS4ERR_GRACE:
32137c478bd9Sstevel@tonic-gate 			action = NR_GRACE;
32147c478bd9Sstevel@tonic-gate 			break;
32157c478bd9Sstevel@tonic-gate 		case NFS4ERR_DELAY:
32167c478bd9Sstevel@tonic-gate 			action = NR_DELAY;
32177c478bd9Sstevel@tonic-gate 			break;
32187c478bd9Sstevel@tonic-gate 		case NFS4ERR_STALE:
32197c478bd9Sstevel@tonic-gate 			action = NR_STALE;
32207c478bd9Sstevel@tonic-gate 			break;
32217c478bd9Sstevel@tonic-gate 		default:
32227c478bd9Sstevel@tonic-gate 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
32237c478bd9Sstevel@tonic-gate 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
32247c478bd9Sstevel@tonic-gate 			    0, 0);
32257c478bd9Sstevel@tonic-gate 			action = NR_CLIENTID;
32267c478bd9Sstevel@tonic-gate 			break;
32277c478bd9Sstevel@tonic-gate 		}
32287c478bd9Sstevel@tonic-gate 	}
32297c478bd9Sstevel@tonic-gate 
32307c478bd9Sstevel@tonic-gate 	/* make sure action got set */
32317c478bd9Sstevel@tonic-gate 	ASSERT(action != NR_UNUSED);
32327c478bd9Sstevel@tonic-gate 	recovp->rc_srv_reboot = reboot;
32337c478bd9Sstevel@tonic-gate 	recovp->rc_action = action;
32347c478bd9Sstevel@tonic-gate 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3235b9238976Sth 	    NULL);
32367c478bd9Sstevel@tonic-gate }
32377c478bd9Sstevel@tonic-gate 
32387c478bd9Sstevel@tonic-gate /*
32397c478bd9Sstevel@tonic-gate  * Return the (held) credential for the process with the given pid.
32407c478bd9Sstevel@tonic-gate  * May return NULL (e.g., process not found).
32417c478bd9Sstevel@tonic-gate  */
32427c478bd9Sstevel@tonic-gate 
32437c478bd9Sstevel@tonic-gate static cred_t *
pid_to_cr(pid_t pid)32447c478bd9Sstevel@tonic-gate pid_to_cr(pid_t pid)
32457c478bd9Sstevel@tonic-gate {
32467c478bd9Sstevel@tonic-gate 	proc_t *p;
32477c478bd9Sstevel@tonic-gate 	cred_t *cr;
32487c478bd9Sstevel@tonic-gate 
32497c478bd9Sstevel@tonic-gate 	mutex_enter(&pidlock);
32507c478bd9Sstevel@tonic-gate 	if ((p = prfind(pid)) == NULL) {
32517c478bd9Sstevel@tonic-gate 		mutex_exit(&pidlock);
32527c478bd9Sstevel@tonic-gate 		return (NULL);
32537c478bd9Sstevel@tonic-gate 	}
32547c478bd9Sstevel@tonic-gate 
32557c478bd9Sstevel@tonic-gate 	mutex_enter(&p->p_crlock);
32567c478bd9Sstevel@tonic-gate 	crhold(cr = p->p_cred);
32577c478bd9Sstevel@tonic-gate 	mutex_exit(&p->p_crlock);
32587c478bd9Sstevel@tonic-gate 	mutex_exit(&pidlock);
32597c478bd9Sstevel@tonic-gate 
32607c478bd9Sstevel@tonic-gate 	return (cr);
32617c478bd9Sstevel@tonic-gate }
32627c478bd9Sstevel@tonic-gate 
32637c478bd9Sstevel@tonic-gate /*
32647c478bd9Sstevel@tonic-gate  * Send SIGLOST to the given process and queue the event.
32657c478bd9Sstevel@tonic-gate  *
32667c478bd9Sstevel@tonic-gate  * The 'dump' boolean tells us whether this action should dump the
32677c478bd9Sstevel@tonic-gate  * in-kernel queue of recovery messages or not.
32687c478bd9Sstevel@tonic-gate  */
32697c478bd9Sstevel@tonic-gate 
32707c478bd9Sstevel@tonic-gate void
nfs4_send_siglost(pid_t pid,mntinfo4_t * mi,vnode_t * vp,bool_t dump,int error,nfsstat4 stat)32717c478bd9Sstevel@tonic-gate nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
32727c478bd9Sstevel@tonic-gate     int error, nfsstat4 stat)
32737c478bd9Sstevel@tonic-gate {
32747c478bd9Sstevel@tonic-gate 	proc_t *p;
32757c478bd9Sstevel@tonic-gate 
32767c478bd9Sstevel@tonic-gate 	mutex_enter(&pidlock);
32777c478bd9Sstevel@tonic-gate 	p = prfind(pid);
32787c478bd9Sstevel@tonic-gate 	if (p)
32797c478bd9Sstevel@tonic-gate 		psignal(p, SIGLOST);
32807c478bd9Sstevel@tonic-gate 	mutex_exit(&pidlock);
32817c478bd9Sstevel@tonic-gate 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
32827c478bd9Sstevel@tonic-gate 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
32837c478bd9Sstevel@tonic-gate }
32847c478bd9Sstevel@tonic-gate 
32857c478bd9Sstevel@tonic-gate /*
3286*faf39f17SMarcel Telka  * Scan the lock list for entries that match the given pid.  Unregister those
3287*faf39f17SMarcel Telka  * locks that do and change their pid to NOPID.
32887c478bd9Sstevel@tonic-gate  */
32897c478bd9Sstevel@tonic-gate 
32907c478bd9Sstevel@tonic-gate static void
relock_skip_pid(vnode_t * vp,locklist_t * llp,pid_t pid)3291*faf39f17SMarcel Telka relock_skip_pid(vnode_t *vp, locklist_t *llp, pid_t pid)
32927c478bd9Sstevel@tonic-gate {
32937c478bd9Sstevel@tonic-gate 	for (; llp != NULL; llp = llp->ll_next) {
3294*faf39f17SMarcel Telka 		if (llp->ll_flock.l_pid == pid) {
3295*faf39f17SMarcel Telka 			int r;
3296*faf39f17SMarcel Telka 
3297*faf39f17SMarcel Telka 			/*
3298*faf39f17SMarcel Telka 			 * Unregister the lost lock.
3299*faf39f17SMarcel Telka 			 */
3300*faf39f17SMarcel Telka 			llp->ll_flock.l_type = F_UNLCK;
3301*faf39f17SMarcel Telka 			r = reclock(vp, &llp->ll_flock, SETFLCK, FREAD | FWRITE,
3302*faf39f17SMarcel Telka 			    0, NULL);
3303*faf39f17SMarcel Telka 			/* The unlock cannot fail */
3304*faf39f17SMarcel Telka 			ASSERT(r == 0);
3305*faf39f17SMarcel Telka 
33067c478bd9Sstevel@tonic-gate 			llp->ll_flock.l_pid = NOPID;
3307*faf39f17SMarcel Telka 		}
33087c478bd9Sstevel@tonic-gate 	}
33097c478bd9Sstevel@tonic-gate }
33107c478bd9Sstevel@tonic-gate 
33117c478bd9Sstevel@tonic-gate /*
33127c478bd9Sstevel@tonic-gate  * Mark a file as having failed recovery, after making a last-ditch effort
33137c478bd9Sstevel@tonic-gate  * to return any delegation.
33147c478bd9Sstevel@tonic-gate  *
33157c478bd9Sstevel@tonic-gate  * Sets r_error to EIO or ESTALE for the given vnode.
33167c478bd9Sstevel@tonic-gate  */
33177c478bd9Sstevel@tonic-gate void
nfs4_fail_recov(vnode_t * vp,char * why,int error,nfsstat4 stat)33187c478bd9Sstevel@tonic-gate nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
33197c478bd9Sstevel@tonic-gate {
33207c478bd9Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
33217c478bd9Sstevel@tonic-gate 
33227c478bd9Sstevel@tonic-gate #ifdef DEBUG
33237c478bd9Sstevel@tonic-gate 	if (nfs4_fail_recov_stop)
33247c478bd9Sstevel@tonic-gate 		debug_enter("nfs4_fail_recov");
33257c478bd9Sstevel@tonic-gate #endif
33267c478bd9Sstevel@tonic-gate 
33277c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
33287c478bd9Sstevel@tonic-gate 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
33297c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
33307c478bd9Sstevel@tonic-gate 		return;
33317c478bd9Sstevel@tonic-gate 	}
33327c478bd9Sstevel@tonic-gate 
33337c478bd9Sstevel@tonic-gate 	/*
33347c478bd9Sstevel@tonic-gate 	 * Set R4RECOVERRP to indicate that a recovery error is in
33357c478bd9Sstevel@tonic-gate 	 * progress.  This will shut down reads and writes at the top
33367c478bd9Sstevel@tonic-gate 	 * half.  Don't set R4RECOVERR until after we've returned the
33377c478bd9Sstevel@tonic-gate 	 * delegation, otherwise it will fail.
33387c478bd9Sstevel@tonic-gate 	 */
33397c478bd9Sstevel@tonic-gate 
33407c478bd9Sstevel@tonic-gate 	rp->r_flags |= R4RECOVERRP;
33417c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
33427c478bd9Sstevel@tonic-gate 
33437c478bd9Sstevel@tonic-gate 	nfs4delegabandon(rp);
33447c478bd9Sstevel@tonic-gate 
33457c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
33467c478bd9Sstevel@tonic-gate 	rp->r_flags |= (R4RECOVERR | R4STALE);
33477c478bd9Sstevel@tonic-gate 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
33487c478bd9Sstevel@tonic-gate 	PURGE_ATTRCACHE4_LOCKED(rp);
33497c478bd9Sstevel@tonic-gate 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
33507c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
33517c478bd9Sstevel@tonic-gate 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
33527c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
33537c478bd9Sstevel@tonic-gate 
33547c478bd9Sstevel@tonic-gate 	dnlc_purge_vp(vp);
33557c478bd9Sstevel@tonic-gate }
33567c478bd9Sstevel@tonic-gate 
33577c478bd9Sstevel@tonic-gate /*
33587c478bd9Sstevel@tonic-gate  * recov_throttle: if the file had the same recovery action within the
33597c478bd9Sstevel@tonic-gate  * throttle interval, wait for the throttle interval to finish before
33607c478bd9Sstevel@tonic-gate  * proceeding.
33617c478bd9Sstevel@tonic-gate  *
33627c478bd9Sstevel@tonic-gate  * Side effects: updates the rnode with the current recovery information.
33637c478bd9Sstevel@tonic-gate  */
33647c478bd9Sstevel@tonic-gate 
33657c478bd9Sstevel@tonic-gate static void
recov_throttle(recov_info_t * recovp,vnode_t * vp)33667c478bd9Sstevel@tonic-gate recov_throttle(recov_info_t *recovp, vnode_t *vp)
33677c478bd9Sstevel@tonic-gate {
33687c478bd9Sstevel@tonic-gate 	time_t curtime, time_to_wait;
33697c478bd9Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
33707c478bd9Sstevel@tonic-gate 
33717c478bd9Sstevel@tonic-gate 	curtime = gethrestime_sec();
33727c478bd9Sstevel@tonic-gate 
33737c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
33747c478bd9Sstevel@tonic-gate 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3375b9238976Sth 	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3376b9238976Sth 	    recovp->rc_action, curtime,
3377b9238976Sth 	    rp->r_recov_act, rp->r_last_recov));
33787c478bd9Sstevel@tonic-gate 	if (recovp->rc_action == rp->r_recov_act &&
33797c478bd9Sstevel@tonic-gate 	    rp->r_last_recov + recov_err_delay > curtime) {
33807c478bd9Sstevel@tonic-gate 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
33817c478bd9Sstevel@tonic-gate 		mutex_exit(&rp->r_statelock);
33827c478bd9Sstevel@tonic-gate 		delay(SEC_TO_TICK(time_to_wait));
33837c478bd9Sstevel@tonic-gate 		curtime = gethrestime_sec();
33847c478bd9Sstevel@tonic-gate 		mutex_enter(&rp->r_statelock);
33857c478bd9Sstevel@tonic-gate 	}
33867c478bd9Sstevel@tonic-gate 
33877c478bd9Sstevel@tonic-gate 	rp->r_last_recov = curtime;
33887c478bd9Sstevel@tonic-gate 	rp->r_recov_act = recovp->rc_action;
33897c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
33907c478bd9Sstevel@tonic-gate }
33917c478bd9Sstevel@tonic-gate 
33927c478bd9Sstevel@tonic-gate /*
33937c478bd9Sstevel@tonic-gate  * React to NFS4ERR_GRACE by setting the time we'll permit
33947c478bd9Sstevel@tonic-gate  * the next call to this filesystem.
33957c478bd9Sstevel@tonic-gate  */
33967c478bd9Sstevel@tonic-gate void
nfs4_set_grace_wait(mntinfo4_t * mi)33977c478bd9Sstevel@tonic-gate nfs4_set_grace_wait(mntinfo4_t *mi)
33987c478bd9Sstevel@tonic-gate {
33997c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
34007c478bd9Sstevel@tonic-gate 	/* Mark the time for the future */
34017c478bd9Sstevel@tonic-gate 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
34027c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
34037c478bd9Sstevel@tonic-gate }
34047c478bd9Sstevel@tonic-gate 
34057c478bd9Sstevel@tonic-gate /*
34067c478bd9Sstevel@tonic-gate  * React to MFS4ERR_DELAY by setting the time we'll permit
34077c478bd9Sstevel@tonic-gate  * the next call to this vnode.
34087c478bd9Sstevel@tonic-gate  */
34097c478bd9Sstevel@tonic-gate void
nfs4_set_delay_wait(vnode_t * vp)34107c478bd9Sstevel@tonic-gate nfs4_set_delay_wait(vnode_t *vp)
34117c478bd9Sstevel@tonic-gate {
34127c478bd9Sstevel@tonic-gate 	rnode4_t *rp = VTOR4(vp);
34137c478bd9Sstevel@tonic-gate 
34147c478bd9Sstevel@tonic-gate 	mutex_enter(&rp->r_statelock);
34157c478bd9Sstevel@tonic-gate 	/*
34167c478bd9Sstevel@tonic-gate 	 * Calculate amount we should delay, initial
34177c478bd9Sstevel@tonic-gate 	 * delay will be short and then we will back off.
34187c478bd9Sstevel@tonic-gate 	 */
34197c478bd9Sstevel@tonic-gate 	if (rp->r_delay_interval == 0)
34207c478bd9Sstevel@tonic-gate 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
34217c478bd9Sstevel@tonic-gate 	else
34227c478bd9Sstevel@tonic-gate 		/* calculate next interval value */
34237c478bd9Sstevel@tonic-gate 		rp->r_delay_interval =
34247c478bd9Sstevel@tonic-gate 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
34257c478bd9Sstevel@tonic-gate 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
34267c478bd9Sstevel@tonic-gate 	mutex_exit(&rp->r_statelock);
34277c478bd9Sstevel@tonic-gate }
34287c478bd9Sstevel@tonic-gate 
34297c478bd9Sstevel@tonic-gate /*
34307c478bd9Sstevel@tonic-gate  * The caller is responsible for freeing the returned string.
34317c478bd9Sstevel@tonic-gate  */
34327c478bd9Sstevel@tonic-gate static char *
nfs4_getsrvnames(mntinfo4_t * mi,size_t * len)34337c478bd9Sstevel@tonic-gate nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
34347c478bd9Sstevel@tonic-gate {
34357c478bd9Sstevel@tonic-gate 	servinfo4_t *svp;
34367c478bd9Sstevel@tonic-gate 	char *srvnames;
34377c478bd9Sstevel@tonic-gate 	char *namep;
34387c478bd9Sstevel@tonic-gate 	size_t length;
34397c478bd9Sstevel@tonic-gate 
34407c478bd9Sstevel@tonic-gate 	/*
34417c478bd9Sstevel@tonic-gate 	 * Calculate the length of the string required to hold all
34427c478bd9Sstevel@tonic-gate 	 * of the server names plus either a comma or a null
34437c478bd9Sstevel@tonic-gate 	 * character following each individual one.
34447c478bd9Sstevel@tonic-gate 	 */
34457c478bd9Sstevel@tonic-gate 	length = 0;
34467c478bd9Sstevel@tonic-gate 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
34477c478bd9Sstevel@tonic-gate 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
34487c478bd9Sstevel@tonic-gate 		if (svp->sv_flags & SV4_NOTINUSE) {
34497c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&svp->sv_lock);
34507c478bd9Sstevel@tonic-gate 			continue;
34517c478bd9Sstevel@tonic-gate 		}
34527c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&svp->sv_lock);
34537c478bd9Sstevel@tonic-gate 		length += svp->sv_hostnamelen;
34547c478bd9Sstevel@tonic-gate 	}
34557c478bd9Sstevel@tonic-gate 
34567c478bd9Sstevel@tonic-gate 	srvnames = kmem_alloc(length, KM_SLEEP);
34577c478bd9Sstevel@tonic-gate 
34587c478bd9Sstevel@tonic-gate 	namep = srvnames;
34597c478bd9Sstevel@tonic-gate 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
34607c478bd9Sstevel@tonic-gate 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
34617c478bd9Sstevel@tonic-gate 		if (svp->sv_flags & SV4_NOTINUSE) {
34627c478bd9Sstevel@tonic-gate 			nfs_rw_exit(&svp->sv_lock);
34637c478bd9Sstevel@tonic-gate 			continue;
34647c478bd9Sstevel@tonic-gate 		}
34657c478bd9Sstevel@tonic-gate 		nfs_rw_exit(&svp->sv_lock);
34667c478bd9Sstevel@tonic-gate 		(void) strcpy(namep, svp->sv_hostname);
34677c478bd9Sstevel@tonic-gate 		namep += svp->sv_hostnamelen - 1;
34687c478bd9Sstevel@tonic-gate 		*namep++ = ',';
34697c478bd9Sstevel@tonic-gate 	}
34707c478bd9Sstevel@tonic-gate 	*--namep = '\0';
34717c478bd9Sstevel@tonic-gate 
34727c478bd9Sstevel@tonic-gate 	*len = length;
34737c478bd9Sstevel@tonic-gate 
34747c478bd9Sstevel@tonic-gate 	return (srvnames);
34757c478bd9Sstevel@tonic-gate }
34767c478bd9Sstevel@tonic-gate 
34777c478bd9Sstevel@tonic-gate static void
save_bseqid_rqst(nfs4_bseqid_entry_t * bsep,recov_info_t * recovp)34787c478bd9Sstevel@tonic-gate save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
34797c478bd9Sstevel@tonic-gate {
34807c478bd9Sstevel@tonic-gate 	nfs4_bseqid_entry_t *destp;
34817c478bd9Sstevel@tonic-gate 
34827c478bd9Sstevel@tonic-gate 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
34837c478bd9Sstevel@tonic-gate 	recovp->rc_bseqid_rqst = destp;
34847c478bd9Sstevel@tonic-gate 
34857c478bd9Sstevel@tonic-gate 	if (bsep->bs_oop)
34867c478bd9Sstevel@tonic-gate 		open_owner_hold(bsep->bs_oop);
34877c478bd9Sstevel@tonic-gate 	destp->bs_oop = bsep->bs_oop;
34887c478bd9Sstevel@tonic-gate 	if (bsep->bs_lop)
34897c478bd9Sstevel@tonic-gate 		lock_owner_hold(bsep->bs_lop);
34907c478bd9Sstevel@tonic-gate 	destp->bs_lop = bsep->bs_lop;
34917c478bd9Sstevel@tonic-gate 	if (bsep->bs_vp)
34927c478bd9Sstevel@tonic-gate 		VN_HOLD(bsep->bs_vp);
34937c478bd9Sstevel@tonic-gate 	destp->bs_vp = bsep->bs_vp;
34947c478bd9Sstevel@tonic-gate 	destp->bs_pid = bsep->bs_pid;
34957c478bd9Sstevel@tonic-gate 	destp->bs_tag = bsep->bs_tag;
34967c478bd9Sstevel@tonic-gate 	destp->bs_seqid = bsep->bs_seqid;
34977c478bd9Sstevel@tonic-gate }
34987c478bd9Sstevel@tonic-gate 
34997c478bd9Sstevel@tonic-gate static void
free_bseqid_rqst(nfs4_bseqid_entry_t * bsep)35007c478bd9Sstevel@tonic-gate free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
35017c478bd9Sstevel@tonic-gate {
35027c478bd9Sstevel@tonic-gate 	if (bsep->bs_oop)
35037c478bd9Sstevel@tonic-gate 		open_owner_rele(bsep->bs_oop);
35047c478bd9Sstevel@tonic-gate 	if (bsep->bs_lop)
35057c478bd9Sstevel@tonic-gate 		lock_owner_rele(bsep->bs_lop);
35067c478bd9Sstevel@tonic-gate 	if (bsep->bs_vp)
35077c478bd9Sstevel@tonic-gate 		VN_RELE(bsep->bs_vp);
35087c478bd9Sstevel@tonic-gate 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
35097c478bd9Sstevel@tonic-gate }
35107c478bd9Sstevel@tonic-gate 
35117c478bd9Sstevel@tonic-gate /*
35127c478bd9Sstevel@tonic-gate  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
35137c478bd9Sstevel@tonic-gate  * simply mark the open owner and open stream (if provided) as "bad".
35147c478bd9Sstevel@tonic-gate  * Then future uses of these data structures will be limited to basically
35157c478bd9Sstevel@tonic-gate  * just cleaning up the internal client state (no going OTW).
35167c478bd9Sstevel@tonic-gate  *
35177c478bd9Sstevel@tonic-gate  * The result of this is to return errors back to the app/usr when
35187c478bd9Sstevel@tonic-gate  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
35197c478bd9Sstevel@tonic-gate  * succeed so progress can be made.
35207c478bd9Sstevel@tonic-gate  */
35217c478bd9Sstevel@tonic-gate void
recov_bad_seqid(recov_info_t * recovp)35227c478bd9Sstevel@tonic-gate recov_bad_seqid(recov_info_t *recovp)
35237c478bd9Sstevel@tonic-gate {
35247c478bd9Sstevel@tonic-gate 	mntinfo4_t		*mi = recovp->rc_mi;
35257c478bd9Sstevel@tonic-gate 	nfs4_open_owner_t	*bad_oop;
35267c478bd9Sstevel@tonic-gate 	nfs4_lock_owner_t	*bad_lop;
35277c478bd9Sstevel@tonic-gate 	vnode_t			*vp;
35287c478bd9Sstevel@tonic-gate 	rnode4_t		*rp = NULL;
35297c478bd9Sstevel@tonic-gate 	pid_t			pid;
35307c478bd9Sstevel@tonic-gate 	nfs4_bseqid_entry_t	*bsep, *tbsep;
35317c478bd9Sstevel@tonic-gate 	int			error;
35327c478bd9Sstevel@tonic-gate 
35337c478bd9Sstevel@tonic-gate 	ASSERT(mi != NULL);
35347c478bd9Sstevel@tonic-gate 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
35357c478bd9Sstevel@tonic-gate 
35367c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
35377c478bd9Sstevel@tonic-gate 	bsep = list_head(&mi->mi_bseqid_list);
35387c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
35397c478bd9Sstevel@tonic-gate 
35407c478bd9Sstevel@tonic-gate 	/*
35417c478bd9Sstevel@tonic-gate 	 * Handle all the bad seqid entries on mi's list.
35427c478bd9Sstevel@tonic-gate 	 */
35437c478bd9Sstevel@tonic-gate 	while (bsep != NULL) {
35447c478bd9Sstevel@tonic-gate 		bad_oop = bsep->bs_oop;
35457c478bd9Sstevel@tonic-gate 		bad_lop = bsep->bs_lop;
35467c478bd9Sstevel@tonic-gate 		vp = bsep->bs_vp;
35477c478bd9Sstevel@tonic-gate 		pid = bsep->bs_pid;
35487c478bd9Sstevel@tonic-gate 
35497c478bd9Sstevel@tonic-gate 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
35507c478bd9Sstevel@tonic-gate 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
35517c478bd9Sstevel@tonic-gate 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
35527c478bd9Sstevel@tonic-gate 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
35537c478bd9Sstevel@tonic-gate 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
35547c478bd9Sstevel@tonic-gate 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
35557c478bd9Sstevel@tonic-gate 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
35567c478bd9Sstevel@tonic-gate 		    nfs4_ctags[TAG_NONE].ct_str));
35577c478bd9Sstevel@tonic-gate 
35587c478bd9Sstevel@tonic-gate 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
35597c478bd9Sstevel@tonic-gate 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
35607c478bd9Sstevel@tonic-gate 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
35617c478bd9Sstevel@tonic-gate 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
35627c478bd9Sstevel@tonic-gate 
35637c478bd9Sstevel@tonic-gate 		if (bad_oop) {
35647c478bd9Sstevel@tonic-gate 			/* essentially reset the open owner */
35657c478bd9Sstevel@tonic-gate 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
35667c478bd9Sstevel@tonic-gate 			ASSERT(!error);	/* recov thread always succeeds */
35677c478bd9Sstevel@tonic-gate 			bad_oop->oo_name = nfs4_get_new_oo_name();
35687c478bd9Sstevel@tonic-gate 			bad_oop->oo_seqid = 0;
35697c478bd9Sstevel@tonic-gate 			nfs4_end_open_seqid_sync(bad_oop);
35707c478bd9Sstevel@tonic-gate 		}
35717c478bd9Sstevel@tonic-gate 
35727c478bd9Sstevel@tonic-gate 		if (bad_lop) {
35737c478bd9Sstevel@tonic-gate 			mutex_enter(&bad_lop->lo_lock);
35747c478bd9Sstevel@tonic-gate 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
35757c478bd9Sstevel@tonic-gate 			mutex_exit(&bad_lop->lo_lock);
35767c478bd9Sstevel@tonic-gate 
35777c478bd9Sstevel@tonic-gate 			ASSERT(vp != NULL);
35787c478bd9Sstevel@tonic-gate 			rp = VTOR4(vp);
35797c478bd9Sstevel@tonic-gate 			mutex_enter(&rp->r_statelock);
35807c478bd9Sstevel@tonic-gate 			rp->r_flags |= R4LODANGLERS;
35817c478bd9Sstevel@tonic-gate 			mutex_exit(&rp->r_statelock);
35827c478bd9Sstevel@tonic-gate 
35837c478bd9Sstevel@tonic-gate 			nfs4_send_siglost(pid, mi, vp, TRUE,
35847c478bd9Sstevel@tonic-gate 			    0, NFS4ERR_BAD_SEQID);
35857c478bd9Sstevel@tonic-gate 		}
35867c478bd9Sstevel@tonic-gate 
35877c478bd9Sstevel@tonic-gate 		mutex_enter(&mi->mi_lock);
35887c478bd9Sstevel@tonic-gate 		list_remove(&mi->mi_bseqid_list, bsep);
35897c478bd9Sstevel@tonic-gate 		tbsep = bsep;
35907c478bd9Sstevel@tonic-gate 		bsep = list_head(&mi->mi_bseqid_list);
35917c478bd9Sstevel@tonic-gate 		mutex_exit(&mi->mi_lock);
35927c478bd9Sstevel@tonic-gate 		free_bseqid_rqst(tbsep);
35937c478bd9Sstevel@tonic-gate 	}
35947c478bd9Sstevel@tonic-gate 
35957c478bd9Sstevel@tonic-gate 	mutex_enter(&mi->mi_lock);
35967c478bd9Sstevel@tonic-gate 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
35977c478bd9Sstevel@tonic-gate 	mutex_exit(&mi->mi_lock);
35987c478bd9Sstevel@tonic-gate }
3599