1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 */
28
29/*
30 *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
31 *	All Rights Reserved
32 */
33
34#include <sys/param.h>
35#include <sys/types.h>
36#include <sys/systm.h>
37#include <sys/cmn_err.h>
38#include <sys/vtrace.h>
39#include <sys/session.h>
40#include <sys/thread.h>
41#include <sys/dnlc.h>
42#include <sys/cred.h>
43#include <sys/priv.h>
44#include <sys/list.h>
45#include <sys/sdt.h>
46#include <sys/policy.h>
47
48#include <rpc/types.h>
49#include <rpc/xdr.h>
50
51#include <nfs/nfs.h>
52
53#include <nfs/nfs_clnt.h>
54
55#include <nfs/nfs4.h>
56#include <nfs/rnode4.h>
57#include <nfs/nfs4_clnt.h>
58
/*
 * client side statistics
 *
 * Read-only table of named kstat entries for the NFSv4 client.  Every
 * entry is a 64-bit unsigned counter.  The entries inside the DEBUG
 * section exist only in DEBUG builds.
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "referrals",	KSTAT_DATA_UINT64 },
	{ "referlinks",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
76
#ifdef DEBUG
/*
 * Additional named 64-bit counters that are compiled in only for DEBUG
 * builds.  Within this file, "clreclaim" is incremented each time
 * clreclaim4_zone() runs.
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif
91
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.  nfs4_clnt_list_lock is held while the list is
 * walked (see clreclaim4()).
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
/* NOTE(review): presumably the zone key for the per-zone data -- confirm */
zone_key_t nfs4clnt_zone_key;

/* kmem cache from which struct chtab client handle entries are allocated */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/* DEBUG-only tunables; nfs4_utf8_debug enables the warning in utf8_to_fn() */
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
int nfs4_utf8_debug = 0;
#endif
107
/*
 * NFSv4 readdir cache implementation
 *
 * rddir4_cache_impl wraps a readdir cache element together with its
 * bookkeeping: a reference count, the lock guarding that count, and the
 * link used to place the entry in an AVL tree.
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;
117
/* Forward declarations for helpers private to this file */
static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

/*
 * Readdir cache counters.
 * NOTE(review): presumably bumped by the readdir cache lookup code, which
 * is not in this part of the file -- confirm.
 */
static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;
127
128/*
129 * Shared nfs4 functions
130 */
131
132/*
133 * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
134 * be allocated.
135 */
136
137void
138nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
139{
140	to->nfs_fh4_len = from->nfs_fh4_len;
141	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
142}
143
144/*
145 * nfs4cmpfh - compare 2 filehandles.
146 * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
147 * "less" than the second, +1 if the first is "greater" than the second.
148 */
149
150int
151nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
152{
153	const char *c1, *c2;
154
155	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
156		return (-1);
157	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
158		return (1);
159	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
160	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
161	    c1++, c2++) {
162		if (*c1 < *c2)
163			return (-1);
164		if (*c1 > *c2)
165			return (1);
166	}
167
168	return (0);
169}
170
171/*
172 * Compare two v4 filehandles.  Return zero if they're the same, non-zero
173 * if they're not.  Like nfs4cmpfh(), but different filehandle
174 * representation, and doesn't provide information about greater than or
175 * less than.
176 */
177
178int
179nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
180{
181	if (fh1->fh_len == fh2->fh_len)
182		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
183
184	return (1);
185}
186
187int
188stateid4_cmp(stateid4 *s1, stateid4 *s2)
189{
190	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
191		return (1);
192	else
193		return (0);
194}
195
196nfsstat4
197puterrno4(int error)
198{
199	switch (error) {
200	case 0:
201		return (NFS4_OK);
202	case EPERM:
203		return (NFS4ERR_PERM);
204	case ENOENT:
205		return (NFS4ERR_NOENT);
206	case EINTR:
207		return (NFS4ERR_IO);
208	case EIO:
209		return (NFS4ERR_IO);
210	case ENXIO:
211		return (NFS4ERR_NXIO);
212	case ENOMEM:
213		return (NFS4ERR_RESOURCE);
214	case EACCES:
215		return (NFS4ERR_ACCESS);
216	case EBUSY:
217		return (NFS4ERR_IO);
218	case EEXIST:
219		return (NFS4ERR_EXIST);
220	case EXDEV:
221		return (NFS4ERR_XDEV);
222	case ENODEV:
223		return (NFS4ERR_IO);
224	case ENOTDIR:
225		return (NFS4ERR_NOTDIR);
226	case EISDIR:
227		return (NFS4ERR_ISDIR);
228	case EINVAL:
229		return (NFS4ERR_INVAL);
230	case EMFILE:
231		return (NFS4ERR_RESOURCE);
232	case EFBIG:
233		return (NFS4ERR_FBIG);
234	case ENOSPC:
235		return (NFS4ERR_NOSPC);
236	case EROFS:
237		return (NFS4ERR_ROFS);
238	case EMLINK:
239		return (NFS4ERR_MLINK);
240	case EDEADLK:
241		return (NFS4ERR_DEADLOCK);
242	case ENOLCK:
243		return (NFS4ERR_DENIED);
244	case EREMOTE:
245		return (NFS4ERR_SERVERFAULT);
246	case ENOTSUP:
247		return (NFS4ERR_NOTSUPP);
248	case EDQUOT:
249		return (NFS4ERR_DQUOT);
250	case ENAMETOOLONG:
251		return (NFS4ERR_NAMETOOLONG);
252	case EOVERFLOW:
253		return (NFS4ERR_INVAL);
254	case ENOSYS:
255		return (NFS4ERR_NOTSUPP);
256	case ENOTEMPTY:
257		return (NFS4ERR_NOTEMPTY);
258	case EOPNOTSUPP:
259		return (NFS4ERR_NOTSUPP);
260	case ESTALE:
261		return (NFS4ERR_STALE);
262	case EAGAIN:
263		if (curthread->t_flag & T_WOULDBLOCK) {
264			curthread->t_flag &= ~T_WOULDBLOCK;
265			return (NFS4ERR_DELAY);
266		}
267		return (NFS4ERR_LOCKED);
268	default:
269		return ((enum nfsstat4)error);
270	}
271}
272
273int
274geterrno4(enum nfsstat4 status)
275{
276	switch (status) {
277	case NFS4_OK:
278		return (0);
279	case NFS4ERR_PERM:
280		return (EPERM);
281	case NFS4ERR_NOENT:
282		return (ENOENT);
283	case NFS4ERR_IO:
284		return (EIO);
285	case NFS4ERR_NXIO:
286		return (ENXIO);
287	case NFS4ERR_ACCESS:
288		return (EACCES);
289	case NFS4ERR_EXIST:
290		return (EEXIST);
291	case NFS4ERR_XDEV:
292		return (EXDEV);
293	case NFS4ERR_NOTDIR:
294		return (ENOTDIR);
295	case NFS4ERR_ISDIR:
296		return (EISDIR);
297	case NFS4ERR_INVAL:
298		return (EINVAL);
299	case NFS4ERR_FBIG:
300		return (EFBIG);
301	case NFS4ERR_NOSPC:
302		return (ENOSPC);
303	case NFS4ERR_ROFS:
304		return (EROFS);
305	case NFS4ERR_MLINK:
306		return (EMLINK);
307	case NFS4ERR_NAMETOOLONG:
308		return (ENAMETOOLONG);
309	case NFS4ERR_NOTEMPTY:
310		return (ENOTEMPTY);
311	case NFS4ERR_DQUOT:
312		return (EDQUOT);
313	case NFS4ERR_STALE:
314		return (ESTALE);
315	case NFS4ERR_BADHANDLE:
316		return (ESTALE);
317	case NFS4ERR_BAD_COOKIE:
318		return (EINVAL);
319	case NFS4ERR_NOTSUPP:
320		return (EOPNOTSUPP);
321	case NFS4ERR_TOOSMALL:
322		return (EINVAL);
323	case NFS4ERR_SERVERFAULT:
324		return (EIO);
325	case NFS4ERR_BADTYPE:
326		return (EINVAL);
327	case NFS4ERR_DELAY:
328		return (ENXIO);
329	case NFS4ERR_SAME:
330		return (EPROTO);
331	case NFS4ERR_DENIED:
332		return (ENOLCK);
333	case NFS4ERR_EXPIRED:
334		return (EPROTO);
335	case NFS4ERR_LOCKED:
336		return (EACCES);
337	case NFS4ERR_GRACE:
338		return (EAGAIN);
339	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
340		return (ESTALE);
341	case NFS4ERR_SHARE_DENIED:
342		return (EACCES);
343	case NFS4ERR_WRONGSEC:
344		return (EPERM);
345	case NFS4ERR_CLID_INUSE:
346		return (EAGAIN);
347	case NFS4ERR_RESOURCE:
348		return (EAGAIN);
349	case NFS4ERR_MOVED:
350		return (EPROTO);
351	case NFS4ERR_NOFILEHANDLE:
352		return (EIO);
353	case NFS4ERR_MINOR_VERS_MISMATCH:
354		return (ENOTSUP);
355	case NFS4ERR_STALE_CLIENTID:
356		return (EIO);
357	case NFS4ERR_STALE_STATEID:
358		return (EIO);
359	case NFS4ERR_OLD_STATEID:
360		return (EIO);
361	case NFS4ERR_BAD_STATEID:
362		return (EIO);
363	case NFS4ERR_BAD_SEQID:
364		return (EIO);
365	case NFS4ERR_NOT_SAME:
366		return (EPROTO);
367	case NFS4ERR_LOCK_RANGE:
368		return (EPROTO);
369	case NFS4ERR_SYMLINK:
370		return (EPROTO);
371	case NFS4ERR_RESTOREFH:
372		return (EPROTO);
373	case NFS4ERR_LEASE_MOVED:
374		return (EPROTO);
375	case NFS4ERR_ATTRNOTSUPP:
376		return (ENOTSUP);
377	case NFS4ERR_NO_GRACE:
378		return (EPROTO);
379	case NFS4ERR_RECLAIM_BAD:
380		return (EPROTO);
381	case NFS4ERR_RECLAIM_CONFLICT:
382		return (EPROTO);
383	case NFS4ERR_BADXDR:
384		return (EINVAL);
385	case NFS4ERR_LOCKS_HELD:
386		return (EIO);
387	case NFS4ERR_OPENMODE:
388		return (EACCES);
389	case NFS4ERR_BADOWNER:
390		/*
391		 * Client and server are in different DNS domains
392		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
393		 * doesn't match.  No good answer here.  Return
394		 * EACCESS, which translates to "permission denied".
395		 */
396		return (EACCES);
397	case NFS4ERR_BADCHAR:
398		return (EINVAL);
399	case NFS4ERR_BADNAME:
400		return (EINVAL);
401	case NFS4ERR_BAD_RANGE:
402		return (EIO);
403	case NFS4ERR_LOCK_NOTSUPP:
404		return (ENOTSUP);
405	case NFS4ERR_OP_ILLEGAL:
406		return (EINVAL);
407	case NFS4ERR_DEADLOCK:
408		return (EDEADLK);
409	case NFS4ERR_FILE_OPEN:
410		return (EACCES);
411	case NFS4ERR_ADMIN_REVOKED:
412		return (EPROTO);
413	case NFS4ERR_CB_PATH_DOWN:
414		return (EPROTO);
415	default:
416#ifdef DEBUG
417		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
418		    status);
419#endif
420		return ((int)status);
421	}
422}
423
/*
 * nfs4_log_badowner - record that an operation failed with
 * NFS4ERR_BADOWNER on this mount.
 *
 * Two things happen, each at most once:
 *  - per server (N4S_BADOWNER_DEBUG in s_flags): a warning about the
 *    NFSMAPID_DOMAIN mismatch is logged for the mount's zone;
 *  - per mount (MI4_BADOWNER_DEBUG in mi_flags): an RF_BADOWNER fact
 *    carrying "op" is queued on the mount's message queue.
 *
 * The function returns silently if the recovery lock cannot be taken
 * (nfs_rw_enter_sig() returned non-zero) or if no nfs4_server_t is found
 * for the mount.
 */
void
nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
{
	nfs4_server_t *server;

	/*
	 * Return if already printed/queued a msg
	 * for this mount point.  This is an unlocked fast-path check; the
	 * flag is tested again under mi_lock before it is set below.
	 */
	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
		return;
	/*
	 * Happens once per client <-> server pair.
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT))
		return;

	/*
	 * NOTE(review): find_nfs4_server() presumably returns the server
	 * held and with s_lock owned, since both are released below --
	 * confirm against its definition.
	 */
	server = find_nfs4_server(mi);
	if (server == NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		return;
	}

	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "!NFSMAPID_DOMAIN does not match"
		    " the server: %s domain.\n"
		    "Please check configuration",
		    mi->mi_curr_serv->sv_hostname);
		server->s_flags |= N4S_BADOWNER_DEBUG;
	}
	mutex_exit(&server->s_lock);
	nfs4_server_rele(server);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Happens once per mntinfo4_t.
	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
	 * queue this in the mesg queue for this mount_info. This message
	 * is not printed, meaning it's absent from id_to_dump_solo_fact()
	 * but it's there for inspection if the queue is ever dumped/inspected.
	 */
	mutex_enter(&mi->mi_lock);
	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
		    FALSE, NULL, 0, NULL);
		mi->mi_flags |= MI4_BADOWNER_DEBUG;
	}
	mutex_exit(&mi->mi_lock);
}
475
476int
477nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
478{
479	int64_t sec;
480	int32_t nsec;
481
482	/*
483	 * Here check that the nfsv4 time is valid for the system.
484	 * nfsv4 time value is a signed 64-bit, and the system time
485	 * may be either int64_t or int32_t (depends on the kernel),
486	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
487	 */
488#ifndef _LP64
489	if (! NFS4_TIME_OK(ntime->seconds)) {
490		return (EOVERFLOW);
491	}
492#endif
493
494	/* Invalid to specify 1 billion (or more) nsecs */
495	if (ntime->nseconds >= 1000000000)
496		return (EINVAL);
497
498	if (ntime->seconds < 0) {
499		sec = ntime->seconds + 1;
500		nsec = -1000000000 + ntime->nseconds;
501	} else {
502		sec = ntime->seconds;
503		nsec = ntime->nseconds;
504	}
505
506	vatime->tv_sec = sec;
507	vatime->tv_nsec = nsec;
508
509	return (0);
510}
511
512int
513nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
514{
515	int64_t sec;
516	uint32_t nsec;
517
518	/*
519	 * nfsv4 time value is a signed 64-bit, and the system time
520	 * may be either int64_t or int32_t (depends on the kernel),
521	 * so all system time values will fit.
522	 */
523	if (vatime->tv_nsec >= 0) {
524		sec = vatime->tv_sec;
525		nsec = vatime->tv_nsec;
526	} else {
527		sec = vatime->tv_sec - 1;
528		nsec = 1000000000 + vatime->tv_nsec;
529	}
530	ntime->seconds = sec;
531	ntime->nseconds = nsec;
532
533	return (0);
534}
535
536/*
537 * Converts a utf8 string to a valid null terminated filename string.
538 *
539 * XXX - Not actually translating the UTF-8 string as per RFC 2279.
540 *	 For now, just validate that the UTF-8 string off the wire
541 *	 does not have characters that will freak out UFS, and leave
542 *	 it at that.
543 */
544char *
545utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
546{
547	ASSERT(lenp != NULL);
548
549	if (u8s == NULL || u8s->utf8string_len <= 0 ||
550	    u8s->utf8string_val == NULL)
551		return (NULL);
552
553	/*
554	 * Check for obvious illegal filename chars
555	 */
556	if (utf8_strchr(u8s, '/') != NULL) {
557#ifdef DEBUG
558		if (nfs4_utf8_debug) {
559			char *path;
560			int len = u8s->utf8string_len;
561
562			path = kmem_alloc(len + 1, KM_SLEEP);
563			bcopy(u8s->utf8string_val, path, len);
564			path[len] = '\0';
565
566			zcmn_err(getzoneid(), CE_WARN,
567			    "Invalid UTF-8 filename: %s", path);
568
569			kmem_free(path, len + 1);
570		}
571#endif
572		return (NULL);
573	}
574
575	return (utf8_to_str(u8s, lenp, s));
576}
577
578/*
579 * Converts a utf8 string to a C string.
580 * kmem_allocs a new string if not supplied
581 */
582char *
583utf8_to_str(utf8string *str, uint_t *lenp, char *s)
584{
585	char	*sp;
586	char	*u8p;
587	int	len;
588	int	 i;
589
590	ASSERT(lenp != NULL);
591
592	if (str == NULL)
593		return (NULL);
594
595	u8p = str->utf8string_val;
596	len = str->utf8string_len;
597	if (len <= 0 || u8p == NULL) {
598		if (s)
599			*s = '\0';
600		return (NULL);
601	}
602
603	sp = s;
604	if (sp == NULL)
605		sp = kmem_alloc(len + 1, KM_SLEEP);
606
607	/*
608	 * At least check for embedded nulls
609	 */
610	for (i = 0; i < len; i++) {
611		sp[i] = u8p[i];
612		if (u8p[i] == '\0') {
613#ifdef	DEBUG
614			zcmn_err(getzoneid(), CE_WARN,
615			    "Embedded NULL in UTF-8 string");
616#endif
617			if (s == NULL)
618				kmem_free(sp, len + 1);
619			return (NULL);
620		}
621	}
622	sp[len] = '\0';
623	*lenp = len + 1;
624
625	return (sp);
626}
627
628/*
629 * str_to_utf8 - converts a null-terminated C string to a utf8 string
630 */
631utf8string *
632str_to_utf8(char *nm, utf8string *str)
633{
634	int len;
635
636	if (str == NULL)
637		return (NULL);
638
639	if (nm == NULL || *nm == '\0') {
640		str->utf8string_len = 0;
641		str->utf8string_val = NULL;
642	}
643
644	len = strlen(nm);
645
646	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
647	str->utf8string_len = len;
648	bcopy(nm, str->utf8string_val, len);
649
650	return (str);
651}
652
653utf8string *
654utf8_copy(utf8string *src, utf8string *dest)
655{
656	if (src == NULL)
657		return (NULL);
658	if (dest == NULL)
659		return (NULL);
660
661	if (src->utf8string_len > 0) {
662		dest->utf8string_val = kmem_alloc(src->utf8string_len,
663		    KM_SLEEP);
664		bcopy(src->utf8string_val, dest->utf8string_val,
665		    src->utf8string_len);
666		dest->utf8string_len = src->utf8string_len;
667	} else {
668		dest->utf8string_val = NULL;
669		dest->utf8string_len = 0;
670	}
671
672	return (dest);
673}
674
675int
676utf8_compare(const utf8string *a, const utf8string *b)
677{
678	int mlen, cmp;
679	int alen, blen;
680	char *aval, *bval;
681
682	if ((a == NULL) && (b == NULL))
683		return (0);
684	else if (a == NULL)
685		return (-1);
686	else if (b == NULL)
687		return (1);
688
689	alen = a->utf8string_len;
690	blen = b->utf8string_len;
691	aval = a->utf8string_val;
692	bval = b->utf8string_val;
693
694	if (((alen == 0) || (aval == NULL)) &&
695	    ((blen == 0) || (bval == NULL)))
696		return (0);
697	else if ((alen == 0) || (aval == NULL))
698		return (-1);
699	else if ((blen == 0) || (bval == NULL))
700		return (1);
701
702	mlen = MIN(alen, blen);
703	cmp = strncmp(aval, bval, mlen);
704
705	if ((cmp == 0) && (alen == blen))
706		return (0);
707	else if ((cmp == 0) && (alen < blen))
708		return (-1);
709	else if (cmp == 0)
710		return (1);
711	else if (cmp < 0)
712		return (-1);
713	return (1);
714}
715
716/*
717 * utf8_dir_verify - checks that the utf8 string is valid
718 */
719nfsstat4
720utf8_dir_verify(utf8string *str)
721{
722	char *nm;
723	int len;
724
725	if (str == NULL)
726		return (NFS4ERR_INVAL);
727
728	nm = str->utf8string_val;
729	len = str->utf8string_len;
730	if (nm == NULL || len == 0) {
731		return (NFS4ERR_INVAL);
732	}
733
734	if (len == 1 && nm[0] == '.')
735		return (NFS4ERR_BADNAME);
736	if (len == 2 && nm[0] == '.' && nm[1] == '.')
737		return (NFS4ERR_BADNAME);
738
739	if (utf8_strchr(str, '/') != NULL)
740		return (NFS4ERR_BADNAME);
741
742	if (utf8_strchr(str, '\0') != NULL)
743		return (NFS4ERR_BADNAME);
744
745	return (NFS4_OK);
746}
747
/*
 * Interfaces provided by the rpcsec module (common/rpcsec).  Within this
 * file, sec_clnt_geth() is used by authget() to obtain an auth handle and
 * sec_clnt_freeh() is used by clfree4() to release one.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);
754
755/*
756 * authget() gets an auth handle based on the security
757 * information from the servinfo in mountinfo.
758 * The auth handle is stored in ch_client->cl_auth.
759 *
760 * First security flavor of choice is to use sv_secdata
761 * which is initiated by the client. If that fails, get
762 * secinfo from the server and then select one from the
763 * server secinfo list .
764 *
765 * For RPCSEC_GSS flavor, upon success, a secure context is
766 * established between client and server.
767 */
768int
769authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
770{
771	int error, i;
772
773	/*
774	 * SV4_TRYSECINFO indicates to try the secinfo list from
775	 * sv_secinfo until a successful one is reached. Point
776	 * sv_currsec to the selected security mechanism for
777	 * later sessions.
778	 */
779	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
780	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
781		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
782		    i++) {
783			if (!(error = sec_clnt_geth(ch_client,
784			    &svp->sv_secinfo->sdata[i],
785			    cr, &ch_client->cl_auth))) {
786
787				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
788				svp->sv_secinfo->index = i;
789				/* done */
790				svp->sv_flags &= ~SV4_TRYSECINFO;
791				break;
792			}
793
794			/*
795			 * Allow the caller retry with the security flavor
796			 * pointed by svp->sv_secinfo->index when
797			 * ETIMEDOUT/ECONNRESET occurs.
798			 */
799			if (error == ETIMEDOUT || error == ECONNRESET) {
800				svp->sv_secinfo->index = i;
801				break;
802			}
803		}
804	} else {
805		/* sv_currsec points to one of the entries in sv_secinfo */
806		if (svp->sv_currsec) {
807			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
808			    &ch_client->cl_auth);
809		} else {
810			/* If it's null, use sv_secdata. */
811			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
812			    &ch_client->cl_auth);
813		}
814	}
815	nfs_rw_exit(&svp->sv_lock);
816
817	return (error);
818}
819
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * Hands out an RPC client handle for the server described by "svp".
 * The per-zone table nfscl->nfscl_chtable4 is a list of chhead entries,
 * one per (program, version, transport device, protocol family)
 * quadruple; each chhead carries a list (ch_list) of free chtab entries.
 * A free entry is reused when one exists, otherwise a new chtab is
 * allocated from chtab4_cache and a new handle created for it.  In both
 * cases an auth handle is obtained through authget().
 *
 * On success 0 is returned, *newcl is the client handle and *chp the
 * chtab entry that owns it (to be given back with clfree4()).
 *
 * On failure *newcl and *chp are NULL and the return value is:
 *	EINVAL	newcl, chp or ci is NULL
 *	EINTR	authget() returned 0 but left cl_auth NULL
 *	other	the error from clnt_tli_kcreate() or authget()
 *
 * svp and nfscl are dereferenced without being checked for NULL.
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	/* plain (non-atomic) increment, done without the table lock */
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	/* plistp trails ch: it is the link that points at the current ch */
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			/* the lock was dropped, so the search must be redone */
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		/*
		 * A preallocated chhead is still around only if a matching
		 * entry showed up while the lock was dropped; discard it.
		 */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			/*
			 * NOTE(review): unlike the failure paths below,
			 * the DEBUG clalloc counter is not decremented
			 * here although the handle is destroyed --
			 * confirm whether that is intended.
			 */
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	/* plain (non-atomic) increment, done after dropping the lock */
	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	/* handle creation is interruptible only if MI4_INT is set in cl_flags */
	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/* drop the auth handle that came with the new client handle */
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}
990
991static int
992nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
993    struct chtab **chp, struct nfs4_clnt *nfscl)
994{
995	clinfo_t ci;
996	bool_t is_recov;
997	int firstcall, error = 0;
998
999	/*
1000	 * Set read buffer size to rsize
1001	 * and add room for RPC headers.
1002	 */
1003	ci.cl_readsize = mi->mi_tsize;
1004	if (ci.cl_readsize != 0)
1005		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
1006
1007	/*
1008	 * If soft mount and server is down just try once.
1009	 * meaning: do not retransmit.
1010	 */
1011	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
1012		ci.cl_retrans = 0;
1013	else
1014		ci.cl_retrans = mi->mi_retrans;
1015
1016	ci.cl_prog = mi->mi_prog;
1017	ci.cl_vers = mi->mi_vers;
1018	ci.cl_flags = mi->mi_flags;
1019
1020	/*
1021	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1022	 * security flavor, the client tries to establish a security context
1023	 * by contacting the server. If the connection is timed out or reset,
1024	 * e.g. server reboot, we will try again.
1025	 */
1026	is_recov = (curthread == mi->mi_recovthread);
1027	firstcall = 1;
1028
1029	do {
1030		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
1031
1032		if (error == 0)
1033			break;
1034
1035		/*
1036		 * For forced unmount and zone shutdown, bail out but
1037		 * let the recovery thread do one more transmission.
1038		 */
1039		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
1040		    (!is_recov || !firstcall)) {
1041			error = EIO;
1042			break;
1043		}
1044
1045		/* do not retry for soft mount */
1046		if (!(mi->mi_flags & MI4_HARD))
1047			break;
1048
1049		/* let the caller deal with the failover case */
1050		if (FAILOVER_MOUNT4(mi))
1051			break;
1052
1053		firstcall = 0;
1054
1055	} while (error == ETIMEDOUT || error == ECONNRESET);
1056
1057	return (error);
1058}
1059
1060void
1061clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
1062{
1063	if (cl->cl_auth != NULL) {
1064		sec_clnt_freeh(cl->cl_auth);
1065		cl->cl_auth = NULL;
1066	}
1067
1068	/*
1069	 * Timestamp this cache entry so that we know when it was last
1070	 * used.
1071	 */
1072	cp->ch_freed = gethrestime_sec();
1073
1074	/*
1075	 * Add the free client handle to the front of the list.
1076	 * This way, the list will be sorted in youngest to oldest
1077	 * order.
1078	 */
1079	mutex_enter(&nfscl->nfscl_chtable4_lock);
1080	cp->ch_list = cp->ch_head->ch_list;
1081	cp->ch_head->ch_list = cp;
1082	mutex_exit(&nfscl->nfscl_chtable4_lock);
1083}
1084
#define	CL_HOLDTIME	60	/* time to hold client handles */

/*
 * clreclaim4_zone - free cached client handles that have sat unused.
 *
 * Every free chtab entry in nfscl's handle cache whose ch_freed stamp
 * is at least cl_holdtime seconds old is unlinked (with
 * nfscl_chtable4_lock held) and then, after the lock has been dropped,
 * its client handle is destroyed and the entry is returned to
 * chtab4_cache.  DEBUG builds also bump the clreclaim counter and
 * subtract the number of freed entries from clalloc.
 */
static void
clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;	/* walks to the end of a detached tail */
	struct chtab *cpl;	/* current entry of the list being scanned */
	struct chtab **cpp;	/* link that points at cpl */
#ifdef DEBUG
	int n = 0;		/* number of entries freed */
	clstat4_debug.clreclaim.value.ui64++;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable4_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			/* cut the old tail off this quadruple's free list */
			*cpp = NULL;
			if (cp != NULL) {
				/* hang what was collected so far behind it */
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable4_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		/* fetch the next pointer before cp is freed */
		cpl = cp->ch_list;
		kmem_cache_free(chtab4_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}
1176
1177/* ARGSUSED */
1178static void
1179clreclaim4(void *all)
1180{
1181	struct nfs4_clnt *nfscl;
1182
1183	/*
1184	 * The system is low on memory; go through and try to reclaim some from
1185	 * every zone on the system.
1186	 */
1187	mutex_enter(&nfs4_clnt_list_lock);
1188	nfscl = list_head(&nfs4_clnt_list);
1189	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1190		clreclaim4_zone(nfscl, CL_HOLDTIME);
1191	mutex_exit(&nfs4_clnt_list_lock);
1192}
1193
/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};
1201
/* A tenth of NFS_COTS_TIMEO */
#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout.  MAXTIMO is 20 * hz, i.e. it is in
 * the same units as a timeout kept "in units of hz".
 *
 * backoff(tim) doubles a timeout that is still below MAXTIMO, capping the
 * result at MAXTIMO; a timeout already at or above MAXTIMO is returned
 * unchanged.  Both macros evaluate their argument more than once, so it
 * must not have side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
1210
1211static int
1212nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1213    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1214    enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1215{
1216	CLIENT *client;
1217	struct chtab *ch;
1218	cred_t *cr = icr;
1219	struct rpc_err rpcerr, rpcerr_tmp;
1220	enum clnt_stat status;
1221	int error;
1222	struct timeval wait;
1223	int timeo;		/* in units of hz */
1224	bool_t tryagain, is_recov;
1225	bool_t cred_cloned = FALSE;
1226	k_sigset_t smask;
1227	servinfo4_t *svp;
1228#ifdef DEBUG
1229	char *bufp;
1230#endif
1231	int firstcall;
1232
1233	rpcerr.re_status = RPC_SUCCESS;
1234
1235	/*
1236	 * If we know that we are rebooting then let's
1237	 * not bother with doing any over the wireness.
1238	 */
1239	mutex_enter(&mi->mi_lock);
1240	if (mi->mi_flags & MI4_SHUTDOWN) {
1241		mutex_exit(&mi->mi_lock);
1242		return (EIO);
1243	}
1244	mutex_exit(&mi->mi_lock);
1245
1246	/* For TSOL, use a new cred which has net_mac_aware flag */
1247	if (!cred_cloned && is_system_labeled()) {
1248		cred_cloned = TRUE;
1249		cr = crdup(icr);
1250		(void) setpflags(NET_MAC_AWARE, 1, cr);
1251	}
1252
1253	/*
1254	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1255	 * are guaranteed to reprocess the retry as a new request.
1256	 */
1257	svp = mi->mi_curr_serv;
1258	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1259	if (rpcerr.re_errno != 0)
1260		return (rpcerr.re_errno);
1261
1262	timeo = (mi->mi_timeo * hz) / 10;
1263
1264	/*
1265	 * If hard mounted fs, retry call forever unless hard error
1266	 * occurs.
1267	 *
1268	 * For forced unmount, let the recovery thread through but return
1269	 * an error for all others.  This is so that user processes can
1270	 * exit quickly.  The recovery thread bails out after one
1271	 * transmission so that it can tell if it needs to continue.
1272	 *
1273	 * For zone shutdown, behave as above to encourage quick
1274	 * process exit, but also fail quickly when servers have
1275	 * timed out before and reduce the timeouts.
1276	 */
1277	is_recov = (curthread == mi->mi_recovthread);
1278	firstcall = 1;
1279	do {
1280		tryagain = FALSE;
1281
1282		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1283		    "nfs4_rfscall: vfs_flag=0x%x, %s",
1284		    mi->mi_vfsp->vfs_flag,
1285		    is_recov ? "recov thread" : "not recov thread"));
1286
1287		/*
1288		 * It's possible while we're retrying the admin
1289		 * decided to reboot.
1290		 */
1291		mutex_enter(&mi->mi_lock);
1292		if (mi->mi_flags & MI4_SHUTDOWN) {
1293			mutex_exit(&mi->mi_lock);
1294			clfree4(client, ch, nfscl);
1295			if (cred_cloned)
1296				crfree(cr);
1297			return (EIO);
1298		}
1299		mutex_exit(&mi->mi_lock);
1300
1301		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1302		    (!is_recov || !firstcall)) {
1303			clfree4(client, ch, nfscl);
1304			if (cred_cloned)
1305				crfree(cr);
1306			return (EIO);
1307		}
1308
1309		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1310			mutex_enter(&mi->mi_lock);
1311			if ((mi->mi_flags & MI4_TIMEDOUT) ||
1312			    !is_recov || !firstcall) {
1313				mutex_exit(&mi->mi_lock);
1314				clfree4(client, ch, nfscl);
1315				if (cred_cloned)
1316					crfree(cr);
1317				return (EIO);
1318			}
1319			mutex_exit(&mi->mi_lock);
1320			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1321		}
1322
1323		firstcall = 0;
1324		TICK_TO_TIMEVAL(timeo, &wait);
1325
1326		/*
1327		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1328		 * and SIGTERM. (Preserving the existing masks).
1329		 * Mask out SIGINT if mount option nointr is specified.
1330		 */
1331		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1332		if (!(mi->mi_flags & MI4_INT))
1333			client->cl_nosignal = TRUE;
1334
1335		/*
1336		 * If there is a current signal, then don't bother
1337		 * even trying to send out the request because we
1338		 * won't be able to block waiting for the response.
1339		 * Simply assume RPC_INTR and get on with it.
1340		 */
1341		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1342			status = RPC_INTR;
1343		else {
1344			status = CLNT_CALL(client, which, xdrargs, argsp,
1345			    xdrres, resp, wait);
1346		}
1347
1348		if (!(mi->mi_flags & MI4_INT))
1349			client->cl_nosignal = FALSE;
1350		/*
1351		 * restore original signal mask
1352		 */
1353		sigunintr(&smask);
1354
1355		switch (status) {
1356		case RPC_SUCCESS:
1357			break;
1358
1359		case RPC_INTR:
1360			/*
1361			 * There is no way to recover from this error,
1362			 * even if mount option nointr is specified.
1363			 * SIGKILL, for example, cannot be blocked.
1364			 */
1365			rpcerr.re_status = RPC_INTR;
1366			rpcerr.re_errno = EINTR;
1367			break;
1368
1369		case RPC_UDERROR:
1370			/*
1371			 * If the NFS server is local (vold) and
1372			 * it goes away then we get RPC_UDERROR.
1373			 * This is a retryable error, so we would
1374			 * loop, so check to see if the specific
1375			 * error was ECONNRESET, indicating that
1376			 * target did not exist at all.  If so,
1377			 * return with RPC_PROGUNAVAIL and
1378			 * ECONNRESET to indicate why.
1379			 */
1380			CLNT_GETERR(client, &rpcerr);
1381			if (rpcerr.re_errno == ECONNRESET) {
1382				rpcerr.re_status = RPC_PROGUNAVAIL;
1383				rpcerr.re_errno = ECONNRESET;
1384				break;
1385			}
1386			/*FALLTHROUGH*/
1387
1388		default:		/* probably RPC_TIMEDOUT */
1389
1390			if (IS_UNRECOVERABLE_RPC(status))
1391				break;
1392
1393			/*
1394			 * increment server not responding count
1395			 */
1396			mutex_enter(&mi->mi_lock);
1397			mi->mi_noresponse++;
1398			mutex_exit(&mi->mi_lock);
1399#ifdef DEBUG
1400			nfscl->nfscl_stat.noresponse.value.ui64++;
1401#endif
1402			/*
1403			 * On zone shutdown, mark server dead and move on.
1404			 */
1405			if (zone_status_get(curproc->p_zone) >=
1406			    ZONE_IS_SHUTTING_DOWN) {
1407				mutex_enter(&mi->mi_lock);
1408				mi->mi_flags |= MI4_TIMEDOUT;
1409				mutex_exit(&mi->mi_lock);
1410				clfree4(client, ch, nfscl);
1411				if (cred_cloned)
1412					crfree(cr);
1413				return (EIO);
1414			}
1415
1416			/*
1417			 * NFS client failover support:
1418			 * return and let the caller take care of
1419			 * failover.  We only return for failover mounts
1420			 * because otherwise we want the "not responding"
1421			 * message, the timer updates, etc.
1422			 */
1423			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1424			    (error = try_failover(status)) != 0) {
1425				clfree4(client, ch, nfscl);
1426				if (cred_cloned)
1427					crfree(cr);
1428				*rpc_statusp = status;
1429				return (error);
1430			}
1431
1432			if (flags & RFSCALL_SOFT)
1433				break;
1434
1435			tryagain = TRUE;
1436
1437			/*
1438			 * The call is in progress (over COTS).
1439			 * Try the CLNT_CALL again, but don't
1440			 * print a noisy error message.
1441			 */
1442			if (status == RPC_INPROGRESS)
1443				break;
1444
1445			timeo = backoff(timeo);
1446			CLNT_GETERR(client, &rpcerr_tmp);
1447
1448			mutex_enter(&mi->mi_lock);
1449			if (!(mi->mi_flags & MI4_PRINTED)) {
1450				mi->mi_flags |= MI4_PRINTED;
1451				mutex_exit(&mi->mi_lock);
1452				if ((status == RPC_CANTSEND) &&
1453				    (rpcerr_tmp.re_errno == ENOBUFS))
1454					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
1455					    0, 0, FALSE, NULL, 0, NULL);
1456				else
1457					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1458					    0, 0, 0, FALSE, NULL, 0, NULL);
1459			} else
1460				mutex_exit(&mi->mi_lock);
1461
1462			if (*doqueue && nfs_has_ctty()) {
1463				*doqueue = 0;
1464				if (!(mi->mi_flags & MI4_NOPRINT)) {
1465					if ((status == RPC_CANTSEND) &&
1466					    (rpcerr_tmp.re_errno == ENOBUFS))
1467						nfs4_queue_fact(RF_SENDQ_FULL,
1468						    mi, 0, 0, 0, FALSE, NULL,
1469						    0, NULL);
1470					else
1471						nfs4_queue_fact(
1472						    RF_SRV_NOT_RESPOND, mi, 0,
1473						    0, 0, FALSE, NULL, 0, NULL);
1474				}
1475			}
1476		}
1477	} while (tryagain);
1478
1479	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1480	    int, rpcerr.re_errno);
1481
1482	if (status != RPC_SUCCESS) {
1483		zoneid_t zoneid = mi->mi_zone->zone_id;
1484
1485		/*
1486		 * Let soft mounts use the timed out message.
1487		 */
1488		if (status == RPC_INPROGRESS)
1489			status = RPC_TIMEDOUT;
1490		nfscl->nfscl_stat.badcalls.value.ui64++;
1491		if (status != RPC_INTR) {
1492			mutex_enter(&mi->mi_lock);
1493			mi->mi_flags |= MI4_DOWN;
1494			mutex_exit(&mi->mi_lock);
1495			CLNT_GETERR(client, &rpcerr);
1496#ifdef DEBUG
1497			bufp = clnt_sperror(client, svp->sv_hostname);
1498			zprintf(zoneid, "NFS%d %s failed for %s\n",
1499			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1500			if (nfs_has_ctty()) {
1501				if (!(mi->mi_flags & MI4_NOPRINT)) {
1502					uprintf("NFS%d %s failed for %s\n",
1503					    mi->mi_vers, mi->mi_rfsnames[which],
1504					    bufp);
1505				}
1506			}
1507			kmem_free(bufp, MAXPATHLEN);
1508#else
1509			zprintf(zoneid,
1510			    "NFS %s failed for server %s: error %d (%s)\n",
1511			    mi->mi_rfsnames[which], svp->sv_hostname,
1512			    status, clnt_sperrno(status));
1513			if (nfs_has_ctty()) {
1514				if (!(mi->mi_flags & MI4_NOPRINT)) {
1515					uprintf(
1516				"NFS %s failed for server %s: error %d (%s)\n",
1517					    mi->mi_rfsnames[which],
1518					    svp->sv_hostname, status,
1519					    clnt_sperrno(status));
1520				}
1521			}
1522#endif
1523			/*
1524			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1525			 * re_errno is set appropriately depending on
1526			 * the authentication error
1527			 */
1528			if (status == RPC_VERSMISMATCH ||
1529			    status == RPC_PROGVERSMISMATCH)
1530				rpcerr.re_errno = EIO;
1531		}
1532	} else {
1533		/*
1534		 * Test the value of mi_down and mi_printed without
1535		 * holding the mi_lock mutex.  If they are both zero,
1536		 * then it is okay to skip the down and printed
1537		 * processing.  This saves on a mutex_enter and
1538		 * mutex_exit pair for a normal, successful RPC.
1539		 * This was just complete overhead.
1540		 */
1541		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1542			mutex_enter(&mi->mi_lock);
1543			mi->mi_flags &= ~MI4_DOWN;
1544			if (mi->mi_flags & MI4_PRINTED) {
1545				mi->mi_flags &= ~MI4_PRINTED;
1546				mutex_exit(&mi->mi_lock);
1547				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1548					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1549					    0, FALSE, NULL, 0, NULL);
1550			} else
1551				mutex_exit(&mi->mi_lock);
1552		}
1553
1554		if (*doqueue == 0) {
1555			if (!(mi->mi_flags & MI4_NOPRINT) &&
1556			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1557				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1558				    FALSE, NULL, 0, NULL);
1559
1560			*doqueue = 1;
1561		}
1562	}
1563
1564	clfree4(client, ch, nfscl);
1565	if (cred_cloned)
1566		crfree(cr);
1567
1568	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1569
1570	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1571	    rpcerr.re_errno);
1572
1573	*rpc_statusp = status;
1574	return (rpcerr.re_errno);
1575}
1576
1577/*
1578 * rfs4call - general wrapper for RPC calls initiated by the client
1579 */
1580void
1581rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
1582    cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
1583{
1584	int i, error;
1585	enum clnt_stat rpc_status = NFS4_OK;
1586	int num_resops;
1587	struct nfs4_clnt *nfscl;
1588
1589	ASSERT(nfs_zone() == mi->mi_zone);
1590	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1591	ASSERT(nfscl != NULL);
1592
1593	nfscl->nfscl_stat.calls.value.ui64++;
1594	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1595
1596	/* Set up the results struct for XDR usage */
1597	resp->argsp = argsp;
1598	resp->array = NULL;
1599	resp->status = 0;
1600	resp->decode_len = 0;
1601
1602	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
1603	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1604	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1605	    doqueue, &rpc_status, flags, nfscl);
1606
1607	/* Return now if it was an RPC error */
1608	if (error) {
1609		ep->error = error;
1610		ep->stat = resp->status;
1611		ep->rpc_status = rpc_status;
1612		return;
1613	}
1614
1615	/* else we'll count the processed operations */
1616	num_resops = resp->decode_len;
1617	for (i = 0; i < num_resops; i++) {
1618		/*
1619		 * Count the individual operations
1620		 * processed by the server.
1621		 */
1622		if (resp->array[i].resop >= NFSPROC4_NULL &&
1623		    resp->array[i].resop <= OP_WRITE)
1624			mi->mi_reqs[resp->array[i].resop].value.ui64++;
1625	}
1626
1627	ep->error = 0;
1628	ep->stat = resp->status;
1629	ep->rpc_status = rpc_status;
1630}
1631
1632/*
1633 * nfs4rename_update - updates stored state after a rename.  Currently this
1634 * is the path of the object and anything under it, and the filehandle of
1635 * the renamed object.
1636 */
1637void
1638nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
1639{
1640	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
1641	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
1642}
1643
1644/*
1645 * Routine to look up the filehandle for the given path and rootvp.
1646 *
1647 * Return values:
1648 * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1649 *   updated.
1650 * - error: return value (errno value) and/or *statp is set appropriately.
1651 */
1652#define	RML_ORDINARY	1
1653#define	RML_NAMED_ATTR	2
1654#define	RML_ATTRDIR	3
1655
1656static void
1657remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1658    int filetype, cred_t *cr,
1659    nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
1660    nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
1661    nfs4_error_t *ep)
1662{
1663	COMPOUND4args_clnt args;
1664	COMPOUND4res_clnt res;
1665	nfs_argop4 *argop;
1666	nfs_resop4 *resop;
1667	int num_argops;
1668	lookup4_param_t lookuparg;
1669	nfs_fh4 *tmpfhp;
1670	int doqueue = 1;
1671	char *path;
1672	mntinfo4_t *mi;
1673
1674	ASSERT(fname != NULL);
1675	ASSERT(rootvp->v_type == VDIR);
1676
1677	mi = VTOMI4(rootvp);
1678	path = fn_path(fname);
1679	switch (filetype) {
1680	case RML_NAMED_ATTR:
1681		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1682		args.ctag = TAG_REMAP_LOOKUP_NA;
1683		break;
1684	case RML_ATTRDIR:
1685		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1686		args.ctag = TAG_REMAP_LOOKUP_AD;
1687		break;
1688	case RML_ORDINARY:
1689		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1690		args.ctag = TAG_REMAP_LOOKUP;
1691		break;
1692	default:
1693		ep->error = EINVAL;
1694		return;
1695	}
1696	lookuparg.argsp = &args;
1697	lookuparg.resp = &res;
1698	lookuparg.header_len = 1;	/* Putfh */
1699	lookuparg.trailer_len = 0;
1700	lookuparg.ga_bits = NFS4_VATTR_MASK;
1701	lookuparg.mi = VTOMI4(rootvp);
1702
1703	(void) nfs4lookup_setup(path, &lookuparg, 1);
1704
1705	/* 0: putfh directory */
1706	argop = args.array;
1707	argop[0].argop = OP_CPUTFH;
1708	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1709
1710	num_argops = args.array_len;
1711
1712	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1713
1714	if (ep->error || res.status != NFS4_OK)
1715		goto exit;
1716
1717	/* get the object filehandle */
1718	resop = &res.array[res.array_len - 2];
1719	if (resop->resop != OP_GETFH) {
1720		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1721		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1722		ep->stat = NFS4ERR_SERVERFAULT;
1723		goto exit;
1724	}
1725	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1726	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1727		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1728		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1729		    TAG_NONE, 0, 0);
1730		ep->stat = NFS4ERR_SERVERFAULT;
1731		goto exit;
1732	}
1733	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1734	nfs_fh4_copy(tmpfhp, fhp);
1735
1736	/* get the object attributes */
1737	resop = &res.array[res.array_len - 1];
1738	if (garp && resop->resop == OP_GETATTR)
1739		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
1740
1741	/* See if there are enough fields in the response for parent info */
1742	if ((int)res.array_len - 5 <= 0)
1743		goto exit;
1744
1745	/* get the parent filehandle */
1746	resop = &res.array[res.array_len - 5];
1747	if (resop->resop != OP_GETFH) {
1748		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1749		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1750		ep->stat = NFS4ERR_SERVERFAULT;
1751		goto exit;
1752	}
1753	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1754	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1755		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1756		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1757		    TAG_NONE, 0, 0);
1758		ep->stat = NFS4ERR_SERVERFAULT;
1759		goto exit;
1760	}
1761	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1762	nfs_fh4_copy(tmpfhp, pfhp);
1763
1764	/* get the parent attributes */
1765	resop = &res.array[res.array_len - 4];
1766	if (pgarp && resop->resop == OP_GETATTR)
1767		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
1768
1769exit:
1770	/*
1771	 * It is too hard to remember where all the OP_LOOKUPs are
1772	 */
1773	nfs4args_lookup_free(argop, num_argops);
1774	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1775
1776	if (!ep->error)
1777		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1778	kmem_free(path, strlen(path)+1);
1779}
1780
1781/*
1782 * NFS client failover / volatile filehandle support
1783 *
1784 * Recover the filehandle for the given rnode.
1785 *
1786 * Errors are returned via the nfs4_error_t parameter.
1787 */
1788
1789void
1790nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
1791{
1792	int is_stub;
1793	rnode4_t *rp = VTOR4(vp);
1794	vnode_t *rootvp = NULL;
1795	vnode_t *dvp = NULL;
1796	cred_t *cr, *cred_otw;
1797	nfs4_ga_res_t gar, pgar;
1798	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
1799	int filetype = RML_ORDINARY;
1800	nfs4_recov_state_t recov = {NULL, 0, 0};
1801	int badfhcount = 0;
1802	nfs4_open_stream_t *osp = NULL;
1803	bool_t first_time = TRUE;	/* first time getting OTW cred */
1804	bool_t last_time = FALSE;	/* last time getting OTW cred */
1805
1806	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1807	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
1808	ASSERT(nfs4_consistent_type(vp));
1809
1810	if (vp->v_flag & VROOT) {
1811		nfs4_remap_root(mi, ep, flags);
1812		return;
1813	}
1814
1815	/*
1816	 * Given the root fh, use the path stored in
1817	 * the rnode to find the fh for the new server.
1818	 */
1819	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1820	if (ep->error != 0)
1821		return;
1822
1823	cr = curthread->t_cred;
1824	ASSERT(cr != NULL);
1825get_remap_cred:
1826	/*
1827	 * Releases the osp, if it is provided.
1828	 * Puts a hold on the cred_otw and the new osp (if found).
1829	 */
1830	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
1831	    &first_time, &last_time);
1832	ASSERT(cred_otw != NULL);
1833
1834	if (rp->r_flags & R4ISXATTR) {
1835		filetype = RML_NAMED_ATTR;
1836		(void) vtodv(vp, &dvp, cred_otw, FALSE);
1837	}
1838
1839	if (vp->v_flag & V_XATTRDIR) {
1840		filetype = RML_ATTRDIR;
1841	}
1842
1843	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
1844		/* file mount, doesn't need a remap */
1845		goto done;
1846	}
1847
1848again:
1849	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
1850	    &newfh, &gar, &newpfh, &pgar, ep);
1851
1852	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1853	    "nfs4_remap_file: remap_lookup returned %d/%d",
1854	    ep->error, ep->stat));
1855
1856	if (last_time == FALSE && ep->error == EACCES) {
1857		crfree(cred_otw);
1858		if (dvp != NULL)
1859			VN_RELE(dvp);
1860		goto get_remap_cred;
1861	}
1862	if (ep->error != 0)
1863		goto done;
1864
1865	switch (ep->stat) {
1866	case NFS4_OK:
1867		badfhcount = 0;
1868		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1869			mutex_enter(&rp->r_statelock);
1870			rp->r_delay_interval = 0;
1871			mutex_exit(&rp->r_statelock);
1872			uprintf("NFS File Available..\n");
1873		}
1874		break;
1875	case NFS4ERR_FHEXPIRED:
1876	case NFS4ERR_BADHANDLE:
1877	case NFS4ERR_STALE:
1878		/*
1879		 * If we ran into filehandle problems, we should try to
1880		 * remap the root vnode first and hope life gets better.
1881		 * But we need to avoid loops.
1882		 */
1883		if (badfhcount++ > 0)
1884			goto done;
1885		if (newfh.nfs_fh4_len != 0) {
1886			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1887			newfh.nfs_fh4_len = 0;
1888		}
1889		if (newpfh.nfs_fh4_len != 0) {
1890			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1891			newpfh.nfs_fh4_len = 0;
1892		}
1893		/* relative path - remap rootvp then retry */
1894		VN_RELE(rootvp);
1895		rootvp = NULL;
1896		nfs4_remap_root(mi, ep, flags);
1897		if (ep->error != 0 || ep->stat != NFS4_OK)
1898			goto done;
1899		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1900		if (ep->error != 0)
1901			goto done;
1902		goto again;
1903	case NFS4ERR_DELAY:
1904		badfhcount = 0;
1905		nfs4_set_delay_wait(vp);
1906		ep->error = nfs4_wait_for_delay(vp, &recov);
1907		if (ep->error != 0)
1908			goto done;
1909		goto again;
1910	case NFS4ERR_ACCESS:
1911		/* get new cred, try again */
1912		if (last_time == TRUE)
1913			goto done;
1914		if (dvp != NULL)
1915			VN_RELE(dvp);
1916		crfree(cred_otw);
1917		goto get_remap_cred;
1918	default:
1919		goto done;
1920	}
1921
1922	/*
1923	 * Check on the new and old rnodes before updating;
1924	 * if the vnode type or size changes, issue a warning
1925	 * and mark the file dead.
1926	 */
1927	mutex_enter(&rp->r_statelock);
1928	if (flags & NFS4_REMAP_CKATTRS) {
1929		if (vp->v_type != gar.n4g_va.va_type ||
1930		    (vp->v_type != VDIR &&
1931		    rp->r_size != gar.n4g_va.va_size)) {
1932			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1933			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
1934			    (int)rp->r_size, (int)gar.n4g_va.va_size,
1935			    vp->v_type, gar.n4g_va.va_type));
1936			mutex_exit(&rp->r_statelock);
1937			nfs4_queue_event(RE_FILE_DIFF, mi,
1938			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
1939			    TAG_NONE, TAG_NONE, 0, 0);
1940			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
1941			goto done;
1942		}
1943	}
1944	ASSERT(gar.n4g_va.va_type != VNON);
1945	rp->r_server = mi->mi_curr_serv;
1946
1947	/*
1948	 * Turn this object into a "stub" object if we
1949	 * crossed an underlying server fs boundary.
1950	 *
1951	 * This stub will be for a mirror-mount.
1952	 * A referral would look like a boundary crossing
1953	 * as well, but would not be the same type of object,
1954	 * so we would expect to mark the object dead.
1955	 *
1956	 * See comment in r4_do_attrcache() for more details.
1957	 */
1958	is_stub = 0;
1959	if (gar.n4g_fsid_valid) {
1960		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
1961		rp->r_srv_fsid = gar.n4g_fsid;
1962		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
1963			is_stub = 1;
1964		nfs_rw_exit(&rp->r_server->sv_lock);
1965#ifdef DEBUG
1966	} else {
1967		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1968		    "remap_file: fsid attr not provided by server.  rp=%p",
1969		    (void *)rp));
1970#endif
1971	}
1972	if (is_stub)
1973		r4_stub_mirrormount(rp);
1974	else
1975		r4_stub_none(rp);
1976	mutex_exit(&rp->r_statelock);
1977	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
1978	sfh4_update(rp->r_fh, &newfh);
1979	ASSERT(nfs4_consistent_type(vp));
1980
1981	/*
1982	 * If we got parent info, use it to update the parent
1983	 */
1984	if (newpfh.nfs_fh4_len != 0) {
1985		if (rp->r_svnode.sv_dfh != NULL)
1986			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
1987		if (dvp != NULL) {
1988			/* force update of attrs */
1989			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
1990		}
1991	}
1992done:
1993	if (newfh.nfs_fh4_len != 0)
1994		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1995	if (newpfh.nfs_fh4_len != 0)
1996		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1997	if (cred_otw != NULL)
1998		crfree(cred_otw);
1999	if (rootvp != NULL)
2000		VN_RELE(rootvp);
2001	if (dvp != NULL)
2002		VN_RELE(dvp);
2003	if (osp != NULL)
2004		open_stream_rele(osp, rp);
2005}
2006
2007/*
2008 * Client-side failover support: remap the filehandle for vp if it appears
2009 * necessary.  errors are returned via the nfs4_error_t parameter; though,
2010 * if there is a problem, we will just try again later.
2011 */
2012
2013void
2014nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2015{
2016	if (vp == NULL)
2017		return;
2018
2019	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
2020		return;
2021
2022	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
2023		return;
2024
2025	nfs4_remap_file(mi, vp, flags, ep);
2026}
2027
2028/*
2029 * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2030 *
2031 * Our caller has a filehandle for ".." relative to a particular
2032 * directory object.  We want to find or create a parent vnode
2033 * with that filehandle and return it.  We can of course create
2034 * a vnode from this filehandle, but we need to also make sure
2035 * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2036 * that we have a parent FH for future reopens as well.  If
2037 * we have a remap failure, we won't be able to reopen this
2038 * file, but we won't treat that as fatal because a reopen
2039 * is at least unlikely.  Someday nfs4_reopen() should look
2040 * for a missing parent FH and try a remap to recover from it.
2041 *
2042 * need_start_op argument indicates whether this function should
2043 * do a start_op before calling remap_lookup().  This should
2044 * be FALSE, if you are the recovery thread or in an op; otherwise,
2045 * set it to TRUE.
2046 */
2047int
2048nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
2049    cred_t *cr, vnode_t **vpp, int need_start_op)
2050{
2051	mntinfo4_t *mi = VTOMI4(dvp);
2052	nfs4_fname_t *np = NULL, *pnp = NULL;
2053	vnode_t *vp = NULL, *rootvp = NULL;
2054	rnode4_t *rp;
2055	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2056	nfs4_ga_res_t gar, pgar;
2057	vattr_t va, pva;
2058	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2059	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
2060	nfs4_recov_state_t recov_state;
2061
2062#ifdef DEBUG
2063	/*
2064	 * ensure need_start_op is correct
2065	 */
2066	{
2067		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
2068		    (curthread == mi->mi_recovthread));
2069		/* C needs a ^^ operator! */
2070		ASSERT(((need_start_op) && (!no_need_start_op)) ||
2071		    ((! need_start_op) && (no_need_start_op)));
2072	}
2073#endif
2074	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
2075
2076	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
2077	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
2078	    rnode4info(VTOR4(dvp))));
2079
2080	/*
2081	 * rootvp might be needed eventually. Holding it now will
2082	 * ensure that r4find_unlocked() will find it, if ".." is the root.
2083	 */
2084	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2085	if (e.error != 0)
2086		goto out;
2087	rp = r4find_unlocked(fhp, mi->mi_vfsp);
2088	if (rp != NULL) {
2089		*vpp = RTOV4(rp);
2090		VN_RELE(rootvp);
2091		return (0);
2092	}
2093
2094	/*
2095	 * Since we don't have the rnode, we have to go over the wire.
2096	 * remap_lookup() can get all of the filehandles and attributes
2097	 * we need in one operation.
2098	 */
2099	np = fn_parent(VTOSV(dvp)->sv_name);
2100	/* if a parent was not found return an error */
2101	if (np == NULL) {
2102		e.error = ENOENT;
2103		goto out;
2104	}
2105
2106	recov_state.rs_flags = 0;
2107	recov_state.rs_num_retry_despite_err = 0;
2108recov_retry:
2109	if (need_start_op) {
2110		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
2111		    &recov_state, NULL);
2112		if (e.error != 0) {
2113			goto out;
2114		}
2115	}
2116
2117	pgar.n4g_va.va_type = VNON;
2118	gar.n4g_va.va_type = VNON;
2119
2120	remap_lookup(np, rootvp, RML_ORDINARY, cr,
2121	    &newfh, &gar, &newpfh, &pgar, &e);
2122	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2123		if (need_start_op) {
2124			bool_t abort;
2125
2126			abort = nfs4_start_recovery(&e, mi,
2127			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
2128			    NULL);
2129			if (abort) {
2130				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2131				    &recov_state, FALSE);
2132				if (e.error == 0)
2133					e.error = EIO;
2134				goto out;
2135			}
2136			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2137			    &recov_state, TRUE);
2138			goto recov_retry;
2139		}
2140		if (e.error == 0)
2141			e.error = EIO;
2142		goto out;
2143	}
2144
2145	va = gar.n4g_va;
2146	pva = pgar.n4g_va;
2147
2148	if ((e.error != 0) ||
2149	    (va.va_type != VDIR)) {
2150		if (need_start_op)
2151			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2152			    &recov_state, FALSE);
2153		if (e.error == 0)
2154			e.error = EIO;
2155		goto out;
2156	}
2157
2158	if (e.stat != NFS4_OK) {
2159		if (need_start_op)
2160			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2161			    &recov_state, FALSE);
2162		e.error = EIO;
2163		goto out;
2164	}
2165
2166	/*
2167	 * It is possible for remap_lookup() to return with no error,
2168	 * but without providing the parent filehandle and attrs.
2169	 */
2170	if (pva.va_type != VDIR) {
2171		/*
2172		 * Call remap_lookup() again, this time with the
2173		 * newpfh and pgar args in the first position.
2174		 */
2175		pnp = fn_parent(np);
2176		if (pnp != NULL) {
2177			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
2178			    &newpfh, &pgar, NULL, NULL, &e);
2179			/*
2180			 * This remap_lookup call modifies pgar. The following
2181			 * line prevents trouble when checking the va_type of
2182			 * pva later in this code.
2183			 */
2184			pva = pgar.n4g_va;
2185
2186			if (nfs4_needs_recovery(&e, FALSE,
2187			    mi->mi_vfsp)) {
2188				if (need_start_op) {
2189					bool_t abort;
2190
2191					abort = nfs4_start_recovery(&e, mi,
2192					    rootvp, NULL, NULL, NULL,
2193					    OP_LOOKUP, NULL, NULL, NULL);
2194					if (abort) {
2195						nfs4_end_fop(mi, rootvp, NULL,
2196						    OH_LOOKUP, &recov_state,
2197						    FALSE);
2198						if (e.error == 0)
2199							e.error = EIO;
2200						goto out;
2201					}
2202					nfs4_end_fop(mi, rootvp, NULL,
2203					    OH_LOOKUP, &recov_state, TRUE);
2204					goto recov_retry;
2205				}
2206				if (e.error == 0)
2207					e.error = EIO;
2208				goto out;
2209			}
2210
2211			if (e.stat != NFS4_OK) {
2212				if (need_start_op)
2213					nfs4_end_fop(mi, rootvp, NULL,
2214					    OH_LOOKUP, &recov_state, FALSE);
2215				e.error = EIO;
2216				goto out;
2217			}
2218		}
2219		if ((pnp == NULL) ||
2220		    (e.error != 0) ||
2221		    (pva.va_type == VNON)) {
2222			if (need_start_op)
2223				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2224				    &recov_state, FALSE);
2225			if (e.error == 0)
2226				e.error = EIO;
2227			goto out;
2228		}
2229	}
2230	ASSERT(newpfh.nfs_fh4_len != 0);
2231	if (need_start_op)
2232		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
2233	psfh = sfh4_get(&newpfh, mi);
2234
2235	sfh = sfh4_get(&newfh, mi);
2236	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
2237
2238out:
2239	if (np != NULL)
2240		fn_rele(&np);
2241	if (pnp != NULL)
2242		fn_rele(&pnp);
2243	if (newfh.nfs_fh4_len != 0)
2244		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2245	if (newpfh.nfs_fh4_len != 0)
2246		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2247	if (sfh != NULL)
2248		sfh4_rele(&sfh);
2249	if (psfh != NULL)
2250		sfh4_rele(&psfh);
2251	if (rootvp != NULL)
2252		VN_RELE(rootvp);
2253	*vpp = vp;
2254	return (e.error);
2255}
2256
2257#ifdef DEBUG
2258size_t r_path_memuse = 0;
2259#endif
2260
2261/*
2262 * NFS client failover support
2263 *
2264 * sv4_free() frees the malloc'd portion of a "servinfo_t".
2265 */
2266void
2267sv4_free(servinfo4_t *svp)
2268{
2269	servinfo4_t *next;
2270	struct knetconfig *knconf;
2271
2272	while (svp != NULL) {
2273		next = svp->sv_next;
2274		if (svp->sv_dhsec)
2275			sec_clnt_freeinfo(svp->sv_dhsec);
2276		if (svp->sv_secdata)
2277			sec_clnt_freeinfo(svp->sv_secdata);
2278		if (svp->sv_save_secinfo &&
2279		    svp->sv_save_secinfo != svp->sv_secinfo)
2280			secinfo_free(svp->sv_save_secinfo);
2281		if (svp->sv_secinfo)
2282			secinfo_free(svp->sv_secinfo);
2283		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2284			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2285		knconf = svp->sv_knconf;
2286		if (knconf != NULL) {
2287			if (knconf->knc_protofmly != NULL)
2288				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2289			if (knconf->knc_proto != NULL)
2290				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2291			kmem_free(knconf, sizeof (*knconf));
2292		}
2293		knconf = svp->sv_origknconf;
2294		if (knconf != NULL) {
2295			if (knconf->knc_protofmly != NULL)
2296				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2297			if (knconf->knc_proto != NULL)
2298				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2299			kmem_free(knconf, sizeof (*knconf));
2300		}
2301		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2302			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2303		if (svp->sv_path != NULL) {
2304			kmem_free(svp->sv_path, svp->sv_pathlen);
2305		}
2306		nfs_rw_destroy(&svp->sv_lock);
2307		kmem_free(svp, sizeof (*svp));
2308		svp = next;
2309	}
2310}
2311
2312void
2313nfs4_printfhandle(nfs4_fhandle_t *fhp)
2314{
2315	int *ip;
2316	char *buf;
2317	size_t bufsize;
2318	char *cp;
2319
2320	/*
2321	 * 13 == "(file handle:"
2322	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2323	 *	1 == ' '
2324	 *	8 == maximum strlen of "%x"
2325	 * 3 == ")\n\0"
2326	 */
2327	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2328	buf = kmem_alloc(bufsize, KM_NOSLEEP);
2329	if (buf == NULL)
2330		return;
2331
2332	cp = buf;
2333	(void) strcpy(cp, "(file handle:");
2334	while (*cp != '\0')
2335		cp++;
2336	for (ip = (int *)fhp->fh_buf;
2337	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2338	    ip++) {
2339		(void) sprintf(cp, " %x", *ip);
2340		while (*cp != '\0')
2341			cp++;
2342	}
2343	(void) strcpy(cp, ")\n");
2344
2345	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2346
2347	kmem_free(buf, bufsize);
2348}
2349
2350/*
2351 * The NFSv4 readdir cache subsystem.
2352 *
2353 * We provide a set of interfaces to allow the rest of the system to utilize
2354 * a caching mechanism while encapsulating the details of the actual
2355 * implementation.  This should allow for better maintainability and
2356 * extensibility by consolidating the implementation details in one location.
2357 */
2358
2359/*
2360 * Comparator used by AVL routines.
2361 */
2362static int
2363rddir4_cache_compar(const void *x, const void *y)
2364{
2365	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2366	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2367	rddir4_cache *a = &ai->rc;
2368	rddir4_cache *b = &bi->rc;
2369
2370	if (a->nfs4_cookie == b->nfs4_cookie) {
2371		if (a->buflen == b->buflen)
2372			return (0);
2373		if (a->buflen < b->buflen)
2374			return (-1);
2375		return (1);
2376	}
2377
2378	if (a->nfs4_cookie < b->nfs4_cookie)
2379			return (-1);
2380
2381	return (1);
2382}
2383
2384/*
2385 * Allocate an opaque handle for the readdir cache.
2386 */
2387void
2388rddir4_cache_create(rnode4_t *rp)
2389{
2390	ASSERT(rp->r_dir == NULL);
2391
2392	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2393
2394	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2395	    offsetof(rddir4_cache_impl, tree));
2396}
2397
2398/*
2399 *  Purge the cache of all cached readdir responses.
2400 */
2401void
2402rddir4_cache_purge(rnode4_t *rp)
2403{
2404	rddir4_cache_impl	*rdip;
2405	rddir4_cache_impl	*nrdip;
2406
2407	ASSERT(MUTEX_HELD(&rp->r_statelock));
2408
2409	if (rp->r_dir == NULL)
2410		return;
2411
2412	rdip = avl_first(rp->r_dir);
2413
2414	while (rdip != NULL) {
2415		nrdip = AVL_NEXT(rp->r_dir, rdip);
2416		avl_remove(rp->r_dir, rdip);
2417		rdip->rc.flags &= ~RDDIRCACHED;
2418		rddir4_cache_rele(rp, &rdip->rc);
2419		rdip = nrdip;
2420	}
2421	ASSERT(avl_numnodes(rp->r_dir) == 0);
2422}
2423
2424/*
2425 * Destroy the readdir cache.
2426 */
2427void
2428rddir4_cache_destroy(rnode4_t *rp)
2429{
2430	ASSERT(MUTEX_HELD(&rp->r_statelock));
2431	if (rp->r_dir == NULL)
2432		return;
2433
2434	rddir4_cache_purge(rp);
2435	avl_destroy(rp->r_dir);
2436	kmem_free(rp->r_dir, sizeof (avl_tree_t));
2437	rp->r_dir = NULL;
2438}
2439
2440/*
2441 * Locate a readdir response from the readdir cache.
2442 *
2443 * Return values:
2444 *
2445 * NULL - If there is an unrecoverable situation like the operation may have
2446 *	  been interrupted.
2447 *
2448 * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2449 *		    The flags are set approprately, such that the caller knows
2450 *		    what state the entry is in.
2451 */
2452rddir4_cache *
2453rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2454{
2455	rddir4_cache_impl	*rdip = NULL;
2456	rddir4_cache_impl	srdip;
2457	rddir4_cache		*srdc;
2458	rddir4_cache		*rdc = NULL;
2459	rddir4_cache		*nrdc = NULL;
2460	avl_index_t		where;
2461
2462top:
2463	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2464	ASSERT(MUTEX_HELD(&rp->r_statelock));
2465	/*
2466	 * Check to see if the readdir cache has been disabled.  If so, then
2467	 * simply allocate an rddir4_cache entry and return it, since caching
2468	 * operations do not apply.
2469	 */
2470	if (rp->r_dir == NULL) {
2471		if (nrdc == NULL) {
2472			/*
2473			 * Drop the lock because we are doing a sleeping
2474			 * allocation.
2475			 */
2476			mutex_exit(&rp->r_statelock);
2477			rdc = rddir4_cache_alloc(KM_SLEEP);
2478			rdc->nfs4_cookie = cookie;
2479			rdc->buflen = count;
2480			mutex_enter(&rp->r_statelock);
2481			return (rdc);
2482		}
2483		return (nrdc);
2484	}
2485
2486	srdc = &srdip.rc;
2487	srdc->nfs4_cookie = cookie;
2488	srdc->buflen = count;
2489
2490	rdip = avl_find(rp->r_dir, &srdip, &where);
2491
2492	/*
2493	 * If we didn't find an entry then create one and insert it
2494	 * into the cache.
2495	 */
2496	if (rdip == NULL) {
2497		/*
2498		 * Check for the case where we have made a second pass through
2499		 * the cache due to a lockless allocation.  If we find that no
2500		 * thread has already inserted this entry, do the insert now
2501		 * and return.
2502		 */
2503		if (nrdc != NULL) {
2504			avl_insert(rp->r_dir, nrdc->data, where);
2505			nrdc->flags |= RDDIRCACHED;
2506			rddir4_cache_hold(nrdc);
2507			return (nrdc);
2508		}
2509
2510#ifdef DEBUG
2511		nfs4_readdir_cache_misses++;
2512#endif
2513		/*
2514		 * First, try to allocate an entry without sleeping.  If that
2515		 * fails then drop the lock and do a sleeping allocation.
2516		 */
2517		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2518		if (nrdc != NULL) {
2519			nrdc->nfs4_cookie = cookie;
2520			nrdc->buflen = count;
2521			avl_insert(rp->r_dir, nrdc->data, where);
2522			nrdc->flags |= RDDIRCACHED;
2523			rddir4_cache_hold(nrdc);
2524			return (nrdc);
2525		}
2526
2527		/*
2528		 * Drop the lock and do a sleeping allocation.	We incur
2529		 * additional overhead by having to search the cache again,
2530		 * but this case should be rare.
2531		 */
2532		mutex_exit(&rp->r_statelock);
2533		nrdc = rddir4_cache_alloc(KM_SLEEP);
2534		nrdc->nfs4_cookie = cookie;
2535		nrdc->buflen = count;
2536		mutex_enter(&rp->r_statelock);
2537		/*
2538		 * We need to take another pass through the cache
2539		 * since we dropped our lock to perform the alloc.
2540		 * Another thread may have come by and inserted the
2541		 * entry we are interested in.
2542		 */
2543		goto top;
2544	}
2545
2546	/*
2547	 * Check to see if we need to free our entry.  This can happen if
2548	 * another thread came along beat us to the insert.  We can
2549	 * safely call rddir4_cache_free directly because no other thread
2550	 * would have a reference to this entry.
2551	 */
2552	if (nrdc != NULL)
2553		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2554
2555#ifdef DEBUG
2556	nfs4_readdir_cache_hits++;
2557#endif
2558	/*
2559	 * Found something.  Make sure it's ready to return.
2560	 */
2561	rdc = &rdip->rc;
2562	rddir4_cache_hold(rdc);
2563	/*
2564	 * If the cache entry is in the process of being filled in, wait
2565	 * until this completes.  The RDDIRWAIT bit is set to indicate that
2566	 * someone is waiting and when the thread currently filling the entry
2567	 * is done, it should do a cv_broadcast to wakeup all of the threads
2568	 * waiting for it to finish. If the thread wakes up to find that
2569	 * someone new is now trying to complete the the entry, go back
2570	 * to sleep.
2571	 */
2572	while (rdc->flags & RDDIR) {
2573		/*
2574		 * The entry is not complete.
2575		 */
2576		nfs_rw_exit(&rp->r_rwlock);
2577		rdc->flags |= RDDIRWAIT;
2578#ifdef DEBUG
2579		nfs4_readdir_cache_waits++;
2580#endif
2581		while (rdc->flags & RDDIRWAIT) {
2582			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2583				/*
2584				 * We got interrupted, probably the user
2585				 * typed ^C or an alarm fired.  We free the
2586				 * new entry if we allocated one.
2587				 */
2588				rddir4_cache_rele(rp, rdc);
2589				mutex_exit(&rp->r_statelock);
2590				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2591				    RW_READER, FALSE);
2592				mutex_enter(&rp->r_statelock);
2593				return (NULL);
2594			}
2595		}
2596		mutex_exit(&rp->r_statelock);
2597		(void) nfs_rw_enter_sig(&rp->r_rwlock,
2598		    RW_READER, FALSE);
2599		mutex_enter(&rp->r_statelock);
2600	}
2601
2602	/*
2603	 * The entry we were waiting on may have been purged from
2604	 * the cache and should no longer be used, release it and
2605	 * start over.
2606	 */
2607	if (!(rdc->flags & RDDIRCACHED)) {
2608		rddir4_cache_rele(rp, rdc);
2609		goto top;
2610	}
2611
2612	/*
2613	 * The entry is completed.  Return it.
2614	 */
2615	return (rdc);
2616}
2617
2618/*
2619 * Allocate a cache element and return it.  Can return NULL if memory is
2620 * low.
2621 */
2622static rddir4_cache *
2623rddir4_cache_alloc(int flags)
2624{
2625	rddir4_cache_impl	*rdip = NULL;
2626	rddir4_cache		*rc = NULL;
2627
2628	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2629
2630	if (rdip != NULL) {
2631		rc = &rdip->rc;
2632		rc->data = (void *)rdip;
2633		rc->nfs4_cookie = 0;
2634		rc->nfs4_ncookie = 0;
2635		rc->entries = NULL;
2636		rc->eof = 0;
2637		rc->entlen = 0;
2638		rc->buflen = 0;
2639		rc->actlen = 0;
2640		/*
2641		 * A readdir is required so set the flag.
2642		 */
2643		rc->flags = RDDIRREQ;
2644		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2645		rc->error = 0;
2646		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2647		rdip->count = 1;
2648#ifdef DEBUG
2649		atomic_inc_64(&clstat4_debug.dirent.value.ui64);
2650#endif
2651	}
2652	return (rc);
2653}
2654
2655/*
2656 * Increment the reference count to this cache element.
2657 */
2658static void
2659rddir4_cache_hold(rddir4_cache *rc)
2660{
2661	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2662
2663	mutex_enter(&rdip->lock);
2664	rdip->count++;
2665	mutex_exit(&rdip->lock);
2666}
2667
2668/*
2669 * Release a reference to this cache element.  If the count is zero then
2670 * free the element.
2671 */
2672void
2673rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2674{
2675	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2676
2677	ASSERT(MUTEX_HELD(&rp->r_statelock));
2678
2679	/*
2680	 * Check to see if we have any waiters.  If so, we can wake them
2681	 * so that they can proceed.
2682	 */
2683	if (rdc->flags & RDDIRWAIT) {
2684		rdc->flags &= ~RDDIRWAIT;
2685		cv_broadcast(&rdc->cv);
2686	}
2687
2688	mutex_enter(&rdip->lock);
2689	ASSERT(rdip->count > 0);
2690	if (--rdip->count == 0) {
2691		mutex_exit(&rdip->lock);
2692		rddir4_cache_free(rdip);
2693	} else
2694		mutex_exit(&rdip->lock);
2695}
2696
2697/*
2698 * Free a cache element.
2699 */
2700static void
2701rddir4_cache_free(rddir4_cache_impl *rdip)
2702{
2703	rddir4_cache *rc = &rdip->rc;
2704
2705#ifdef DEBUG
2706	atomic_dec_64(&clstat4_debug.dirent.value.ui64);
2707#endif
2708	if (rc->entries != NULL)
2709		kmem_free(rc->entries, rc->buflen);
2710	cv_destroy(&rc->cv);
2711	mutex_destroy(&rdip->lock);
2712	kmem_free(rdip, sizeof (*rdip));
2713}
2714
2715/*
2716 * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2717 * framework.
2718 */
2719static int
2720cl4_snapshot(kstat_t *ksp, void *buf, int rw)
2721{
2722	ksp->ks_snaptime = gethrtime();
2723	if (rw == KSTAT_WRITE) {
2724		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
2725#ifdef DEBUG
2726		/*
2727		 * Currently only the global zone can write to kstats, but we
2728		 * add the check just for paranoia.
2729		 */
2730		if (INGLOBALZONE(curproc))
2731			bcopy((char *)buf + sizeof (clstat4_tmpl),
2732			    &clstat4_debug, sizeof (clstat4_debug));
2733#endif
2734	} else {
2735		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
2736#ifdef DEBUG
2737		/*
2738		 * If we're displaying the "global" debug kstat values, we
2739		 * display them as-is to all zones since in fact they apply to
2740		 * the system as a whole.
2741		 */
2742		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
2743		    sizeof (clstat4_debug));
2744#endif
2745	}
2746	return (0);
2747}
2748
2749
2750
2751/*
2752 * Zone support
2753 */
2754static void *
2755clinit4_zone(zoneid_t zoneid)
2756{
2757	kstat_t *nfs4_client_kstat;
2758	struct nfs4_clnt *nfscl;
2759	uint_t ndata;
2760
2761	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
2762	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
2763	nfscl->nfscl_chtable4 = NULL;
2764	nfscl->nfscl_zoneid = zoneid;
2765
2766	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
2767	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
2768#ifdef DEBUG
2769	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
2770#endif
2771	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
2772	    "misc", KSTAT_TYPE_NAMED, ndata,
2773	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
2774		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
2775		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
2776		kstat_install(nfs4_client_kstat);
2777	}
2778	mutex_enter(&nfs4_clnt_list_lock);
2779	list_insert_head(&nfs4_clnt_list, nfscl);
2780	mutex_exit(&nfs4_clnt_list_lock);
2781
2782	return (nfscl);
2783}
2784
2785/*ARGSUSED*/
2786static void
2787clfini4_zone(zoneid_t zoneid, void *arg)
2788{
2789	struct nfs4_clnt *nfscl = arg;
2790	chhead_t *chp, *next;
2791
2792	if (nfscl == NULL)
2793		return;
2794	mutex_enter(&nfs4_clnt_list_lock);
2795	list_remove(&nfs4_clnt_list, nfscl);
2796	mutex_exit(&nfs4_clnt_list_lock);
2797	clreclaim4_zone(nfscl, 0);
2798	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
2799		ASSERT(chp->ch_list == NULL);
2800		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
2801		next = chp->ch_next;
2802		kmem_free(chp, sizeof (*chp));
2803	}
2804	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
2805	mutex_destroy(&nfscl->nfscl_chtable4_lock);
2806	kmem_free(nfscl, sizeof (*nfscl));
2807}
2808
2809/*
2810 * Called by endpnt_destructor to make sure the client handles are
2811 * cleaned up before the RPC endpoints.  This becomes a no-op if
2812 * clfini_zone (above) is called first.  This function is needed
2813 * (rather than relying on clfini_zone to clean up) because the ZSD
2814 * callbacks have no ordering mechanism, so we have no way to ensure
2815 * that clfini_zone is called before endpnt_destructor.
2816 */
2817void
2818clcleanup4_zone(zoneid_t zoneid)
2819{
2820	struct nfs4_clnt *nfscl;
2821
2822	mutex_enter(&nfs4_clnt_list_lock);
2823	nfscl = list_head(&nfs4_clnt_list);
2824	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
2825		if (nfscl->nfscl_zoneid == zoneid) {
2826			clreclaim4_zone(nfscl, 0);
2827			break;
2828		}
2829	}
2830	mutex_exit(&nfs4_clnt_list_lock);
2831}
2832
2833int
2834nfs4_subr_init(void)
2835{
2836	/*
2837	 * Allocate and initialize the client handle cache
2838	 */
2839	chtab4_cache = kmem_cache_create("client_handle4_cache",
2840	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
2841	    NULL, 0);
2842
2843	/*
2844	 * Initialize the list of per-zone client handles (and associated data).
2845	 * This needs to be done before we call zone_key_create().
2846	 */
2847	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
2848	    offsetof(struct nfs4_clnt, nfscl_node));
2849
2850	/*
2851	 * Initialize the zone_key for per-zone client handle lists.
2852	 */
2853	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
2854
2855	if (nfs4err_delay_time == 0)
2856		nfs4err_delay_time = NFS4ERR_DELAY_TIME;
2857
2858	return (0);
2859}
2860
2861int
2862nfs4_subr_fini(void)
2863{
2864	/*
2865	 * Deallocate the client handle cache
2866	 */
2867	kmem_cache_destroy(chtab4_cache);
2868
2869	/*
2870	 * Destroy the zone_key
2871	 */
2872	(void) zone_key_delete(nfs4clnt_zone_key);
2873
2874	return (0);
2875}
2876/*
2877 * Set or Clear direct I/O flag
2878 * VOP_RWLOCK() is held for write access to prevent a race condition
2879 * which would occur if a process is in the middle of a write when
2880 * directio flag gets set. It is possible that all pages may not get flushed.
2881 *
2882 * This is a copy of nfs_directio, changes here may need to be made
2883 * there and vice versa.
2884 */
2885
2886int
2887nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
2888{
2889	int	error = 0;
2890	rnode4_t *rp;
2891
2892	rp = VTOR4(vp);
2893
2894	if (cmd == DIRECTIO_ON) {
2895
2896		if (rp->r_flags & R4DIRECTIO)
2897			return (0);
2898
2899		/*
2900		 * Flush the page cache.
2901		 */
2902
2903		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
2904
2905		if (rp->r_flags & R4DIRECTIO) {
2906			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2907			return (0);
2908		}
2909
2910		if (nfs4_has_pages(vp) &&
2911		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
2912			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
2913			    B_INVAL, cr, NULL);
2914			if (error) {
2915				if (error == ENOSPC || error == EDQUOT) {
2916					mutex_enter(&rp->r_statelock);
2917					if (!rp->r_error)
2918						rp->r_error = error;
2919					mutex_exit(&rp->r_statelock);
2920				}
2921				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2922				return (error);
2923			}
2924		}
2925
2926		mutex_enter(&rp->r_statelock);
2927		rp->r_flags |= R4DIRECTIO;
2928		mutex_exit(&rp->r_statelock);
2929		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2930		return (0);
2931	}
2932
2933	if (cmd == DIRECTIO_OFF) {
2934		mutex_enter(&rp->r_statelock);
2935		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
2936		mutex_exit(&rp->r_statelock);
2937		return (0);
2938	}
2939
2940	return (EINVAL);
2941}
2942
2943/*
2944 * Return TRUE if the file has any pages.  Always go back to
2945 * the master vnode to check v_pages since none of the shadows
2946 * can have pages.
2947 */
2948
2949bool_t
2950nfs4_has_pages(vnode_t *vp)
2951{
2952	rnode4_t *rp;
2953
2954	rp = VTOR4(vp);
2955	if (IS_SHADOW(vp, rp))
2956		vp = RTOV4(rp);	/* RTOV4 always gives the master */
2957
2958	return (vn_has_cached_data(vp));
2959}
2960
/*
 * Failover decision table, indexed directly by the clnt_stat value that
 * CLNT_CALL returned.  A non-zero "error" in the selected row is both the
 * error to hand back AND the signal that failover should be attempted; a
 * zero means no failover.
 *
 * Special note: each row also records the RPC_ value it was written for.
 * Should the RPC_ values ever be renumbered, direct indexing stops being
 * valid, but the mismatch between index and recorded value lets the
 * lookup code notice, issue a warning, and defensively attempt failover
 * anyway.
 */

static struct try_failover_tab {
	enum clnt_stat	cstat;
	int		error;
} try_failover_table [] = {
	{ RPC_SUCCESS,			0 },
	{ RPC_CANTENCODEARGS,		0 },
	{ RPC_CANTDECODERES,		0 },
	{ RPC_CANTSEND,			ECOMM },
	{ RPC_CANTRECV,			ECOMM },
	{ RPC_TIMEDOUT,			ETIMEDOUT },
	{ RPC_VERSMISMATCH,		0 },
	{ RPC_AUTHERROR,		0 },
	{ RPC_PROGUNAVAIL,		0 },
	{ RPC_PROGVERSMISMATCH,		0 },
	{ RPC_PROCUNAVAIL,		0 },
	{ RPC_CANTDECODEARGS,		0 },
	{ RPC_SYSTEMERROR,		ENOSR },
	{ RPC_UNKNOWNHOST,		EHOSTUNREACH },
	{ RPC_RPCBFAILURE,		ENETUNREACH },
	{ RPC_PROGNOTREGISTERED,	ECONNREFUSED },
	{ RPC_FAILED,			ETIMEDOUT },
	{ RPC_UNKNOWNPROTO,		EHOSTUNREACH },
	{ RPC_INTR,			0 },
	{ RPC_UNKNOWNADDR,		EHOSTUNREACH },
	{ RPC_TLIERROR,			0 },
	{ RPC_NOBROADCAST,		EHOSTUNREACH },
	{ RPC_N2AXLATEFAILURE,		ECONNREFUSED },
	{ RPC_UDERROR,			0 },
	{ RPC_INPROGRESS,		0 },
	{ RPC_STALERACHANDLE,		EINVAL },
	{ RPC_CANTCONNECT,		ECONNREFUSED },
	{ RPC_XPRTFAILED,		ECONNABORTED },
	{ RPC_CANTCREATESTREAM,		ECONNREFUSED },
	{ RPC_CANTSTORE,		ENOBUFS }
};
3012
3013/*
3014 * nfs4_try_failover - determine whether the client should
3015 * attempt failover based on the values stored in the nfs4_error_t.
3016 */
3017int
3018nfs4_try_failover(nfs4_error_t *ep)
3019{
3020	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
3021		return (TRUE);
3022
3023	if (ep->error && ep->rpc_status != RPC_SUCCESS)
3024		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
3025
3026	return (FALSE);
3027}
3028
3029/*
3030 * try_failover - internal version of nfs4_try_failover, called
3031 * only by rfscall and aclcall.  Determine if failover is warranted
3032 * based on the clnt_stat and return the error number if it is.
3033 */
3034static int
3035try_failover(enum clnt_stat rpc_status)
3036{
3037	int err = 0;
3038
3039	if (rpc_status == RPC_SUCCESS)
3040		return (0);
3041
3042#ifdef	DEBUG
3043	if (rpc_status != 0 && nfs4_try_failover_any) {
3044		err = ETIMEDOUT;
3045		goto done;
3046	}
3047#endif
3048	/*
3049	 * The rpc status is used as an index into the table.
3050	 * If the rpc status is outside of the range of the
3051	 * table or if the rpc error numbers have been changed
3052	 * since the table was constructed, then print a warning
3053	 * (DEBUG only) and try failover anyway.  Otherwise, just
3054	 * grab the resulting error number out of the table.
3055	 */
3056	if (rpc_status < RPC_SUCCESS || rpc_status >=
3057	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3058	    try_failover_table[rpc_status].cstat != rpc_status) {
3059
3060		err = ETIMEDOUT;
3061#ifdef	DEBUG
3062		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3063		    rpc_status);
3064#endif
3065	} else
3066		err = try_failover_table[rpc_status].error;
3067
3068done:
3069	if (rpc_status)
3070		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3071		    "nfs4_try_failover: %strying failover on error %d",
3072		    err ? "" : "NOT ", rpc_status));
3073
3074	return (err);
3075}
3076
3077void
3078nfs4_error_zinit(nfs4_error_t *ep)
3079{
3080	ep->error = 0;
3081	ep->stat = NFS4_OK;
3082	ep->rpc_status = RPC_SUCCESS;
3083}
3084
3085void
3086nfs4_error_init(nfs4_error_t *ep, int error)
3087{
3088	ep->error = error;
3089	ep->stat = NFS4_OK;
3090	ep->rpc_status = RPC_SUCCESS;
3091}
3092
3093
3094#ifdef DEBUG
3095
/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 *
 * The buffer is consumed as whole 32-bit words, each folded as
 * (word >> 16) ^ word, followed by the 0-3 bytes that remain after the
 * last whole word; the low 16 bits of the accumulated value are returned.
 * The word loads use the native byte order and expect "p" to be suitably
 * aligned for 32-bit access.
 *
 * A NULL buffer or a non-positive length hashes to 0.
 *
 * Note: earlier versions XORed in the FIRST (len % 4) bytes of the buffer
 * again instead of the trailing ones, so trailing bytes never influenced
 * the result.  Hashes of inputs whose length is not a multiple of 4
 * therefore differ from that older behavior.
 */
int
hash16(void *p, int len)
{
	int i, rem;
	uint32_t *wp;
	uint8_t *tail;
	uint32_t key = 0;

	if (p == NULL || len <= 0)
		return (0);

	/* split the length into whole words and 0-3 left-over bytes */
	rem = len & 3;
	len &= ~3;

	for (i = 0, wp = (uint32_t *)p; i < len; i += 4, wp++) {
		key ^= (*wp >> 16) ^ *wp;
	}

	/* hash left-over bytes; wp now points just past the last word */
	tail = (uint8_t *)wp;
	for (i = 0; i < rem; i++)
		key ^= tail[i];

	return (key & 0xffff);
}
3122
3123/*
3124 * rnode4info - return filehandle and path information for an rnode.
3125 * XXX MT issues: uses a single static buffer, no locking of path.
3126 */
3127char *
3128rnode4info(rnode4_t *rp)
3129{
3130	static char buf[80];
3131	nfs4_fhandle_t fhandle;
3132	char *path;
3133	char *type;
3134
3135	if (rp == NULL)
3136		return ("null");
3137	if (rp->r_flags & R4ISXATTR)
3138		type = "attr";
3139	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3140		type = "attrdir";
3141	else if (RTOV4(rp)->v_flag & VROOT)
3142		type = "root";
3143	else if (RTOV4(rp)->v_type == VDIR)
3144		type = "dir";
3145	else if (RTOV4(rp)->v_type == VREG)
3146		type = "file";
3147	else
3148		type = "other";
3149	sfh4_copyval(rp->r_fh, &fhandle);
3150	path = fn_path(rp->r_svnode.sv_name);
3151	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3152	    (void *)rp, path, type, rp->r_flags,
3153	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3154	kmem_free(path, strlen(path)+1);
3155	return (buf);
3156}
3157#endif
3158