xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_subr.c (revision c38d8a0cc80f59455547094c187f13232f6a55f5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cmn_err.h>
37 #include <sys/vtrace.h>
38 #include <sys/session.h>
39 #include <sys/thread.h>
40 #include <sys/dnlc.h>
41 #include <sys/cred_impl.h>
42 #include <sys/list.h>
43 #include <sys/sdt.h>
44 #include <sys/policy.h>
45 
46 #include <rpc/types.h>
47 #include <rpc/xdr.h>
48 
49 #include <nfs/nfs.h>
50 
51 #include <nfs/nfs_clnt.h>
52 
53 #include <nfs/nfs4.h>
54 #include <nfs/rnode4.h>
55 #include <nfs/nfs4_clnt.h>
56 
57 /*
58  * client side statistics
59  */
/*
 * Name/type table for the NFSv4 client statistics.  Every entry pairs a
 * counter name with KSTAT_DATA_UINT64.  The last four entries are only
 * compiled into DEBUG builds.
 *
 * clget4() in this file bumps clgets, cltoomany and (DEBUG only) clalloc
 * through nfscl->nfscl_stat; presumably that per-client structure is seeded
 * from this template, but the code doing so is not in this part of the
 * file -- confirm before relying on it.
 */
60 static const struct clstat4 clstat4_tmpl = {
61 	{ "calls",	KSTAT_DATA_UINT64 },
62 	{ "badcalls",	KSTAT_DATA_UINT64 },
63 	{ "clgets",	KSTAT_DATA_UINT64 },
64 	{ "cltoomany",	KSTAT_DATA_UINT64 },
65 #ifdef DEBUG
66 	{ "clalloc",	KSTAT_DATA_UINT64 },
67 	{ "noresponse",	KSTAT_DATA_UINT64 },
68 	{ "failover",	KSTAT_DATA_UINT64 },
69 	{ "remap",	KSTAT_DATA_UINT64 },
70 #endif
71 };
72 
/*
 * DEBUG-only, non-static set of named 64-bit counters.  Unlike
 * clstat4_tmpl this object is updated in place: clreclaim4_zone() in this
 * file increments clstat4_debug.clreclaim directly.
 */
73 #ifdef DEBUG
74 struct clstat4_debug clstat4_debug = {
75 	{ "nrnode",	KSTAT_DATA_UINT64 },
76 	{ "access",	KSTAT_DATA_UINT64 },
77 	{ "dirent",	KSTAT_DATA_UINT64 },
78 	{ "dirents",	KSTAT_DATA_UINT64 },
79 	{ "reclaim",	KSTAT_DATA_UINT64 },
80 	{ "clreclaim",	KSTAT_DATA_UINT64 },
81 	{ "f_reclaim",	KSTAT_DATA_UINT64 },
82 	{ "a_reclaim",	KSTAT_DATA_UINT64 },
83 	{ "r_reclaim",	KSTAT_DATA_UINT64 },
84 	{ "r_path",	KSTAT_DATA_UINT64 },
85 };
86 #endif
87 
88 /*
89  * We keep a global list of per-zone client data, so we can clean up all zones
90  * if we get low on memory.
91  */
/*
 * nfs4_clnt_list is walked by clreclaim4() with nfs4_clnt_list_lock held.
 * nfs4clnt_zone_key is not referenced in this part of the file; presumably
 * it is the zone key under which the per-zone data is registered -- confirm.
 */
92 static list_t nfs4_clnt_list;
93 static kmutex_t nfs4_clnt_list_lock;
94 static zone_key_t nfs4clnt_zone_key;
95 
/* kmem cache that clget4() allocates struct chtab entries from. */
96 static struct kmem_cache *chtab4_cache;
97 
/*
 * DEBUG-only knobs.  nfs4_utf8_debug gates the "Invalid UTF-8 filename"
 * warning in utf8_to_fn(); the other two are not used in this part of the
 * file.
 */
98 #ifdef DEBUG
99 static int nfs4_rfscall_debug;
100 static int nfs4_try_failover_any;
101 int nfs4_utf8_debug = 0;
102 #endif
103 
104 /*
105  * NFSv4 readdir cache implementation
106  */
/*
 * Private wrapper around a rddir4_cache element.  It adds the bookkeeping
 * the implementation needs: a reference count, the mutex protecting that
 * count, and the linkage used to keep the element in an AVL tree.
 */
107 typedef struct rddir4_cache_impl {
108 	rddir4_cache	rc;		/* readdir cache element */
109 	kmutex_t	lock;		/* lock protects count */
110 	uint_t		count;		/* reference count */
111 	avl_node_t	tree;		/* AVL tree link */
112 } rddir4_cache_impl;
113 
/*
 * Forward declarations for file-local helpers.  Their definitions are not
 * in this part of the file.
 */
114 static int rddir4_cache_compar(const void *, const void *);
115 static void rddir4_cache_free(rddir4_cache_impl *);
116 static rddir4_cache *rddir4_cache_alloc(int);
117 static void rddir4_cache_hold(rddir4_cache *);
118 static int try_failover(enum clnt_stat);
119 
/*
 * Readdir cache counters, all starting at zero.  Going by their names they
 * count cache hits, waits and misses; the code that updates them is not in
 * this part of the file.
 */
120 static int nfs4_readdir_cache_hits = 0;
121 static int nfs4_readdir_cache_waits = 0;
122 static int nfs4_readdir_cache_misses = 0;
123 
124 /*
125  * Shared nfs4 functions
126  */
127 
128 /*
129  * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
130  * be allocated.
131  */
132 
133 void
134 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
135 {
136 	to->nfs_fh4_len = from->nfs_fh4_len;
137 	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
138 }
139 
140 /*
141  * nfs4cmpfh - compare 2 filehandles.
142  * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
143  * "less" than the second, +1 if the first is "greater" than the second.
144  */
145 
146 int
147 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
148 {
149 	const char *c1, *c2;
150 
151 	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
152 		return (-1);
153 	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
154 		return (1);
155 	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
156 	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
157 	    c1++, c2++) {
158 		if (*c1 < *c2)
159 			return (-1);
160 		if (*c1 > *c2)
161 			return (1);
162 	}
163 
164 	return (0);
165 }
166 
167 /*
168  * Compare two v4 filehandles.  Return zero if they're the same, non-zero
169  * if they're not.  Like nfs4cmpfh(), but different filehandle
170  * representation, and doesn't provide information about greater than or
171  * less than.
172  */
173 
174 int
175 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
176 {
177 	if (fh1->fh_len == fh2->fh_len)
178 		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
179 
180 	return (1);
181 }
182 
183 int
184 stateid4_cmp(stateid4 *s1, stateid4 *s2)
185 {
186 	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
187 		return (1);
188 	else
189 		return (0);
190 }
191 
192 nfsstat4
193 puterrno4(int error)
194 {
195 	switch (error) {
196 	case 0:
197 		return (NFS4_OK);
198 	case EPERM:
199 		return (NFS4ERR_PERM);
200 	case ENOENT:
201 		return (NFS4ERR_NOENT);
202 	case EINTR:
203 		return (NFS4ERR_IO);
204 	case EIO:
205 		return (NFS4ERR_IO);
206 	case ENXIO:
207 		return (NFS4ERR_NXIO);
208 	case ENOMEM:
209 		return (NFS4ERR_RESOURCE);
210 	case EACCES:
211 		return (NFS4ERR_ACCESS);
212 	case EBUSY:
213 		return (NFS4ERR_IO);
214 	case EEXIST:
215 		return (NFS4ERR_EXIST);
216 	case EXDEV:
217 		return (NFS4ERR_XDEV);
218 	case ENODEV:
219 		return (NFS4ERR_IO);
220 	case ENOTDIR:
221 		return (NFS4ERR_NOTDIR);
222 	case EISDIR:
223 		return (NFS4ERR_ISDIR);
224 	case EINVAL:
225 		return (NFS4ERR_INVAL);
226 	case EMFILE:
227 		return (NFS4ERR_RESOURCE);
228 	case EFBIG:
229 		return (NFS4ERR_FBIG);
230 	case ENOSPC:
231 		return (NFS4ERR_NOSPC);
232 	case EROFS:
233 		return (NFS4ERR_ROFS);
234 	case EMLINK:
235 		return (NFS4ERR_MLINK);
236 	case EDEADLK:
237 		return (NFS4ERR_DEADLOCK);
238 	case ENOLCK:
239 		return (NFS4ERR_DENIED);
240 	case EREMOTE:
241 		return (NFS4ERR_SERVERFAULT);
242 	case ENOTSUP:
243 		return (NFS4ERR_NOTSUPP);
244 	case EDQUOT:
245 		return (NFS4ERR_DQUOT);
246 	case ENAMETOOLONG:
247 		return (NFS4ERR_NAMETOOLONG);
248 	case EOVERFLOW:
249 		return (NFS4ERR_INVAL);
250 	case ENOSYS:
251 		return (NFS4ERR_NOTSUPP);
252 	case ENOTEMPTY:
253 		return (NFS4ERR_NOTEMPTY);
254 	case EOPNOTSUPP:
255 		return (NFS4ERR_NOTSUPP);
256 	case ESTALE:
257 		return (NFS4ERR_STALE);
258 	case EAGAIN:
259 		if (curthread->t_flag & T_WOULDBLOCK) {
260 			curthread->t_flag &= ~T_WOULDBLOCK;
261 			return (NFS4ERR_DELAY);
262 		}
263 		return (NFS4ERR_LOCKED);
264 	default:
265 		return ((enum nfsstat4)error);
266 	}
267 }
268 
269 int
270 geterrno4(enum nfsstat4 status)
271 {
272 	switch (status) {
273 	case NFS4_OK:
274 		return (0);
275 	case NFS4ERR_PERM:
276 		return (EPERM);
277 	case NFS4ERR_NOENT:
278 		return (ENOENT);
279 	case NFS4ERR_IO:
280 		return (EIO);
281 	case NFS4ERR_NXIO:
282 		return (ENXIO);
283 	case NFS4ERR_ACCESS:
284 		return (EACCES);
285 	case NFS4ERR_EXIST:
286 		return (EEXIST);
287 	case NFS4ERR_XDEV:
288 		return (EXDEV);
289 	case NFS4ERR_NOTDIR:
290 		return (ENOTDIR);
291 	case NFS4ERR_ISDIR:
292 		return (EISDIR);
293 	case NFS4ERR_INVAL:
294 		return (EINVAL);
295 	case NFS4ERR_FBIG:
296 		return (EFBIG);
297 	case NFS4ERR_NOSPC:
298 		return (ENOSPC);
299 	case NFS4ERR_ROFS:
300 		return (EROFS);
301 	case NFS4ERR_MLINK:
302 		return (EMLINK);
303 	case NFS4ERR_NAMETOOLONG:
304 		return (ENAMETOOLONG);
305 	case NFS4ERR_NOTEMPTY:
306 		return (ENOTEMPTY);
307 	case NFS4ERR_DQUOT:
308 		return (EDQUOT);
309 	case NFS4ERR_STALE:
310 		return (ESTALE);
311 	case NFS4ERR_BADHANDLE:
312 		return (ESTALE);
313 	case NFS4ERR_BAD_COOKIE:
314 		return (EINVAL);
315 	case NFS4ERR_NOTSUPP:
316 		return (EOPNOTSUPP);
317 	case NFS4ERR_TOOSMALL:
318 		return (EINVAL);
319 	case NFS4ERR_SERVERFAULT:
320 		return (EIO);
321 	case NFS4ERR_BADTYPE:
322 		return (EINVAL);
323 	case NFS4ERR_DELAY:
324 		return (ENXIO);
325 	case NFS4ERR_SAME:
326 		return (EPROTO);
327 	case NFS4ERR_DENIED:
328 		return (ENOLCK);
329 	case NFS4ERR_EXPIRED:
330 		return (EPROTO);
331 	case NFS4ERR_LOCKED:
332 		return (EACCES);
333 	case NFS4ERR_GRACE:
334 		return (EAGAIN);
335 	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
336 		return (ESTALE);
337 	case NFS4ERR_SHARE_DENIED:
338 		return (EACCES);
339 	case NFS4ERR_WRONGSEC:
340 		return (EPERM);
341 	case NFS4ERR_CLID_INUSE:
342 		return (EAGAIN);
343 	case NFS4ERR_RESOURCE:
344 		return (EAGAIN);
345 	case NFS4ERR_MOVED:
346 		return (EPROTO);
347 	case NFS4ERR_NOFILEHANDLE:
348 		return (EIO);
349 	case NFS4ERR_MINOR_VERS_MISMATCH:
350 		return (ENOTSUP);
351 	case NFS4ERR_STALE_CLIENTID:
352 		return (EIO);
353 	case NFS4ERR_STALE_STATEID:
354 		return (EIO);
355 	case NFS4ERR_OLD_STATEID:
356 		return (EIO);
357 	case NFS4ERR_BAD_STATEID:
358 		return (EIO);
359 	case NFS4ERR_BAD_SEQID:
360 		return (EIO);
361 	case NFS4ERR_NOT_SAME:
362 		return (EPROTO);
363 	case NFS4ERR_LOCK_RANGE:
364 		return (EPROTO);
365 	case NFS4ERR_SYMLINK:
366 		return (EPROTO);
367 	case NFS4ERR_RESTOREFH:
368 		return (EPROTO);
369 	case NFS4ERR_LEASE_MOVED:
370 		return (EPROTO);
371 	case NFS4ERR_ATTRNOTSUPP:
372 		return (ENOTSUP);
373 	case NFS4ERR_NO_GRACE:
374 		return (EPROTO);
375 	case NFS4ERR_RECLAIM_BAD:
376 		return (EPROTO);
377 	case NFS4ERR_RECLAIM_CONFLICT:
378 		return (EPROTO);
379 	case NFS4ERR_BADXDR:
380 		return (EINVAL);
381 	case NFS4ERR_LOCKS_HELD:
382 		return (EIO);
383 	case NFS4ERR_OPENMODE:
384 		return (EACCES);
385 	case NFS4ERR_BADOWNER:
386 		/*
387 		 * Client and server are in different DNS domains
388 		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
389 		 * doesn't match.  No good answer here.  Return
390 		 * EACCESS, which translates to "permission denied".
391 		 */
392 		return (EACCES);
393 	case NFS4ERR_BADCHAR:
394 		return (EINVAL);
395 	case NFS4ERR_BADNAME:
396 		return (EINVAL);
397 	case NFS4ERR_BAD_RANGE:
398 		return (EIO);
399 	case NFS4ERR_LOCK_NOTSUPP:
400 		return (ENOTSUP);
401 	case NFS4ERR_OP_ILLEGAL:
402 		return (EINVAL);
403 	case NFS4ERR_DEADLOCK:
404 		return (EDEADLK);
405 	case NFS4ERR_FILE_OPEN:
406 		return (EACCES);
407 	case NFS4ERR_ADMIN_REVOKED:
408 		return (EPROTO);
409 	case NFS4ERR_CB_PATH_DOWN:
410 		return (EPROTO);
411 	default:
412 #ifdef DEBUG
413 		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
414 		    status);
415 #endif
416 		return ((int)status);
417 	}
418 }
419 
420 void
421 nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
422 {
423 	nfs4_server_t *server;
424 
425 	/*
426 	 * Return if already printed/queued a msg
427 	 * for this mount point.
428 	 */
429 	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
430 		return;
431 	/*
432 	 * Happens once per client <-> server pair.
433 	 */
434 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
435 		mi->mi_flags & MI4_INT))
436 		return;
437 
438 	server = find_nfs4_server(mi);
439 	if (server == NULL) {
440 		nfs_rw_exit(&mi->mi_recovlock);
441 		return;
442 	}
443 
444 	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
445 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
446 		    "!NFSMAPID_DOMAIN does not match"
447 		    " the server: %s domain.\n"
448 		    "Please check configuration",
449 		    mi->mi_curr_serv->sv_hostname);
450 		server->s_flags |= N4S_BADOWNER_DEBUG;
451 	}
452 	mutex_exit(&server->s_lock);
453 	nfs4_server_rele(server);
454 	nfs_rw_exit(&mi->mi_recovlock);
455 
456 	/*
457 	 * Happens once per mntinfo4_t.
458 	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
459 	 * queue this in the mesg queue for this mount_info. This message
460 	 * is not printed, meaning its absent from id_to_dump_solo_fact()
461 	 * but its there for inspection if the queue is ever dumped/inspected.
462 	 */
463 	mutex_enter(&mi->mi_lock);
464 	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
465 		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
466 		    FALSE, NULL, 0, NULL);
467 		mi->mi_flags |= MI4_BADOWNER_DEBUG;
468 	}
469 	mutex_exit(&mi->mi_lock);
470 }
471 
472 
473 
474 int
475 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
476 {
477 	int64_t sec;
478 	int32_t nsec;
479 
480 	/*
481 	 * Here check that the nfsv4 time is valid for the system.
482 	 * nfsv4 time value is a signed 64-bit, and the system time
483 	 * may be either int64_t or int32_t (depends on the kernel),
484 	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
485 	 */
486 #ifndef _LP64
487 	if (! NFS4_TIME_OK(ntime->seconds)) {
488 		return (EOVERFLOW);
489 	}
490 #endif
491 
492 	/* Invalid to specify 1 billion (or more) nsecs */
493 	if (ntime->nseconds >= 1000000000)
494 		return (EINVAL);
495 
496 	if (ntime->seconds < 0) {
497 		sec = ntime->seconds + 1;
498 		nsec = -1000000000 + ntime->nseconds;
499 	} else {
500 		sec = ntime->seconds;
501 		nsec = ntime->nseconds;
502 	}
503 
504 	vatime->tv_sec = sec;
505 	vatime->tv_nsec = nsec;
506 
507 	return (0);
508 }
509 
510 int
511 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
512 {
513 	int64_t sec;
514 	uint32_t nsec;
515 
516 	/*
517 	 * nfsv4 time value is a signed 64-bit, and the system time
518 	 * may be either int64_t or int32_t (depends on the kernel),
519 	 * so all system time values will fit.
520 	 */
521 	if (vatime->tv_nsec >= 0) {
522 		sec = vatime->tv_sec;
523 		nsec = vatime->tv_nsec;
524 	} else {
525 		sec = vatime->tv_sec - 1;
526 		nsec = 1000000000 + vatime->tv_nsec;
527 	}
528 	ntime->seconds = sec;
529 	ntime->nseconds = nsec;
530 
531 	return (0);
532 }
533 
534 /*
535  * Converts a utf8 string to a valid null terminated filename string.
536  *
537  * XXX - Not actually translating the UTF-8 string as per RFC 2279.
538  *	 For now, just validate that the UTF-8 string off the wire
539  *	 does not have characters that will freak out UFS, and leave
540  *	 it at that.
541  */
542 char *
543 utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
544 {
545 	ASSERT(lenp != NULL);
546 
547 	if (u8s == NULL || u8s->utf8string_len <= 0 ||
548 					u8s->utf8string_val == NULL)
549 		return (NULL);
550 
551 	/*
552 	 * Check for obvious illegal filename chars
553 	 */
554 	if (utf8_strchr(u8s, '/') != NULL) {
555 #ifdef DEBUG
556 		if (nfs4_utf8_debug) {
557 			char *path;
558 			int len = u8s->utf8string_len;
559 
560 			path = kmem_alloc(len + 1, KM_SLEEP);
561 			bcopy(u8s->utf8string_val, path, len);
562 			path[len] = '\0';
563 
564 			zcmn_err(getzoneid(), CE_WARN,
565 			    "Invalid UTF-8 filename: %s", path);
566 
567 			kmem_free(path, len + 1);
568 		}
569 #endif
570 		return (NULL);
571 	}
572 
573 	return (utf8_to_str(u8s, lenp, s));
574 }
575 
576 /*
577  * Converts a utf8 string to a C string.
578  * kmem_allocs a new string if not supplied
579  */
580 char *
581 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
582 {
583 	char	*sp;
584 	char	*u8p;
585 	int	len;
586 	int	 i;
587 
588 	ASSERT(lenp != NULL);
589 
590 	if (str == NULL)
591 		return (NULL);
592 
593 	u8p = str->utf8string_val;
594 	len = str->utf8string_len;
595 	if (len <= 0 || u8p == NULL) {
596 		if (s)
597 			*s = '\0';
598 		return (NULL);
599 	}
600 
601 	sp = s;
602 	if (sp == NULL)
603 		sp = kmem_alloc(len + 1, KM_SLEEP);
604 
605 	/*
606 	 * At least check for embedded nulls
607 	 */
608 	for (i = 0; i < len; i++) {
609 		sp[i] = u8p[i];
610 		if (u8p[i] == '\0') {
611 #ifdef	DEBUG
612 			zcmn_err(getzoneid(), CE_WARN,
613 			    "Embedded NULL in UTF-8 string");
614 #endif
615 			if (s == NULL)
616 				kmem_free(sp, len + 1);
617 			return (NULL);
618 		}
619 	}
620 	sp[len] = '\0';
621 	*lenp = len + 1;
622 
623 	return (sp);
624 }
625 
626 /*
627  * str_to_utf8 - converts a null-terminated C string to a utf8 string
628  */
629 utf8string *
630 str_to_utf8(char *nm, utf8string *str)
631 {
632 	int len;
633 
634 	if (str == NULL)
635 		return (NULL);
636 
637 	if (nm == NULL || *nm == '\0') {
638 		str->utf8string_len = 0;
639 		str->utf8string_val = NULL;
640 	}
641 
642 	len = strlen(nm);
643 
644 	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
645 	str->utf8string_len = len;
646 	bcopy(nm, str->utf8string_val, len);
647 
648 	return (str);
649 }
650 
651 utf8string *
652 utf8_copy(utf8string *src, utf8string *dest)
653 {
654 	if (src == NULL)
655 		return (NULL);
656 	if (dest == NULL)
657 		return (NULL);
658 
659 	if (src->utf8string_len > 0) {
660 		dest->utf8string_val = kmem_alloc(src->utf8string_len,
661 			KM_SLEEP);
662 		bcopy(src->utf8string_val, dest->utf8string_val,
663 			src->utf8string_len);
664 		dest->utf8string_len = src->utf8string_len;
665 	} else {
666 		dest->utf8string_val = NULL;
667 		dest->utf8string_len = 0;
668 	}
669 
670 	return (dest);
671 }
672 
673 int
674 utf8_compare(const utf8string *a, const utf8string *b)
675 {
676 	int mlen, cmp;
677 	int alen, blen;
678 	char *aval, *bval;
679 
680 	if ((a == NULL) && (b == NULL))
681 		return (0);
682 	else if (a == NULL)
683 		return (-1);
684 	else if (b == NULL)
685 		return (1);
686 
687 	alen = a->utf8string_len;
688 	blen = b->utf8string_len;
689 	aval = a->utf8string_val;
690 	bval = b->utf8string_val;
691 
692 	if (((alen == 0) || (aval == NULL)) &&
693 	    ((blen == 0) || (bval == NULL)))
694 		return (0);
695 	else if ((alen == 0) || (aval == NULL))
696 		return (-1);
697 	else if ((blen == 0) || (bval == NULL))
698 		return (1);
699 
700 	mlen = MIN(alen, blen);
701 	cmp = strncmp(aval, bval, mlen);
702 
703 	if ((cmp == 0) && (alen == blen))
704 		return (0);
705 	else if ((cmp == 0) && (alen < blen))
706 		return (-1);
707 	else if (cmp == 0)
708 		return (1);
709 	else if (cmp < 0)
710 		return (-1);
711 	return (1);
712 }
713 
714 /*
715  * utf8_dir_verify - checks that the utf8 string is valid
716  */
717 int
718 utf8_dir_verify(utf8string *str)
719 {
720 	char *nm;
721 	int len;
722 
723 	if (str == NULL)
724 		return (0);
725 
726 	nm = str->utf8string_val;
727 	len = str->utf8string_len;
728 	if (nm == NULL || len == 0) {
729 		return (0);
730 	}
731 
732 	if (len == 1 && nm[0] == '.')
733 		return (0);
734 	if (len == 2 && nm[0] == '.' && nm[1] == '.')
735 		return (0);
736 
737 	if (utf8_strchr(str, '/') != NULL)
738 		return (0);
739 
740 	if (utf8_strchr(str, '\0') != NULL)
741 		return (0);
742 
743 	return (1);
744 }
745 
746 /*
747  * from rpcsec module (common/rpcsec)
748  */
/*
 * sec_clnt_geth() is called by authget() to obtain an AUTH handle for a
 * client handle; authget() treats a zero return as success.
 * sec_clnt_freeh() is called by clfree4() to release a handle's cl_auth.
 * sec_clnt_freeinfo() is not used in this part of the file.
 */
749 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
750 extern void sec_clnt_freeh(AUTH *);
751 extern void sec_clnt_freeinfo(struct sec_data *);
752 
753 /*
754  * authget() gets an auth handle based on the security
755  * information from the servinfo in mountinfo.
756  * The auth handle is stored in ch_client->cl_auth.
757  *
758  * First security flavor of choice is to use sv_secdata
759  * which is initiated by the client. If that fails, get
760  * secinfo from the server and then select one from the
761  * server secinfo list .
762  *
763  * For RPCSEC_GSS flavor, upon success, a secure context is
764  * established between client and server.
765  */
766 int
767 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
768 {
769 	int error, i;
770 
771 	/*
772 	 * SV4_TRYSECINFO indicates to try the secinfo list from
773 	 * sv_secinfo until a successful one is reached. Point
774 	 * sv_currsec to the selected security mechanism for
775 	 * later sessions.
776 	 */
777 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
778 	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
779 		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
780 								i++) {
781 			if (!(error = sec_clnt_geth(ch_client,
782 				&svp->sv_secinfo->sdata[i],
783 				cr, &ch_client->cl_auth))) {
784 
785 				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
786 				svp->sv_secinfo->index = i;
787 				/* done */
788 				svp->sv_flags &= ~SV4_TRYSECINFO;
789 				break;
790 			}
791 
792 			/*
793 			 * Allow the caller retry with the security flavor
794 			 * pointed by svp->sv_secinfo->index when
795 			 * ETIMEDOUT/ECONNRESET occurs.
796 			 */
797 			if (error == ETIMEDOUT || error == ECONNRESET) {
798 				svp->sv_secinfo->index = i;
799 				break;
800 			}
801 		}
802 	} else {
803 		/* sv_currsec points to one of the entries in sv_secinfo */
804 		if (svp->sv_currsec) {
805 			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
806 				&ch_client->cl_auth);
807 		} else {
808 			/* If it's null, use sv_secdata. */
809 			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
810 				&ch_client->cl_auth);
811 		}
812 	}
813 	nfs_rw_exit(&svp->sv_lock);
814 
815 	return (error);
816 }
817 
818 /*
819  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
820  */
821 int
822 clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
823     struct chtab **chp, struct nfs4_clnt *nfscl)
824 {
825 	struct chhead *ch, *newch;
826 	struct chhead **plistp;
827 	struct chtab *cp;
828 	int error;
829 	k_sigset_t smask;
830 
831 	if (newcl == NULL || chp == NULL || ci == NULL)
832 		return (EINVAL);
833 
834 	*newcl = NULL;
835 	*chp = NULL;
836 
837 	/*
838 	 * Find an unused handle or create one
839 	 */
840 	newch = NULL;
841 	nfscl->nfscl_stat.clgets.value.ui64++;
842 top:
843 	/*
844 	 * Find the correct entry in the cache to check for free
845 	 * client handles.  The search is based on the RPC program
846 	 * number, program version number, dev_t for the transport
847 	 * device, and the protocol family.
848 	 */
849 	mutex_enter(&nfscl->nfscl_chtable4_lock);
850 	plistp = &nfscl->nfscl_chtable4;
851 	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
852 		if (ch->ch_prog == ci->cl_prog &&
853 		    ch->ch_vers == ci->cl_vers &&
854 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
855 		    (strcmp(ch->ch_protofmly,
856 			svp->sv_knconf->knc_protofmly) == 0))
857 			break;
858 		plistp = &ch->ch_next;
859 	}
860 
861 	/*
862 	 * If we didn't find a cache entry for this quadruple, then
863 	 * create one.  If we don't have one already preallocated,
864 	 * then drop the cache lock, create one, and then start over.
865 	 * If we did have a preallocated entry, then just add it to
866 	 * the front of the list.
867 	 */
868 	if (ch == NULL) {
869 		if (newch == NULL) {
870 			mutex_exit(&nfscl->nfscl_chtable4_lock);
871 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
872 			newch->ch_timesused = 0;
873 			newch->ch_prog = ci->cl_prog;
874 			newch->ch_vers = ci->cl_vers;
875 			newch->ch_dev = svp->sv_knconf->knc_rdev;
876 			newch->ch_protofmly = kmem_alloc(
877 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
878 			    KM_SLEEP);
879 			(void) strcpy(newch->ch_protofmly,
880 			    svp->sv_knconf->knc_protofmly);
881 			newch->ch_list = NULL;
882 			goto top;
883 		}
884 		ch = newch;
885 		newch = NULL;
886 		ch->ch_next = nfscl->nfscl_chtable4;
887 		nfscl->nfscl_chtable4 = ch;
888 	/*
889 	 * We found a cache entry, but if it isn't on the front of the
890 	 * list, then move it to the front of the list to try to take
891 	 * advantage of locality of operations.
892 	 */
893 	} else if (ch != nfscl->nfscl_chtable4) {
894 		*plistp = ch->ch_next;
895 		ch->ch_next = nfscl->nfscl_chtable4;
896 		nfscl->nfscl_chtable4 = ch;
897 	}
898 
899 	/*
900 	 * If there was a free client handle cached, then remove it
901 	 * from the list, init it, and use it.
902 	 */
903 	if (ch->ch_list != NULL) {
904 		cp = ch->ch_list;
905 		ch->ch_list = cp->ch_list;
906 		mutex_exit(&nfscl->nfscl_chtable4_lock);
907 		if (newch != NULL) {
908 			kmem_free(newch->ch_protofmly,
909 			    strlen(newch->ch_protofmly) + 1);
910 			kmem_free(newch, sizeof (*newch));
911 		}
912 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
913 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
914 
915 		/*
916 		 * Get an auth handle.
917 		 */
918 		error = authget(svp, cp->ch_client, cr);
919 		if (error || cp->ch_client->cl_auth == NULL) {
920 			CLNT_DESTROY(cp->ch_client);
921 			kmem_cache_free(chtab4_cache, cp);
922 			return ((error != 0) ? error : EINTR);
923 		}
924 		ch->ch_timesused++;
925 		*newcl = cp->ch_client;
926 		*chp = cp;
927 		return (0);
928 	}
929 
930 	/*
931 	 * There weren't any free client handles which fit, so allocate
932 	 * a new one and use that.
933 	 */
934 #ifdef DEBUG
935 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
936 #endif
937 	mutex_exit(&nfscl->nfscl_chtable4_lock);
938 
939 	nfscl->nfscl_stat.cltoomany.value.ui64++;
940 	if (newch != NULL) {
941 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
942 		kmem_free(newch, sizeof (*newch));
943 	}
944 
945 	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
946 	cp->ch_head = ch;
947 
948 	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
949 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
950 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
951 	sigunintr(&smask);
952 
953 	if (error != 0) {
954 		kmem_cache_free(chtab4_cache, cp);
955 #ifdef DEBUG
956 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
957 #endif
958 		/*
959 		 * Warning is unnecessary if error is EINTR.
960 		 */
961 		if (error != EINTR) {
962 			nfs_cmn_err(error, CE_WARN,
963 			    "clget: couldn't create handle: %m\n");
964 		}
965 		return (error);
966 	}
967 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
968 	auth_destroy(cp->ch_client->cl_auth);
969 
970 	/*
971 	 * Get an auth handle.
972 	 */
973 	error = authget(svp, cp->ch_client, cr);
974 	if (error || cp->ch_client->cl_auth == NULL) {
975 		CLNT_DESTROY(cp->ch_client);
976 		kmem_cache_free(chtab4_cache, cp);
977 #ifdef DEBUG
978 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
979 #endif
980 		return ((error != 0) ? error : EINTR);
981 	}
982 	ch->ch_timesused++;
983 	*newcl = cp->ch_client;
984 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
985 	*chp = cp;
986 	return (0);
987 }
988 
989 static int
990 nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
991     struct chtab **chp, struct nfs4_clnt *nfscl)
992 {
993 	clinfo_t ci;
994 	bool_t is_recov;
995 	int firstcall, error = 0;
996 
997 	/*
998 	 * Set read buffer size to rsize
999 	 * and add room for RPC headers.
1000 	 */
1001 	ci.cl_readsize = mi->mi_tsize;
1002 	if (ci.cl_readsize != 0)
1003 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
1004 
1005 	/*
1006 	 * If soft mount and server is down just try once.
1007 	 * meaning: do not retransmit.
1008 	 */
1009 	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
1010 		ci.cl_retrans = 0;
1011 	else
1012 		ci.cl_retrans = mi->mi_retrans;
1013 
1014 	ci.cl_prog = mi->mi_prog;
1015 	ci.cl_vers = mi->mi_vers;
1016 	ci.cl_flags = mi->mi_flags;
1017 
1018 	/*
1019 	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1020 	 * security flavor, the client tries to establish a security context
1021 	 * by contacting the server. If the connection is timed out or reset,
1022 	 * e.g. server reboot, we will try again.
1023 	 */
1024 	is_recov = (curthread == mi->mi_recovthread);
1025 	firstcall = 1;
1026 
1027 	do {
1028 		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
1029 
1030 		if (error == 0)
1031 			break;
1032 
1033 		/*
1034 		 * For forced unmount and zone shutdown, bail out but
1035 		 * let the recovery thread do one more transmission.
1036 		 */
1037 		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
1038 		    (!is_recov || !firstcall)) {
1039 			error = EIO;
1040 			break;
1041 		}
1042 
1043 		/* do not retry for soft mount */
1044 		if (!(mi->mi_flags & MI4_HARD))
1045 			break;
1046 
1047 		/* let the caller deal with the failover case */
1048 		if (FAILOVER_MOUNT4(mi))
1049 			break;
1050 
1051 		firstcall = 0;
1052 
1053 	} while (error == ETIMEDOUT || error == ECONNRESET);
1054 
1055 	return (error);
1056 }
1057 
1058 void
1059 clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
1060 {
1061 	if (cl->cl_auth != NULL) {
1062 		sec_clnt_freeh(cl->cl_auth);
1063 		cl->cl_auth = NULL;
1064 	}
1065 
1066 	/*
1067 	 * Timestamp this cache entry so that we know when it was last
1068 	 * used.
1069 	 */
1070 	cp->ch_freed = gethrestime_sec();
1071 
1072 	/*
1073 	 * Add the free client handle to the front of the list.
1074 	 * This way, the list will be sorted in youngest to oldest
1075 	 * order.
1076 	 */
1077 	mutex_enter(&nfscl->nfscl_chtable4_lock);
1078 	cp->ch_list = cp->ch_head->ch_list;
1079 	cp->ch_head->ch_list = cp;
1080 	mutex_exit(&nfscl->nfscl_chtable4_lock);
1081 }
1082 
1083 #define	CL_HOLDTIME	60	/* time to hold client handles */
1084 
1085 static void
1086 clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
1087 {
1088 	struct chhead *ch;
1089 	struct chtab *cp;	/* list of objects that can be reclaimed */
1090 	struct chtab *cpe;
1091 	struct chtab *cpl;
1092 	struct chtab **cpp;
1093 #ifdef DEBUG
1094 	int n = 0;
1095 	clstat4_debug.clreclaim.value.ui64++;
1096 #endif
1097 
1098 	/*
1099 	 * Need to reclaim some memory, so step through the cache
1100 	 * looking through the lists for entries which can be freed.
1101 	 */
1102 	cp = NULL;
1103 
1104 	mutex_enter(&nfscl->nfscl_chtable4_lock);
1105 
1106 	/*
1107 	 * Here we step through each non-NULL quadruple and start to
1108 	 * construct the reclaim list pointed to by cp.  Note that
1109 	 * cp will contain all eligible chtab entries.  When this traversal
1110 	 * completes, chtab entries from the last quadruple will be at the
1111 	 * front of cp and entries from previously inspected quadruples have
1112 	 * been appended to the rear of cp.
1113 	 */
1114 	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
1115 		if (ch->ch_list == NULL)
1116 			continue;
1117 		/*
1118 		 * Search each list for entries older then
1119 		 * cl_holdtime seconds.  The lists are maintained
1120 		 * in youngest to oldest order so that when the
1121 		 * first entry is found which is old enough, then
1122 		 * all of the rest of the entries on the list will
1123 		 * be old enough as well.
1124 		 */
1125 		cpl = ch->ch_list;
1126 		cpp = &ch->ch_list;
1127 		while (cpl != NULL &&
1128 			cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
1129 			cpp = &cpl->ch_list;
1130 			cpl = cpl->ch_list;
1131 		}
1132 		if (cpl != NULL) {
1133 			*cpp = NULL;
1134 			if (cp != NULL) {
1135 				cpe = cpl;
1136 				while (cpe->ch_list != NULL)
1137 					cpe = cpe->ch_list;
1138 				cpe->ch_list = cp;
1139 			}
1140 			cp = cpl;
1141 		}
1142 	}
1143 
1144 	mutex_exit(&nfscl->nfscl_chtable4_lock);
1145 
1146 	/*
1147 	 * If cp is empty, then there is nothing to reclaim here.
1148 	 */
1149 	if (cp == NULL)
1150 		return;
1151 
1152 	/*
1153 	 * Step through the list of entries to free, destroying each client
1154 	 * handle and kmem_free'ing the memory for each entry.
1155 	 */
1156 	while (cp != NULL) {
1157 #ifdef DEBUG
1158 		n++;
1159 #endif
1160 		CLNT_DESTROY(cp->ch_client);
1161 		cpl = cp->ch_list;
1162 		kmem_cache_free(chtab4_cache, cp);
1163 		cp = cpl;
1164 	}
1165 
1166 #ifdef DEBUG
1167 	/*
1168 	 * Update clalloc so that nfsstat shows the current number
1169 	 * of allocated client handles.
1170 	 */
1171 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
1172 #endif
1173 }
1174 
1175 /* ARGSUSED */
1176 static void
1177 clreclaim4(void *all)
1178 {
1179 	struct nfs4_clnt *nfscl;
1180 
1181 	/*
1182 	 * The system is low on memory; go through and try to reclaim some from
1183 	 * every zone on the system.
1184 	 */
1185 	mutex_enter(&nfs4_clnt_list_lock);
1186 	nfscl = list_head(&nfs4_clnt_list);
1187 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1188 		clreclaim4_zone(nfscl, CL_HOLDTIME);
1189 	mutex_exit(&nfs4_clnt_list_lock);
1190 }
1191 
/*
 * Minimum time-out values, indexed by call type.
 * The units are "eighths" of a second, which avoids multiplies.
 */
static unsigned int minimum_timeo[] = { 6, 7, 10 };

#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Retransmission timeout back-off.  MAXTIMO is expressed in clock ticks
 * (hz ticks per second), i.e. it is a 20 second ceiling.
 *
 * dobackoff(tim) doubles tim, clamping the result to MAXTIMO.
 * backoff(tim) applies dobackoff() only while tim is still below MAXTIMO;
 * a value already at or above the ceiling is returned unchanged.
 */
#define	MAXTIMO	(20*hz)
#define	dobackoff(tim)	((((tim) << 1) <= MAXTIMO) ? ((tim) << 1) : MAXTIMO)
#define	backoff(tim)	(((tim) >= MAXTIMO) ? (tim) : dobackoff(tim))
1208 
1209 static int
1210 nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1211     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1212     enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1213 {
1214 	CLIENT *client;
1215 	struct chtab *ch;
1216 	cred_t *cr = icr;
1217 	struct rpc_err rpcerr;
1218 	enum clnt_stat status;
1219 	int error;
1220 	struct timeval wait;
1221 	int timeo;		/* in units of hz */
1222 	bool_t tryagain, is_recov;
1223 	bool_t cred_cloned = FALSE;
1224 	k_sigset_t smask;
1225 	servinfo4_t *svp;
1226 #ifdef DEBUG
1227 	char *bufp;
1228 #endif
1229 	int firstcall;
1230 
1231 	rpcerr.re_status = RPC_SUCCESS;
1232 
1233 	/*
1234 	 * If we know that we are rebooting then let's
1235 	 * not bother with doing any over the wireness.
1236 	 */
1237 	mutex_enter(&mi->mi_lock);
1238 	if (mi->mi_flags & MI4_SHUTDOWN) {
1239 		mutex_exit(&mi->mi_lock);
1240 		return (EIO);
1241 	}
1242 	mutex_exit(&mi->mi_lock);
1243 
1244 	/* For TSOL, use a new cred which has net_mac_aware flag */
1245 	if (!cred_cloned && is_system_labeled()) {
1246 		cred_cloned = TRUE;
1247 		cr = crdup(icr);
1248 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1249 	}
1250 
1251 	/*
1252 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1253 	 * are guaranteed to reprocess the retry as a new request.
1254 	 */
1255 	svp = mi->mi_curr_serv;
1256 	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1257 	if (rpcerr.re_errno != 0)
1258 		return (rpcerr.re_errno);
1259 
1260 	timeo = (mi->mi_timeo * hz) / 10;
1261 
1262 	/*
1263 	 * If hard mounted fs, retry call forever unless hard error
1264 	 * occurs.
1265 	 *
1266 	 * For forced unmount, let the recovery thread through but return
1267 	 * an error for all others.  This is so that user processes can
1268 	 * exit quickly.  The recovery thread bails out after one
1269 	 * transmission so that it can tell if it needs to continue.
1270 	 *
1271 	 * For zone shutdown, behave as above to encourage quick
1272 	 * process exit, but also fail quickly when servers have
1273 	 * timed out before and reduce the timeouts.
1274 	 */
1275 	is_recov = (curthread == mi->mi_recovthread);
1276 	firstcall = 1;
1277 	do {
1278 		tryagain = FALSE;
1279 
1280 		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1281 			"nfs4_rfscall: vfs_flag=0x%x, %s",
1282 			mi->mi_vfsp->vfs_flag,
1283 			is_recov ? "recov thread" : "not recov thread"));
1284 
1285 		/*
1286 		 * It's possible while we're retrying the admin
1287 		 * decided to reboot.
1288 		 */
1289 		mutex_enter(&mi->mi_lock);
1290 		if (mi->mi_flags & MI4_SHUTDOWN) {
1291 			mutex_exit(&mi->mi_lock);
1292 			clfree4(client, ch, nfscl);
1293 			if (cred_cloned)
1294 				crfree(cr);
1295 			return (EIO);
1296 		}
1297 		mutex_exit(&mi->mi_lock);
1298 
1299 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1300 		    (!is_recov || !firstcall)) {
1301 			clfree4(client, ch, nfscl);
1302 			if (cred_cloned)
1303 				crfree(cr);
1304 			return (EIO);
1305 		}
1306 
1307 		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1308 			mutex_enter(&mi->mi_lock);
1309 			if ((mi->mi_flags & MI4_TIMEDOUT) ||
1310 			    !is_recov || !firstcall) {
1311 				mutex_exit(&mi->mi_lock);
1312 				clfree4(client, ch, nfscl);
1313 				if (cred_cloned)
1314 					crfree(cr);
1315 				return (EIO);
1316 			}
1317 			mutex_exit(&mi->mi_lock);
1318 			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1319 		}
1320 
1321 		firstcall = 0;
1322 		TICK_TO_TIMEVAL(timeo, &wait);
1323 
1324 		/*
1325 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1326 		 * and SIGTERM. (Preserving the existing masks).
1327 		 * Mask out SIGINT if mount option nointr is specified.
1328 		 */
1329 		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1330 		if (!(mi->mi_flags & MI4_INT))
1331 			client->cl_nosignal = TRUE;
1332 
1333 		/*
1334 		 * If there is a current signal, then don't bother
1335 		 * even trying to send out the request because we
1336 		 * won't be able to block waiting for the response.
1337 		 * Simply assume RPC_INTR and get on with it.
1338 		 */
1339 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1340 			status = RPC_INTR;
1341 		else {
1342 			status = CLNT_CALL(client, which, xdrargs, argsp,
1343 			    xdrres, resp, wait);
1344 		}
1345 
1346 		if (!(mi->mi_flags & MI4_INT))
1347 			client->cl_nosignal = FALSE;
1348 		/*
1349 		 * restore original signal mask
1350 		 */
1351 		sigunintr(&smask);
1352 
1353 		switch (status) {
1354 		case RPC_SUCCESS:
1355 			break;
1356 
1357 		case RPC_INTR:
1358 			/*
1359 			 * There is no way to recover from this error,
1360 			 * even if mount option nointr is specified.
1361 			 * SIGKILL, for example, cannot be blocked.
1362 			 */
1363 			rpcerr.re_status = RPC_INTR;
1364 			rpcerr.re_errno = EINTR;
1365 			break;
1366 
1367 		case RPC_UDERROR:
1368 			/*
1369 			 * If the NFS server is local (vold) and
1370 			 * it goes away then we get RPC_UDERROR.
1371 			 * This is a retryable error, so we would
1372 			 * loop, so check to see if the specific
1373 			 * error was ECONNRESET, indicating that
1374 			 * target did not exist at all.  If so,
1375 			 * return with RPC_PROGUNAVAIL and
1376 			 * ECONNRESET to indicate why.
1377 			 */
1378 			CLNT_GETERR(client, &rpcerr);
1379 			if (rpcerr.re_errno == ECONNRESET) {
1380 				rpcerr.re_status = RPC_PROGUNAVAIL;
1381 				rpcerr.re_errno = ECONNRESET;
1382 				break;
1383 			}
1384 			/*FALLTHROUGH*/
1385 
1386 		default:		/* probably RPC_TIMEDOUT */
1387 
1388 			if (IS_UNRECOVERABLE_RPC(status))
1389 				break;
1390 
1391 			/*
1392 			 * increment server not responding count
1393 			 */
1394 			mutex_enter(&mi->mi_lock);
1395 			mi->mi_noresponse++;
1396 			mutex_exit(&mi->mi_lock);
1397 #ifdef DEBUG
1398 			nfscl->nfscl_stat.noresponse.value.ui64++;
1399 #endif
1400 			/*
1401 			 * On zone shutdown, mark server dead and move on.
1402 			 */
1403 			if (zone_status_get(curproc->p_zone) >=
1404 			    ZONE_IS_SHUTTING_DOWN) {
1405 				mutex_enter(&mi->mi_lock);
1406 				mi->mi_flags |= MI4_TIMEDOUT;
1407 				mutex_exit(&mi->mi_lock);
1408 				clfree4(client, ch, nfscl);
1409 				if (cred_cloned)
1410 					crfree(cr);
1411 				return (EIO);
1412 			}
1413 
1414 			/*
1415 			 * NFS client failover support:
1416 			 * return and let the caller take care of
1417 			 * failover.  We only return for failover mounts
1418 			 * because otherwise we want the "not responding"
1419 			 * message, the timer updates, etc.
1420 			 */
1421 			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1422 			    (error = try_failover(status)) != 0) {
1423 				clfree4(client, ch, nfscl);
1424 				if (cred_cloned)
1425 					crfree(cr);
1426 				*rpc_statusp = status;
1427 				return (error);
1428 			}
1429 
1430 			if (flags & RFSCALL_SOFT)
1431 				break;
1432 
1433 			tryagain = TRUE;
1434 
1435 			/*
1436 			 * The call is in progress (over COTS).
1437 			 * Try the CLNT_CALL again, but don't
1438 			 * print a noisy error message.
1439 			 */
1440 			if (status == RPC_INPROGRESS)
1441 				break;
1442 
1443 			timeo = backoff(timeo);
1444 			mutex_enter(&mi->mi_lock);
1445 			if (!(mi->mi_flags & MI4_PRINTED)) {
1446 				mi->mi_flags |= MI4_PRINTED;
1447 				mutex_exit(&mi->mi_lock);
1448 				nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi, 0, 0, 0,
1449 				    FALSE, NULL, 0, NULL);
1450 			} else
1451 				mutex_exit(&mi->mi_lock);
1452 
1453 			if (*doqueue && curproc->p_sessp->s_vp != NULL) {
1454 				*doqueue = 0;
1455 				if (!(mi->mi_flags & MI4_NOPRINT))
1456 					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1457 					    0, 0, 0, FALSE, NULL, 0, NULL);
1458 			}
1459 		}
1460 	} while (tryagain);
1461 
1462 	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1463 			int, rpcerr.re_errno);
1464 
1465 	if (status != RPC_SUCCESS) {
1466 		zoneid_t zoneid = mi->mi_zone->zone_id;
1467 
1468 		/*
1469 		 * Let soft mounts use the timed out message.
1470 		 */
1471 		if (status == RPC_INPROGRESS)
1472 			status = RPC_TIMEDOUT;
1473 		nfscl->nfscl_stat.badcalls.value.ui64++;
1474 		if (status != RPC_INTR) {
1475 			mutex_enter(&mi->mi_lock);
1476 			mi->mi_flags |= MI4_DOWN;
1477 			mutex_exit(&mi->mi_lock);
1478 			CLNT_GETERR(client, &rpcerr);
1479 #ifdef DEBUG
1480 			bufp = clnt_sperror(client, svp->sv_hostname);
1481 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1482 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1483 			if (curproc->p_sessp->s_vp != NULL) {
1484 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1485 					uprintf("NFS%d %s failed for %s\n",
1486 					    mi->mi_vers, mi->mi_rfsnames[which],
1487 					    bufp);
1488 				}
1489 			}
1490 			kmem_free(bufp, MAXPATHLEN);
1491 #else
1492 			zprintf(zoneid,
1493 			    "NFS %s failed for server %s: error %d (%s)\n",
1494 			    mi->mi_rfsnames[which], svp->sv_hostname,
1495 			    status, clnt_sperrno(status));
1496 			if (curproc->p_sessp->s_vp != NULL) {
1497 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1498 					uprintf(
1499 				"NFS %s failed for server %s: error %d (%s)\n",
1500 					    mi->mi_rfsnames[which],
1501 					    svp->sv_hostname, status,
1502 					    clnt_sperrno(status));
1503 				}
1504 			}
1505 #endif
1506 			/*
1507 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1508 			 * re_errno is set appropriately depending on
1509 			 * the authentication error
1510 			 */
1511 			if (status == RPC_VERSMISMATCH ||
1512 			    status == RPC_PROGVERSMISMATCH)
1513 				rpcerr.re_errno = EIO;
1514 		}
1515 	} else {
1516 		/*
1517 		 * Test the value of mi_down and mi_printed without
1518 		 * holding the mi_lock mutex.  If they are both zero,
1519 		 * then it is okay to skip the down and printed
1520 		 * processing.  This saves on a mutex_enter and
1521 		 * mutex_exit pair for a normal, successful RPC.
1522 		 * This was just complete overhead.
1523 		 */
1524 		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1525 			mutex_enter(&mi->mi_lock);
1526 			mi->mi_flags &= ~MI4_DOWN;
1527 			if (mi->mi_flags & MI4_PRINTED) {
1528 				mi->mi_flags &= ~MI4_PRINTED;
1529 				mutex_exit(&mi->mi_lock);
1530 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1531 					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1532 					    0, FALSE, NULL, 0, NULL);
1533 			} else
1534 				mutex_exit(&mi->mi_lock);
1535 		}
1536 
1537 		if (*doqueue == 0) {
1538 			if (!(mi->mi_flags & MI4_NOPRINT) &&
1539 			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1540 				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1541 				    FALSE, NULL, 0, NULL);
1542 
1543 			*doqueue = 1;
1544 		}
1545 	}
1546 
1547 	clfree4(client, ch, nfscl);
1548 	if (cred_cloned)
1549 		crfree(cr);
1550 
1551 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1552 
1553 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1554 	    rpcerr.re_errno);
1555 
1556 	*rpc_statusp = status;
1557 	return (rpcerr.re_errno);
1558 }
1559 
1560 /*
1561  * rfs4call - general wrapper for RPC calls initiated by the client
1562  */
1563 void
1564 rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
1565 	cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
1566 {
1567 	int i, error;
1568 	enum clnt_stat rpc_status = NFS4_OK;
1569 	int num_resops;
1570 	struct nfs4_clnt *nfscl;
1571 
1572 	ASSERT(nfs_zone() == mi->mi_zone);
1573 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1574 	ASSERT(nfscl != NULL);
1575 
1576 	nfscl->nfscl_stat.calls.value.ui64++;
1577 	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1578 
1579 	/* Set up the results struct for XDR usage */
1580 	resp->argsp = argsp;
1581 	resp->array = NULL;
1582 	resp->status = 0;
1583 	resp->decode_len = 0;
1584 
1585 	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
1586 		xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1587 		xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1588 		doqueue, &rpc_status, flags, nfscl);
1589 
1590 	/* Return now if it was an RPC error */
1591 	if (error) {
1592 		ep->error = error;
1593 		ep->stat = resp->status;
1594 		ep->rpc_status = rpc_status;
1595 		return;
1596 	}
1597 
1598 	/* else we'll count the processed operations */
1599 	num_resops = resp->decode_len;
1600 	for (i = 0; i < num_resops; i++) {
1601 		/*
1602 		 * Count the individual operations
1603 		 * processed by the server.
1604 		 */
1605 		if (resp->array[i].resop >= NFSPROC4_NULL &&
1606 		    resp->array[i].resop <= OP_WRITE)
1607 			mi->mi_reqs[resp->array[i].resop].value.ui64++;
1608 	}
1609 
1610 	ep->error = 0;
1611 	ep->stat = resp->status;
1612 	ep->rpc_status = rpc_status;
1613 }
1614 
1615 /*
1616  * nfs4rename_update - updates stored state after a rename.  Currently this
1617  * is the path of the object and anything under it, and the filehandle of
1618  * the renamed object.
1619  */
1620 void
1621 nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
1622 {
1623 	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
1624 	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
1625 }
1626 
1627 /*
1628  * Routine to look up the filehandle for the given path and rootvp.
1629  *
1630  * Return values:
1631  * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1632  *   updated.
1633  * - error: return value (errno value) and/or *statp is set appropriately.
1634  */
1635 #define	RML_ORDINARY	1
1636 #define	RML_NAMED_ATTR	2
1637 #define	RML_ATTRDIR	3
1638 
1639 static void
1640 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1641 	int filetype, cred_t *cr,
1642 	nfs_fh4 *fhp, nfs4_ga_res_t *garp,	/* fh, attrs for object */
1643 	nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
1644 	nfs4_error_t *ep)
1645 {
1646 	COMPOUND4args_clnt args;
1647 	COMPOUND4res_clnt res;
1648 	nfs_argop4 *argop;
1649 	nfs_resop4 *resop;
1650 	int num_argops;
1651 	lookup4_param_t lookuparg;
1652 	nfs_fh4 *tmpfhp;
1653 	int doqueue = 1;
1654 	char *path;
1655 	mntinfo4_t *mi;
1656 
1657 	ASSERT(fname != NULL);
1658 	ASSERT(rootvp->v_type == VDIR);
1659 
1660 	mi = VTOMI4(rootvp);
1661 	path = fn_path(fname);
1662 	switch (filetype) {
1663 	case RML_NAMED_ATTR:
1664 		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1665 		args.ctag = TAG_REMAP_LOOKUP_NA;
1666 		break;
1667 	case RML_ATTRDIR:
1668 		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1669 		args.ctag = TAG_REMAP_LOOKUP_AD;
1670 		break;
1671 	case RML_ORDINARY:
1672 		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1673 		args.ctag = TAG_REMAP_LOOKUP;
1674 		break;
1675 	default:
1676 		ep->error = EINVAL;
1677 		return;
1678 	}
1679 	lookuparg.argsp = &args;
1680 	lookuparg.resp = &res;
1681 	lookuparg.header_len = 1;	/* Putfh */
1682 	lookuparg.trailer_len = 0;
1683 	lookuparg.ga_bits = NFS4_VATTR_MASK;
1684 	lookuparg.mi = VTOMI4(rootvp);
1685 
1686 	(void) nfs4lookup_setup(path, &lookuparg, 1);
1687 
1688 	/* 0: putfh directory */
1689 	argop = args.array;
1690 	argop[0].argop = OP_CPUTFH;
1691 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1692 
1693 	num_argops = args.array_len;
1694 
1695 	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1696 
1697 	if (ep->error || res.status != NFS4_OK)
1698 		goto exit;
1699 
1700 	/* get the object filehandle */
1701 	resop = &res.array[res.array_len - 2];
1702 	if (resop->resop != OP_GETFH) {
1703 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1704 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1705 		ep->stat = NFS4ERR_SERVERFAULT;
1706 		goto exit;
1707 	}
1708 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1709 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1710 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1711 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1712 		    TAG_NONE, 0, 0);
1713 		ep->stat = NFS4ERR_SERVERFAULT;
1714 		goto exit;
1715 	}
1716 	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1717 	nfs_fh4_copy(tmpfhp, fhp);
1718 
1719 	/* get the object attributes */
1720 	resop = &res.array[res.array_len - 1];
1721 	if (garp && resop->resop == OP_GETATTR)
1722 		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
1723 
1724 	/* See if there are enough fields in the response for parent info */
1725 	if ((int)res.array_len - 5 <= 0)
1726 		goto exit;
1727 
1728 	/* get the parent filehandle */
1729 	resop = &res.array[res.array_len - 5];
1730 	if (resop->resop != OP_GETFH) {
1731 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1732 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1733 		ep->stat = NFS4ERR_SERVERFAULT;
1734 		goto exit;
1735 	}
1736 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1737 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1738 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1739 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1740 		    TAG_NONE, 0, 0);
1741 		ep->stat = NFS4ERR_SERVERFAULT;
1742 		goto exit;
1743 	}
1744 	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1745 	nfs_fh4_copy(tmpfhp, pfhp);
1746 
1747 	/* get the parent attributes */
1748 	resop = &res.array[res.array_len - 4];
1749 	if (pgarp && resop->resop == OP_GETATTR)
1750 		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
1751 
1752 exit:
1753 	/*
1754 	 * It is too hard to remember where all the OP_LOOKUPs are
1755 	 */
1756 	nfs4args_lookup_free(argop, num_argops);
1757 	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1758 
1759 	if (!ep->error)
1760 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1761 	kmem_free(path, strlen(path)+1);
1762 }
1763 
1764 /*
1765  * NFS client failover / volatile filehandle support
1766  *
1767  * Recover the filehandle for the given rnode.
1768  *
1769  * Errors are returned via the nfs4_error_t parameter.
1770  */
1771 
1772 void
1773 nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
1774 {
1775 	rnode4_t *rp = VTOR4(vp);
1776 	vnode_t *rootvp = NULL;
1777 	vnode_t *dvp = NULL;
1778 	cred_t *cr, *cred_otw;
1779 	nfs4_ga_res_t gar, pgar;
1780 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
1781 	int filetype = RML_ORDINARY;
1782 	nfs4_recov_state_t recov = {NULL, 0, 0};
1783 	int badfhcount = 0;
1784 	nfs4_open_stream_t *osp = NULL;
1785 	bool_t first_time = TRUE;	/* first time getting OTW cred */
1786 	bool_t last_time = FALSE;	/* last time getting OTW cred */
1787 
1788 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1789 	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
1790 	ASSERT(nfs4_consistent_type(vp));
1791 
1792 	if (vp->v_flag & VROOT) {
1793 		nfs4_remap_root(mi, ep, flags);
1794 		return;
1795 	}
1796 
1797 	/*
1798 	 * Given the root fh, use the path stored in
1799 	 * the rnode to find the fh for the new server.
1800 	 */
1801 	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1802 	if (ep->error != 0)
1803 		return;
1804 
1805 	cr = curthread->t_cred;
1806 	ASSERT(cr != NULL);
1807 get_remap_cred:
1808 	/*
1809 	 * Releases the osp, if it is provided.
1810 	 * Puts a hold on the cred_otw and the new osp (if found).
1811 	 */
1812 	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
1813 		&first_time, &last_time);
1814 	ASSERT(cred_otw != NULL);
1815 
1816 	if (rp->r_flags & R4ISXATTR) {
1817 		filetype = RML_NAMED_ATTR;
1818 		(void) vtodv(vp, &dvp, cred_otw, FALSE);
1819 	}
1820 
1821 	if (vp->v_flag & V_XATTRDIR) {
1822 		filetype = RML_ATTRDIR;
1823 	}
1824 
1825 	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
1826 		/* file mount, doesn't need a remap */
1827 		goto done;
1828 	}
1829 
1830 again:
1831 	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
1832 			&newfh, &gar, &newpfh, &pgar, ep);
1833 
1834 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1835 	    "nfs4_remap_file: remap_lookup returned %d/%d",
1836 	    ep->error, ep->stat));
1837 
1838 	if (last_time == FALSE && ep->error == EACCES) {
1839 		crfree(cred_otw);
1840 		if (dvp != NULL)
1841 			VN_RELE(dvp);
1842 		goto get_remap_cred;
1843 	}
1844 	if (ep->error != 0)
1845 		goto done;
1846 
1847 	switch (ep->stat) {
1848 	case NFS4_OK:
1849 		badfhcount = 0;
1850 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1851 			mutex_enter(&rp->r_statelock);
1852 			rp->r_delay_interval = 0;
1853 			mutex_exit(&rp->r_statelock);
1854 			uprintf("NFS File Available..\n");
1855 		}
1856 		break;
1857 	case NFS4ERR_FHEXPIRED:
1858 	case NFS4ERR_BADHANDLE:
1859 		/*
1860 		 * If we ran into filehandle problems, we should try to
1861 		 * remap the root vnode first and hope life gets better.
1862 		 * But we need to avoid loops.
1863 		 */
1864 		if (badfhcount++ > 0)
1865 			goto done;
1866 		if (newfh.nfs_fh4_len != 0) {
1867 			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1868 			newfh.nfs_fh4_len = 0;
1869 		}
1870 		if (newpfh.nfs_fh4_len != 0) {
1871 			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1872 			newpfh.nfs_fh4_len = 0;
1873 		}
1874 		/* relative path - remap rootvp then retry */
1875 		VN_RELE(rootvp);
1876 		rootvp = NULL;
1877 		nfs4_remap_root(mi, ep, flags);
1878 		if (ep->error != 0 || ep->stat != NFS4_OK)
1879 			goto done;
1880 		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1881 		if (ep->error != 0)
1882 			goto done;
1883 		goto again;
1884 	case NFS4ERR_DELAY:
1885 		badfhcount = 0;
1886 		nfs4_set_delay_wait(vp);
1887 		ep->error = nfs4_wait_for_delay(vp, &recov);
1888 		if (ep->error != 0)
1889 			goto done;
1890 		goto again;
1891 	case NFS4ERR_ACCESS:
1892 		/* get new cred, try again */
1893 		if (last_time == TRUE)
1894 			goto done;
1895 		if (dvp != NULL)
1896 			VN_RELE(dvp);
1897 		crfree(cred_otw);
1898 		goto get_remap_cred;
1899 	default:
1900 		goto done;
1901 	}
1902 
1903 	/*
1904 	 * Check on the new and old rnodes before updating;
1905 	 * if the vnode type or size changes, issue a warning
1906 	 * and mark the file dead.
1907 	 */
1908 	mutex_enter(&rp->r_statelock);
1909 	if (flags & NFS4_REMAP_CKATTRS) {
1910 		if (vp->v_type != gar.n4g_va.va_type ||
1911 			(vp->v_type != VDIR &&
1912 			rp->r_size != gar.n4g_va.va_size)) {
1913 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1914 			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
1915 			    (int)rp->r_size, (int)gar.n4g_va.va_size,
1916 			    vp->v_type, gar.n4g_va.va_type));
1917 			mutex_exit(&rp->r_statelock);
1918 			nfs4_queue_event(RE_FILE_DIFF, mi,
1919 			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
1920 			    TAG_NONE, TAG_NONE, 0, 0);
1921 			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
1922 			goto done;
1923 		}
1924 	}
1925 	ASSERT(gar.n4g_va.va_type != VNON);
1926 	rp->r_server = mi->mi_curr_serv;
1927 
1928 	if (gar.n4g_fsid_valid) {
1929 		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
1930 		rp->r_srv_fsid = gar.n4g_fsid;
1931 		if (FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
1932 			rp->r_flags &= ~R4SRVSTUB;
1933 		else
1934 			rp->r_flags |= R4SRVSTUB;
1935 		nfs_rw_exit(&rp->r_server->sv_lock);
1936 #ifdef DEBUG
1937 	} else {
1938 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1939 			"remap_file: fsid attr not provided by server.  rp=%p",
1940 			(void *)rp));
1941 #endif
1942 	}
1943 	mutex_exit(&rp->r_statelock);
1944 	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
1945 	sfh4_update(rp->r_fh, &newfh);
1946 	ASSERT(nfs4_consistent_type(vp));
1947 
1948 	/*
1949 	 * If we got parent info, use it to update the parent
1950 	 */
1951 	if (newpfh.nfs_fh4_len != 0) {
1952 		if (rp->r_svnode.sv_dfh != NULL)
1953 			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
1954 		if (dvp != NULL) {
1955 			/* force update of attrs */
1956 			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
1957 		}
1958 	}
1959 done:
1960 	if (newfh.nfs_fh4_len != 0)
1961 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1962 	if (newpfh.nfs_fh4_len != 0)
1963 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1964 	if (cred_otw != NULL)
1965 		crfree(cred_otw);
1966 	if (rootvp != NULL)
1967 		VN_RELE(rootvp);
1968 	if (dvp != NULL)
1969 		VN_RELE(dvp);
1970 	if (osp != NULL)
1971 		open_stream_rele(osp, rp);
1972 }
1973 
1974 /*
1975  * Client-side failover support: remap the filehandle for vp if it appears
1976  * necessary.  errors are returned via the nfs4_error_t parameter; though,
1977  * if there is a problem, we will just try again later.
1978  */
1979 
1980 void
1981 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
1982 {
1983 	if (vp == NULL)
1984 		return;
1985 
1986 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
1987 		return;
1988 
1989 	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
1990 		return;
1991 
1992 	nfs4_remap_file(mi, vp, flags, ep);
1993 }
1994 
1995 /*
1996  * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
1997  *
1998  * Our caller has a filehandle for ".." relative to a particular
1999  * directory object.  We want to find or create a parent vnode
2000  * with that filehandle and return it.  We can of course create
2001  * a vnode from this filehandle, but we need to also make sure
2002  * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2003  * that we have a parent FH for future reopens as well.  If
2004  * we have a remap failure, we won't be able to reopen this
2005  * file, but we won't treat that as fatal because a reopen
2006  * is at least unlikely.  Someday nfs4_reopen() should look
2007  * for a missing parent FH and try a remap to recover from it.
2008  *
2009  * need_start_op argument indicates whether this function should
2010  * do a start_op before calling remap_lookup().  This should
2011  * be FALSE, if you are the recovery thread or in an op; otherwise,
2012  * set it to TRUE.
2013  */
2014 int
2015 nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
2016     cred_t *cr, vnode_t **vpp, int need_start_op)
2017 {
2018 	mntinfo4_t *mi = VTOMI4(dvp);
2019 	nfs4_fname_t *np = NULL, *pnp = NULL;
2020 	vnode_t *vp = NULL, *rootvp = NULL;
2021 	rnode4_t *rp;
2022 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2023 	nfs4_ga_res_t gar, pgar;
2024 	vattr_t va, pva;
2025 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2026 	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
2027 	nfs4_recov_state_t recov_state;
2028 
2029 #ifdef DEBUG
2030 	/*
2031 	 * ensure need_start_op is correct
2032 	 */
2033 	{
2034 		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
2035 		    (curthread == mi->mi_recovthread));
2036 		/* C needs a ^^ operator! */
2037 		ASSERT(((need_start_op) && (!no_need_start_op)) ||
2038 		    ((! need_start_op) && (no_need_start_op)));
2039 	}
2040 #endif
2041 	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
2042 
2043 	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
2044 	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
2045 	    rnode4info(VTOR4(dvp))));
2046 
2047 	/*
2048 	 * rootvp might be needed eventually. Holding it now will
2049 	 * ensure that r4find_unlocked() will find it, if ".." is the root.
2050 	 */
2051 	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2052 	if (e.error != 0)
2053 		goto out;
2054 	rp = r4find_unlocked(fhp, mi->mi_vfsp);
2055 	if (rp != NULL) {
2056 		*vpp = RTOV4(rp);
2057 		VN_RELE(rootvp);
2058 		return (0);
2059 	}
2060 
2061 	/*
2062 	 * Since we don't have the rnode, we have to go over the wire.
2063 	 * remap_lookup() can get all of the filehandles and attributes
2064 	 * we need in one operation.
2065 	 */
2066 	np = fn_parent(VTOSV(dvp)->sv_name);
2067 	ASSERT(np != NULL);
2068 
2069 	recov_state.rs_flags = 0;
2070 	recov_state.rs_num_retry_despite_err = 0;
2071 recov_retry:
2072 	if (need_start_op) {
2073 		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
2074 		    &recov_state, NULL);
2075 		if (e.error != 0) {
2076 			goto out;
2077 		}
2078 	}
2079 	va.va_type = VNON;
2080 	pva.va_type = VNON;
2081 	remap_lookup(np, rootvp, RML_ORDINARY, cr,
2082 	    &newfh, &gar, &newpfh, &pgar, &e);
2083 	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2084 		if (need_start_op) {
2085 			bool_t abort;
2086 
2087 			abort = nfs4_start_recovery(&e, mi,
2088 			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL);
2089 			if (abort) {
2090 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2091 				    &recov_state, FALSE);
2092 				if (e.error == 0)
2093 					e.error = EIO;
2094 				goto out;
2095 			}
2096 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2097 			    &recov_state, TRUE);
2098 			goto recov_retry;
2099 		}
2100 		if (e.error == 0)
2101 			e.error = EIO;
2102 		goto out;
2103 	}
2104 
2105 	if (!e.error) {
2106 		va = gar.n4g_va;
2107 		pva = pgar.n4g_va;
2108 	}
2109 
2110 	if ((e.error != 0) ||
2111 	    (va.va_type != VDIR)) {
2112 		if (need_start_op)
2113 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2114 			    &recov_state, FALSE);
2115 		if (e.error == 0)
2116 			e.error = EIO;
2117 		goto out;
2118 	}
2119 
2120 	if (e.stat != NFS4_OK) {
2121 		if (need_start_op)
2122 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2123 			    &recov_state, FALSE);
2124 		e.error = EIO;
2125 		goto out;
2126 	}
2127 
2128 	/*
2129 	 * It is possible for remap_lookup() to return with no error,
2130 	 * but without providing the parent filehandle and attrs.
2131 	 */
2132 	if (pva.va_type != VDIR) {
2133 		/*
2134 		 * Call remap_lookup() again, this time with the
2135 		 * newpfh and pgar args in the first position.
2136 		 */
2137 		pnp = fn_parent(np);
2138 		if (pnp != NULL) {
2139 			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
2140 			    &newpfh, &pgar, NULL, NULL, &e);
2141 			if (nfs4_needs_recovery(&e, FALSE,
2142 			    mi->mi_vfsp)) {
2143 				if (need_start_op) {
2144 					bool_t abort;
2145 
2146 					abort = nfs4_start_recovery(&e, mi,
2147 					    rootvp, NULL, NULL, NULL,
2148 					    OP_LOOKUP, NULL);
2149 					if (abort) {
2150 						nfs4_end_fop(mi, rootvp, NULL,
2151 						    OH_LOOKUP, &recov_state,
2152 						    FALSE);
2153 						if (e.error == 0)
2154 							e.error = EIO;
2155 						goto out;
2156 					}
2157 					nfs4_end_fop(mi, rootvp, NULL,
2158 					    OH_LOOKUP, &recov_state, TRUE);
2159 					goto recov_retry;
2160 				}
2161 				if (e.error == 0)
2162 					e.error = EIO;
2163 				goto out;
2164 			}
2165 
2166 			if (e.stat != NFS4_OK) {
2167 				if (need_start_op)
2168 					nfs4_end_fop(mi, rootvp, NULL,
2169 					    OH_LOOKUP, &recov_state, FALSE);
2170 				e.error = EIO;
2171 				goto out;
2172 			}
2173 		}
2174 		if ((pnp == NULL) ||
2175 		    (e.error != 0) ||
2176 		    (pva.va_type == VNON)) {
2177 			if (need_start_op)
2178 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2179 				    &recov_state, FALSE);
2180 			if (e.error == 0)
2181 				e.error = EIO;
2182 			goto out;
2183 		}
2184 	}
2185 	ASSERT(newpfh.nfs_fh4_len != 0);
2186 	if (need_start_op)
2187 		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
2188 	psfh = sfh4_get(&newpfh, mi);
2189 
2190 	sfh = sfh4_get(&newfh, mi);
2191 	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
2192 
2193 out:
2194 	if (np != NULL)
2195 		fn_rele(&np);
2196 	if (pnp != NULL)
2197 		fn_rele(&pnp);
2198 	if (newfh.nfs_fh4_len != 0)
2199 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2200 	if (newpfh.nfs_fh4_len != 0)
2201 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2202 	if (sfh != NULL)
2203 		sfh4_rele(&sfh);
2204 	if (psfh != NULL)
2205 		sfh4_rele(&psfh);
2206 	if (rootvp != NULL)
2207 		VN_RELE(rootvp);
2208 	*vpp = vp;
2209 	return (e.error);
2210 }
2211 
#ifdef DEBUG
/*
 * DEBUG-only global counter, initially zero.
 * NOTE(review): nothing in this part of the file reads or updates it;
 * presumably it accounts for memory used by rnode path strings -- confirm
 * against its users.
 */
size_t r_path_memuse = 0;
#endif
2215 
2216 /*
2217  * NFS client failover support
2218  *
2219  * sv4_free() frees the malloc'd portion of a "servinfo_t".
2220  */
2221 void
2222 sv4_free(servinfo4_t *svp)
2223 {
2224 	servinfo4_t *next;
2225 	struct knetconfig *knconf;
2226 
2227 	while (svp != NULL) {
2228 		next = svp->sv_next;
2229 		if (svp->sv_dhsec)
2230 			sec_clnt_freeinfo(svp->sv_dhsec);
2231 		if (svp->sv_secdata)
2232 			sec_clnt_freeinfo(svp->sv_secdata);
2233 		if (svp->sv_save_secinfo &&
2234 				svp->sv_save_secinfo != svp->sv_secinfo)
2235 			secinfo_free(svp->sv_save_secinfo);
2236 		if (svp->sv_secinfo)
2237 			secinfo_free(svp->sv_secinfo);
2238 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2239 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2240 		knconf = svp->sv_knconf;
2241 		if (knconf != NULL) {
2242 			if (knconf->knc_protofmly != NULL)
2243 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2244 			if (knconf->knc_proto != NULL)
2245 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2246 			kmem_free(knconf, sizeof (*knconf));
2247 		}
2248 		knconf = svp->sv_origknconf;
2249 		if (knconf != NULL) {
2250 			if (knconf->knc_protofmly != NULL)
2251 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2252 			if (knconf->knc_proto != NULL)
2253 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2254 			kmem_free(knconf, sizeof (*knconf));
2255 		}
2256 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2257 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2258 		if (svp->sv_path != NULL) {
2259 			kmem_free(svp->sv_path, svp->sv_pathlen);
2260 		}
2261 		nfs_rw_destroy(&svp->sv_lock);
2262 		kmem_free(svp, sizeof (*svp));
2263 		svp = next;
2264 	}
2265 }
2266 
2267 void
2268 nfs4_printfhandle(nfs4_fhandle_t *fhp)
2269 {
2270 	int *ip;
2271 	char *buf;
2272 	size_t bufsize;
2273 	char *cp;
2274 
2275 	/*
2276 	 * 13 == "(file handle:"
2277 	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2278 	 *	1 == ' '
2279 	 *	8 == maximum strlen of "%x"
2280 	 * 3 == ")\n\0"
2281 	 */
2282 	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2283 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
2284 	if (buf == NULL)
2285 		return;
2286 
2287 	cp = buf;
2288 	(void) strcpy(cp, "(file handle:");
2289 	while (*cp != '\0')
2290 		cp++;
2291 	for (ip = (int *)fhp->fh_buf;
2292 	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2293 	    ip++) {
2294 		(void) sprintf(cp, " %x", *ip);
2295 		while (*cp != '\0')
2296 			cp++;
2297 	}
2298 	(void) strcpy(cp, ")\n");
2299 
2300 	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2301 
2302 	kmem_free(buf, bufsize);
2303 }
2304 
/*
 * The NFSv4 readdir cache subsystem.
 *
 * We provide a set of interfaces to allow the rest of the system to utilize
 * a caching mechanism while encapsulating the details of the actual
 * implementation.  This should allow for better maintainability and
 * extensibility by consolidating the implementation details in one location.
 */
2313 
2314 /*
2315  * Comparator used by AVL routines.
2316  */
2317 static int
2318 rddir4_cache_compar(const void *x, const void *y)
2319 {
2320 	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2321 	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2322 	rddir4_cache *a = &ai->rc;
2323 	rddir4_cache *b = &bi->rc;
2324 
2325 	if (a->nfs4_cookie == b->nfs4_cookie) {
2326 		if (a->buflen == b->buflen)
2327 			return (0);
2328 		if (a->buflen < b->buflen)
2329 			return (-1);
2330 		return (1);
2331 	}
2332 
2333 	if (a->nfs4_cookie < b->nfs4_cookie)
2334 			return (-1);
2335 
2336 	return (1);
2337 }
2338 
2339 /*
2340  * Allocate an opaque handle for the readdir cache.
2341  */
2342 void
2343 rddir4_cache_create(rnode4_t *rp)
2344 {
2345 	ASSERT(rp->r_dir == NULL);
2346 
2347 	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2348 
2349 	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2350 			offsetof(rddir4_cache_impl, tree));
2351 }
2352 
2353 /*
2354  *  Purge the cache of all cached readdir responses.
2355  */
2356 void
2357 rddir4_cache_purge(rnode4_t *rp)
2358 {
2359 	rddir4_cache_impl	*rdip;
2360 	rddir4_cache_impl	*nrdip;
2361 
2362 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2363 
2364 	if (rp->r_dir == NULL)
2365 		return;
2366 
2367 	rdip = avl_first(rp->r_dir);
2368 
2369 	while (rdip != NULL) {
2370 		nrdip = AVL_NEXT(rp->r_dir, rdip);
2371 		avl_remove(rp->r_dir, rdip);
2372 		rdip->rc.flags &= ~RDDIRCACHED;
2373 		rddir4_cache_rele(rp, &rdip->rc);
2374 		rdip = nrdip;
2375 	}
2376 	ASSERT(avl_numnodes(rp->r_dir) == 0);
2377 }
2378 
2379 /*
2380  * Destroy the readdir cache.
2381  */
2382 void
2383 rddir4_cache_destroy(rnode4_t *rp)
2384 {
2385 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2386 	if (rp->r_dir == NULL)
2387 		return;
2388 
2389 	rddir4_cache_purge(rp);
2390 	avl_destroy(rp->r_dir);
2391 	kmem_free(rp->r_dir, sizeof (avl_tree_t));
2392 	rp->r_dir = NULL;
2393 }
2394 
2395 /*
2396  * Locate a readdir response from the readdir cache.
2397  *
2398  * Return values:
2399  *
2400  * NULL - If there is an unrecoverable situation like the operation may have
2401  *	  been interrupted.
2402  *
2403  * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2404  *		    The flags are set approprately, such that the caller knows
2405  *		    what state the entry is in.
2406  */
2407 rddir4_cache *
2408 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2409 {
2410 	rddir4_cache_impl	*rdip = NULL;
2411 	rddir4_cache_impl	srdip;
2412 	rddir4_cache		*srdc;
2413 	rddir4_cache		*rdc = NULL;
2414 	rddir4_cache		*nrdc = NULL;
2415 	avl_index_t		where;
2416 
2417 top:
2418 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2419 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2420 	/*
2421 	 * Check to see if the readdir cache has been disabled.  If so, then
2422 	 * simply allocate an rddir4_cache entry and return it, since caching
2423 	 * operations do not apply.
2424 	 */
2425 	if (rp->r_dir == NULL) {
2426 		if (nrdc == NULL) {
2427 			/*
2428 			 * Drop the lock because we are doing a sleeping
2429 			 * allocation.
2430 			 */
2431 			mutex_exit(&rp->r_statelock);
2432 			rdc = rddir4_cache_alloc(KM_SLEEP);
2433 			rdc->nfs4_cookie = cookie;
2434 			rdc->buflen = count;
2435 			mutex_enter(&rp->r_statelock);
2436 			return (rdc);
2437 		}
2438 		return (nrdc);
2439 	}
2440 
2441 	srdc = &srdip.rc;
2442 	srdc->nfs4_cookie = cookie;
2443 	srdc->buflen = count;
2444 
2445 	rdip = avl_find(rp->r_dir, &srdip, &where);
2446 
2447 	/*
2448 	 * If we didn't find an entry then create one and insert it
2449 	 * into the cache.
2450 	 */
2451 	if (rdip == NULL) {
2452 		/*
2453 		 * Check for the case where we have made a second pass through
2454 		 * the cache due to a lockless allocation.  If we find that no
2455 		 * thread has already inserted this entry, do the insert now
2456 		 * and return.
2457 		 */
2458 		if (nrdc != NULL) {
2459 			avl_insert(rp->r_dir, nrdc->data, where);
2460 			nrdc->flags |= RDDIRCACHED;
2461 			rddir4_cache_hold(nrdc);
2462 			return (nrdc);
2463 		}
2464 
2465 #ifdef DEBUG
2466 		nfs4_readdir_cache_misses++;
2467 #endif
2468 		/*
2469 		 * First, try to allocate an entry without sleeping.  If that
2470 		 * fails then drop the lock and do a sleeping allocation.
2471 		 */
2472 		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2473 		if (nrdc != NULL) {
2474 			nrdc->nfs4_cookie = cookie;
2475 			nrdc->buflen = count;
2476 			avl_insert(rp->r_dir, nrdc->data, where);
2477 			nrdc->flags |= RDDIRCACHED;
2478 			rddir4_cache_hold(nrdc);
2479 			return (nrdc);
2480 		}
2481 
2482 		/*
2483 		 * Drop the lock and do a sleeping allocation.	We incur
2484 		 * additional overhead by having to search the cache again,
2485 		 * but this case should be rare.
2486 		 */
2487 		mutex_exit(&rp->r_statelock);
2488 		nrdc = rddir4_cache_alloc(KM_SLEEP);
2489 		nrdc->nfs4_cookie = cookie;
2490 		nrdc->buflen = count;
2491 		mutex_enter(&rp->r_statelock);
2492 		/*
2493 		 * We need to take another pass through the cache
2494 		 * since we dropped our lock to perform the alloc.
2495 		 * Another thread may have come by and inserted the
2496 		 * entry we are interested in.
2497 		 */
2498 		goto top;
2499 	}
2500 
2501 	/*
2502 	 * Check to see if we need to free our entry.  This can happen if
2503 	 * another thread came along beat us to the insert.  We can
2504 	 * safely call rddir4_cache_free directly because no other thread
2505 	 * would have a reference to this entry.
2506 	 */
2507 	if (nrdc != NULL)
2508 		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2509 
2510 #ifdef DEBUG
2511 	nfs4_readdir_cache_hits++;
2512 #endif
2513 	/*
2514 	 * Found something.  Make sure it's ready to return.
2515 	 */
2516 	rdc = &rdip->rc;
2517 	rddir4_cache_hold(rdc);
2518 	/*
2519 	 * If the cache entry is in the process of being filled in, wait
2520 	 * until this completes.  The RDDIRWAIT bit is set to indicate that
2521 	 * someone is waiting and when the thread currently filling the entry
2522 	 * is done, it should do a cv_broadcast to wakeup all of the threads
2523 	 * waiting for it to finish. If the thread wakes up to find that
2524 	 * someone new is now trying to complete the the entry, go back
2525 	 * to sleep.
2526 	 */
2527 	while (rdc->flags & RDDIR) {
2528 		/*
2529 		 * The entry is not complete.
2530 		 */
2531 		nfs_rw_exit(&rp->r_rwlock);
2532 		rdc->flags |= RDDIRWAIT;
2533 #ifdef DEBUG
2534 		nfs4_readdir_cache_waits++;
2535 #endif
2536 		while (rdc->flags & RDDIRWAIT) {
2537 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2538 				/*
2539 				 * We got interrupted, probably the user
2540 				 * typed ^C or an alarm fired.  We free the
2541 				 * new entry if we allocated one.
2542 				 */
2543 				rddir4_cache_rele(rp, rdc);
2544 				mutex_exit(&rp->r_statelock);
2545 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2546 					RW_READER, FALSE);
2547 				mutex_enter(&rp->r_statelock);
2548 				return (NULL);
2549 			}
2550 		}
2551 		mutex_exit(&rp->r_statelock);
2552 		(void) nfs_rw_enter_sig(&rp->r_rwlock,
2553 			RW_READER, FALSE);
2554 		mutex_enter(&rp->r_statelock);
2555 	}
2556 
2557 	/*
2558 	 * The entry we were waiting on may have been purged from
2559 	 * the cache and should no longer be used, release it and
2560 	 * start over.
2561 	 */
2562 	if (!(rdc->flags & RDDIRCACHED)) {
2563 		rddir4_cache_rele(rp, rdc);
2564 		goto top;
2565 	}
2566 
2567 	/*
2568 	 * The entry is completed.  Return it.
2569 	 */
2570 	return (rdc);
2571 }
2572 
2573 /*
2574  * Allocate a cache element and return it.  Can return NULL if memory is
2575  * low.
2576  */
2577 static rddir4_cache *
2578 rddir4_cache_alloc(int flags)
2579 {
2580 	rddir4_cache_impl	*rdip = NULL;
2581 	rddir4_cache		*rc = NULL;
2582 
2583 	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2584 
2585 	if (rdip != NULL) {
2586 		rc = &rdip->rc;
2587 		rc->data = (void *)rdip;
2588 		rc->nfs4_cookie = 0;
2589 		rc->nfs4_ncookie = 0;
2590 		rc->entries = NULL;
2591 		rc->eof = 0;
2592 		rc->entlen = 0;
2593 		rc->buflen = 0;
2594 		rc->actlen = 0;
2595 		/*
2596 		 * A readdir is required so set the flag.
2597 		 */
2598 		rc->flags = RDDIRREQ;
2599 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2600 		rc->error = 0;
2601 		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2602 		rdip->count = 1;
2603 #ifdef DEBUG
2604 		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
2605 #endif
2606 	}
2607 	return (rc);
2608 }
2609 
2610 /*
2611  * Increment the reference count to this cache element.
2612  */
2613 static void
2614 rddir4_cache_hold(rddir4_cache *rc)
2615 {
2616 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2617 
2618 	mutex_enter(&rdip->lock);
2619 	rdip->count++;
2620 	mutex_exit(&rdip->lock);
2621 }
2622 
2623 /*
2624  * Release a reference to this cache element.  If the count is zero then
2625  * free the element.
2626  */
2627 void
2628 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2629 {
2630 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2631 
2632 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2633 
2634 	/*
2635 	 * Check to see if we have any waiters.  If so, we can wake them
2636 	 * so that they can proceed.
2637 	 */
2638 	if (rdc->flags & RDDIRWAIT) {
2639 		rdc->flags &= ~RDDIRWAIT;
2640 		cv_broadcast(&rdc->cv);
2641 	}
2642 
2643 	mutex_enter(&rdip->lock);
2644 	ASSERT(rdip->count > 0);
2645 	if (--rdip->count == 0) {
2646 		mutex_exit(&rdip->lock);
2647 		rddir4_cache_free(rdip);
2648 	} else
2649 		mutex_exit(&rdip->lock);
2650 }
2651 
2652 /*
2653  * Free a cache element.
2654  */
2655 static void
2656 rddir4_cache_free(rddir4_cache_impl *rdip)
2657 {
2658 	rddir4_cache *rc = &rdip->rc;
2659 
2660 #ifdef DEBUG
2661 	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
2662 #endif
2663 	if (rc->entries != NULL)
2664 		kmem_free(rc->entries, rc->buflen);
2665 	cv_destroy(&rc->cv);
2666 	mutex_destroy(&rdip->lock);
2667 	kmem_free(rdip, sizeof (*rdip));
2668 }
2669 
2670 /*
2671  * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2672  * framework.
2673  */
2674 static int
2675 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
2676 {
2677 	ksp->ks_snaptime = gethrtime();
2678 	if (rw == KSTAT_WRITE) {
2679 		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
2680 #ifdef DEBUG
2681 		/*
2682 		 * Currently only the global zone can write to kstats, but we
2683 		 * add the check just for paranoia.
2684 		 */
2685 		if (INGLOBALZONE(curproc))
2686 		    bcopy((char *)buf + sizeof (clstat4_tmpl), &clstat4_debug,
2687 			    sizeof (clstat4_debug));
2688 #endif
2689 	} else {
2690 		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
2691 #ifdef DEBUG
2692 		/*
2693 		 * If we're displaying the "global" debug kstat values, we
2694 		 * display them as-is to all zones since in fact they apply to
2695 		 * the system as a whole.
2696 		 */
2697 		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
2698 		    sizeof (clstat4_debug));
2699 #endif
2700 	}
2701 	return (0);
2702 }
2703 
2704 
2705 
2706 /*
2707  * Zone support
2708  */
2709 static void *
2710 clinit4_zone(zoneid_t zoneid)
2711 {
2712 	kstat_t *nfs4_client_kstat;
2713 	struct nfs4_clnt *nfscl;
2714 	uint_t ndata;
2715 
2716 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
2717 	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
2718 	nfscl->nfscl_chtable4 = NULL;
2719 	nfscl->nfscl_zoneid = zoneid;
2720 
2721 	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
2722 	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
2723 #ifdef DEBUG
2724 	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
2725 #endif
2726 	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
2727 	    "misc", KSTAT_TYPE_NAMED, ndata,
2728 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
2729 		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
2730 		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
2731 		kstat_install(nfs4_client_kstat);
2732 	}
2733 	mutex_enter(&nfs4_clnt_list_lock);
2734 	list_insert_head(&nfs4_clnt_list, nfscl);
2735 	mutex_exit(&nfs4_clnt_list_lock);
2736 	return (nfscl);
2737 }
2738 
2739 /*ARGSUSED*/
2740 static void
2741 clfini4_zone(zoneid_t zoneid, void *arg)
2742 {
2743 	struct nfs4_clnt *nfscl = arg;
2744 	chhead_t *chp, *next;
2745 
2746 	if (nfscl == NULL)
2747 		return;
2748 	mutex_enter(&nfs4_clnt_list_lock);
2749 	list_remove(&nfs4_clnt_list, nfscl);
2750 	mutex_exit(&nfs4_clnt_list_lock);
2751 	clreclaim4_zone(nfscl, 0);
2752 	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
2753 		ASSERT(chp->ch_list == NULL);
2754 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
2755 		next = chp->ch_next;
2756 		kmem_free(chp, sizeof (*chp));
2757 	}
2758 	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
2759 	mutex_destroy(&nfscl->nfscl_chtable4_lock);
2760 	kmem_free(nfscl, sizeof (*nfscl));
2761 }
2762 
2763 /*
2764  * Called by endpnt_destructor to make sure the client handles are
2765  * cleaned up before the RPC endpoints.  This becomes a no-op if
2766  * clfini_zone (above) is called first.  This function is needed
2767  * (rather than relying on clfini_zone to clean up) because the ZSD
2768  * callbacks have no ordering mechanism, so we have no way to ensure
2769  * that clfini_zone is called before endpnt_destructor.
2770  */
2771 void
2772 clcleanup4_zone(zoneid_t zoneid)
2773 {
2774 	struct nfs4_clnt *nfscl;
2775 
2776 	mutex_enter(&nfs4_clnt_list_lock);
2777 	nfscl = list_head(&nfs4_clnt_list);
2778 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
2779 		if (nfscl->nfscl_zoneid == zoneid) {
2780 			clreclaim4_zone(nfscl, 0);
2781 			break;
2782 		}
2783 	}
2784 	mutex_exit(&nfs4_clnt_list_lock);
2785 }
2786 
2787 int
2788 nfs4_subr_init(void)
2789 {
2790 	/*
2791 	 * Allocate and initialize the client handle cache
2792 	 */
2793 	chtab4_cache = kmem_cache_create("client_handle4_cache",
2794 		sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
2795 		NULL, 0);
2796 
2797 	/*
2798 	 * Initialize the list of per-zone client handles (and associated data).
2799 	 * This needs to be done before we call zone_key_create().
2800 	 */
2801 	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
2802 	    offsetof(struct nfs4_clnt, nfscl_node));
2803 
2804 	/*
2805 	 * Initialize the zone_key for per-zone client handle lists.
2806 	 */
2807 	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
2808 
2809 	if (nfs4err_delay_time == 0)
2810 		nfs4err_delay_time = NFS4ERR_DELAY_TIME;
2811 
2812 	return (0);
2813 }
2814 
2815 int
2816 nfs4_subr_fini(void)
2817 {
2818 	/*
2819 	 * Deallocate the client handle cache
2820 	 */
2821 	kmem_cache_destroy(chtab4_cache);
2822 
2823 	/*
2824 	 * Destroy the zone_key
2825 	 */
2826 	(void) zone_key_delete(nfs4clnt_zone_key);
2827 
2828 	return (0);
2829 }
2830 /*
2831  * Set or Clear direct I/O flag
2832  * VOP_RWLOCK() is held for write access to prevent a race condition
2833  * which would occur if a process is in the middle of a write when
2834  * directio flag gets set. It is possible that all pages may not get flushed.
2835  *
2836  * This is a copy of nfs_directio, changes here may need to be made
2837  * there and vice versa.
2838  */
2839 
2840 int
2841 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
2842 {
2843 	int	error = 0;
2844 	rnode4_t *rp;
2845 
2846 	rp = VTOR4(vp);
2847 
2848 	if (cmd == DIRECTIO_ON) {
2849 
2850 		if (rp->r_flags & R4DIRECTIO)
2851 			return (0);
2852 
2853 		/*
2854 		 * Flush the page cache.
2855 		 */
2856 
2857 		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
2858 
2859 		if (rp->r_flags & R4DIRECTIO) {
2860 			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2861 			return (0);
2862 		}
2863 
2864 		if (nfs4_has_pages(vp) &&
2865 		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
2866 			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
2867 			    B_INVAL, cr);
2868 			if (error) {
2869 				if (error == ENOSPC || error == EDQUOT) {
2870 					mutex_enter(&rp->r_statelock);
2871 					if (!rp->r_error)
2872 						rp->r_error = error;
2873 					mutex_exit(&rp->r_statelock);
2874 				}
2875 				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2876 				return (error);
2877 			}
2878 		}
2879 
2880 		mutex_enter(&rp->r_statelock);
2881 		rp->r_flags |= R4DIRECTIO;
2882 		mutex_exit(&rp->r_statelock);
2883 		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2884 		return (0);
2885 	}
2886 
2887 	if (cmd == DIRECTIO_OFF) {
2888 		mutex_enter(&rp->r_statelock);
2889 		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
2890 		mutex_exit(&rp->r_statelock);
2891 		return (0);
2892 	}
2893 
2894 	return (EINVAL);
2895 }
2896 
2897 /*
2898  * Return TRUE if the file has any pages.  Always go back to
2899  * the master vnode to check v_pages since none of the shadows
2900  * can have pages.
2901  */
2902 
2903 bool_t
2904 nfs4_has_pages(vnode_t *vp)
2905 {
2906 	rnode4_t *rp;
2907 
2908 	rp = VTOR4(vp);
2909 	if (IS_SHADOW(vp, rp))
2910 		vp = RTOV4(rp);	/* RTOV4 always gives the master */
2911 
2912 	return (vn_has_cached_data(vp));
2913 }
2914 
/*
 * Failover decision table, indexed directly by the clnt_stat value that
 * CLNT_CALL returned.  A non-zero "error" in the selected row is the
 * error to return AND the signal that failover should be attempted; a
 * zero "error" means no failover.
 *
 * Special note: each row also records the RPC_ value it was written
 * for.  If the RPC_ values ever change, direct indexing is no longer
 * valid; the recorded value lets the lookup code detect the mismatch and
 * issue a warning.  In that case the code always attempts failover as a
 * defensive measure.
 */

static struct try_failover_tab {
	enum clnt_stat	cstat;
	int		error;
} try_failover_table[] = {
	{ RPC_SUCCESS,			0 },
	{ RPC_CANTENCODEARGS,		0 },
	{ RPC_CANTDECODERES,		0 },
	{ RPC_CANTSEND,			ECOMM },
	{ RPC_CANTRECV,			ECOMM },
	{ RPC_TIMEDOUT,			ETIMEDOUT },
	{ RPC_VERSMISMATCH,		0 },
	{ RPC_AUTHERROR,		0 },
	{ RPC_PROGUNAVAIL,		0 },
	{ RPC_PROGVERSMISMATCH,		0 },
	{ RPC_PROCUNAVAIL,		0 },
	{ RPC_CANTDECODEARGS,		0 },
	{ RPC_SYSTEMERROR,		ENOSR },
	{ RPC_UNKNOWNHOST,		EHOSTUNREACH },
	{ RPC_RPCBFAILURE,		ENETUNREACH },
	{ RPC_PROGNOTREGISTERED,	ECONNREFUSED },
	{ RPC_FAILED,			ETIMEDOUT },
	{ RPC_UNKNOWNPROTO,		EHOSTUNREACH },
	{ RPC_INTR,			0 },
	{ RPC_UNKNOWNADDR,		EHOSTUNREACH },
	{ RPC_TLIERROR,			0 },
	{ RPC_NOBROADCAST,		EHOSTUNREACH },
	{ RPC_N2AXLATEFAILURE,		ECONNREFUSED },
	{ RPC_UDERROR,			0 },
	{ RPC_INPROGRESS,		0 },
	{ RPC_STALERACHANDLE,		EINVAL },
	{ RPC_CANTCONNECT,		ECONNREFUSED },
	{ RPC_XPRTFAILED,		ECONNABORTED },
	{ RPC_CANTCREATESTREAM,		ECONNREFUSED },
	{ RPC_CANTSTORE,		ENOBUFS }
};
2966 
2967 /*
2968  * nfs4_try_failover - determine whether the client should
2969  * attempt failover based on the values stored in the nfs4_error_t.
2970  */
2971 int
2972 nfs4_try_failover(nfs4_error_t *ep)
2973 {
2974 	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
2975 		return (TRUE);
2976 
2977 	if (ep->error && ep->rpc_status != RPC_SUCCESS)
2978 		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
2979 
2980 	return (FALSE);
2981 }
2982 
2983 /*
2984  * try_failover - internal version of nfs4_try_failover, called
2985  * only by rfscall and aclcall.  Determine if failover is warranted
2986  * based on the clnt_stat and return the error number if it is.
2987  */
2988 static int
2989 try_failover(enum clnt_stat rpc_status)
2990 {
2991 	int err = 0;
2992 
2993 	if (rpc_status == RPC_SUCCESS)
2994 		return (0);
2995 
2996 #ifdef	DEBUG
2997 	if (rpc_status != 0 && nfs4_try_failover_any) {
2998 		err = ETIMEDOUT;
2999 		goto done;
3000 	}
3001 #endif
3002 	/*
3003 	 * The rpc status is used as an index into the table.
3004 	 * If the rpc status is outside of the range of the
3005 	 * table or if the rpc error numbers have been changed
3006 	 * since the table was constructed, then print a warning
3007 	 * (DEBUG only) and try failover anyway.  Otherwise, just
3008 	 * grab the resulting error number out of the table.
3009 	 */
3010 	if (rpc_status < RPC_SUCCESS || rpc_status >=
3011 	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3012 	    try_failover_table[rpc_status].cstat != rpc_status) {
3013 
3014 		err = ETIMEDOUT;
3015 #ifdef	DEBUG
3016 		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3017 			rpc_status);
3018 #endif
3019 	} else
3020 		err = try_failover_table[rpc_status].error;
3021 
3022 done:
3023 	if (rpc_status)
3024 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3025 			"nfs4_try_failover: %strying failover on error %d",
3026 			err ? "" : "NOT ", rpc_status));
3027 
3028 	return (err);
3029 }
3030 
3031 void
3032 nfs4_error_zinit(nfs4_error_t *ep)
3033 {
3034 	ep->error = 0;
3035 	ep->stat = NFS4_OK;
3036 	ep->rpc_status = RPC_SUCCESS;
3037 }
3038 
3039 void
3040 nfs4_error_init(nfs4_error_t *ep, int error)
3041 {
3042 	ep->error = error;
3043 	ep->stat = NFS4_OK;
3044 	ep->rpc_status = RPC_SUCCESS;
3045 }
3046 
3047 
3048 #ifdef DEBUG
3049 
/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 *
 * The first (len & ~3) bytes are read as native 32-bit words; each word
 * is XORed into the key together with its own upper half.  The 0-3
 * bytes left over after the last whole word are then XORed in one byte
 * at a time, and the low 16 bits of the key are returned.
 *
 * A NULL pointer or a non-positive length hashes to 0.
 *
 * NOTE(review): this is documented as the same algorithm as the NFS v3
 * hash; confirm whether that copy needs the same trailing-byte handling.
 * Results for lengths that are a multiple of 4 are unaffected.
 */
int
hash16(void *p, int len)
{
	uint8_t *bp = p;
	uint32_t *wp;
	uint32_t key = 0;
	int i, rem;

	if (p == NULL || len <= 0)
		return (0);

	/* Split len into whole words plus the 0-3 bytes that remain. */
	rem = len & 3;
	len &= ~3;

	for (i = 0, wp = (uint32_t *)p; i < len; i += 4, wp++) {
		key ^= (*wp >> 16) ^ *wp;
	}

	/*
	 * Hash left-over bytes.  They sit directly after the last whole
	 * word, i.e. at offset len, not at the start of the buffer.
	 */
	for (i = 0; i < rem; i++)
		key ^= bp[len + i];

	return (key & 0xffff);
}
3076 
3077 /*
3078  * rnode4info - return filehandle and path information for an rnode.
3079  * XXX MT issues: uses a single static buffer, no locking of path.
3080  */
3081 char *
3082 rnode4info(rnode4_t *rp)
3083 {
3084 	static char buf[80];
3085 	nfs4_fhandle_t fhandle;
3086 	char *path;
3087 	char *type;
3088 
3089 	if (rp == NULL)
3090 		return ("null");
3091 	if (rp->r_flags & R4ISXATTR)
3092 		type = "attr";
3093 	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3094 		type = "attrdir";
3095 	else if (RTOV4(rp)->v_flag & VROOT)
3096 		type = "root";
3097 	else if (RTOV4(rp)->v_type == VDIR)
3098 		type = "dir";
3099 	else if (RTOV4(rp)->v_type == VREG)
3100 		type = "file";
3101 	else
3102 		type = "other";
3103 	sfh4_copyval(rp->r_fh, &fhandle);
3104 	path = fn_path(rp->r_svnode.sv_name);
3105 	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3106 	    (void *)rp, path, type, rp->r_flags,
3107 	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3108 	kmem_free(path, strlen(path)+1);
3109 	return (buf);
3110 }
3111 #endif
3112