xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_subr.c (revision 8bf842e0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27  */
28 
29 /*
30  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
31  *	All Rights Reserved
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cmn_err.h>
38 #include <sys/vtrace.h>
39 #include <sys/session.h>
40 #include <sys/thread.h>
41 #include <sys/dnlc.h>
42 #include <sys/cred.h>
43 #include <sys/priv.h>
44 #include <sys/list.h>
45 #include <sys/sdt.h>
46 #include <sys/policy.h>
47 
48 #include <rpc/types.h>
49 #include <rpc/xdr.h>
50 
51 #include <nfs/nfs.h>
52 
53 #include <nfs/nfs_clnt.h>
54 
55 #include <nfs/nfs4.h>
56 #include <nfs/rnode4.h>
57 #include <nfs/nfs4_clnt.h>
58 
59 /*
 * Client side statistics.
 *
 * Names and data types of the client kstat counters; every counter is a
 * 64-bit unsigned value.  The last four entries are compiled in only for
 * DEBUG kernels.
 * NOTE(review): presumably used to initialize each struct nfs4_clnt's
 * nfscl_stat -- the copy is not visible in this part of the file; confirm.
 */
62 static const struct clstat4 clstat4_tmpl = {
63 	{ "calls",	KSTAT_DATA_UINT64 },
64 	{ "badcalls",	KSTAT_DATA_UINT64 },
65 	{ "referrals",	KSTAT_DATA_UINT64 },
66 	{ "referlinks",	KSTAT_DATA_UINT64 },
67 	{ "clgets",	KSTAT_DATA_UINT64 },
68 	{ "cltoomany",	KSTAT_DATA_UINT64 },
69 #ifdef DEBUG
70 	{ "clalloc",	KSTAT_DATA_UINT64 },
71 	{ "noresponse",	KSTAT_DATA_UINT64 },
72 	{ "failover",	KSTAT_DATA_UINT64 },
73 	{ "remap",	KSTAT_DATA_UINT64 },
74 #endif
75 };
76 
77 #ifdef DEBUG
/*
 * Extra named counters that exist only in DEBUG kernels.  Unlike
 * clstat4_tmpl this is a single writable global; clreclaim4_zone() bumps
 * the "clreclaim" entry directly each time it runs.
 */
78 struct clstat4_debug clstat4_debug = {
79 	{ "nrnode",	KSTAT_DATA_UINT64 },
80 	{ "access",	KSTAT_DATA_UINT64 },
81 	{ "dirent",	KSTAT_DATA_UINT64 },
82 	{ "dirents",	KSTAT_DATA_UINT64 },
83 	{ "reclaim",	KSTAT_DATA_UINT64 },
84 	{ "clreclaim",	KSTAT_DATA_UINT64 },
85 	{ "f_reclaim",	KSTAT_DATA_UINT64 },
86 	{ "a_reclaim",	KSTAT_DATA_UINT64 },
87 	{ "r_reclaim",	KSTAT_DATA_UINT64 },
88 	{ "r_path",	KSTAT_DATA_UINT64 },
89 };
90 #endif
91 
92 /*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.  nfs4_clnt_list_lock is held while the list is
 * walked (see clreclaim4()).
 */
96 static list_t nfs4_clnt_list;
97 static kmutex_t nfs4_clnt_list_lock;
98 zone_key_t nfs4clnt_zone_key;
99 
/* kmem cache that supplies the struct chtab entries handed out by clget4() */
100 static struct kmem_cache *chtab4_cache;
101 
102 #ifdef DEBUG
/*
 * DEBUG-only knobs.  nfs4_utf8_debug, when non-zero, makes utf8_to_fn()
 * log the names it rejects.  NOTE(review): the users of the other two
 * variables are not visible in this part of the file.
 */
103 static int nfs4_rfscall_debug;
104 static int nfs4_try_failover_any;
105 int nfs4_utf8_debug = 0;
106 #endif
107 
108 /*
 * NFSv4 readdir cache implementation
 *
 * Private wrapper that pairs a rddir4_cache element with the bookkeeping
 * this file needs for it: a reference count, the lock protecting that
 * count, and the linkage used to keep the element in an AVL tree.
 */
111 typedef struct rddir4_cache_impl {
112 	rddir4_cache	rc;		/* readdir cache element */
113 	kmutex_t	lock;		/* lock protects count */
114 	uint_t		count;		/* reference count */
115 	avl_node_t	tree;		/* AVL tree link */
116 } rddir4_cache_impl;
117 
/*
 * Forward declarations of file-local helpers.  Their definitions are not
 * in this part of the file.
 */
118 static int rddir4_cache_compar(const void *, const void *);
119 static void rddir4_cache_free(rddir4_cache_impl *);
120 static rddir4_cache *rddir4_cache_alloc(int);
121 static void rddir4_cache_hold(rddir4_cache *);
122 static int try_failover(enum clnt_stat);
123 
/*
 * NOTE(review): presumably tallies of readdir cache hits, waits and misses;
 * the code that updates them is not visible here -- confirm.
 */
124 static int nfs4_readdir_cache_hits = 0;
125 static int nfs4_readdir_cache_waits = 0;
126 static int nfs4_readdir_cache_misses = 0;
127 
128 /*
129  * Shared nfs4 functions
130  */
131 
132 /*
133  * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
134  * be allocated.
135  */
136 
137 void
nfs_fh4_copy(nfs_fh4 * from,nfs_fh4 * to)138 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
139 {
140 	to->nfs_fh4_len = from->nfs_fh4_len;
141 	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
142 }
143 
144 /*
145  * nfs4cmpfh - compare 2 filehandles.
146  * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
147  * "less" than the second, +1 if the first is "greater" than the second.
148  */
149 
150 int
nfs4cmpfh(const nfs_fh4 * fh4p1,const nfs_fh4 * fh4p2)151 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
152 {
153 	const char *c1, *c2;
154 
155 	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
156 		return (-1);
157 	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
158 		return (1);
159 	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
160 	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
161 	    c1++, c2++) {
162 		if (*c1 < *c2)
163 			return (-1);
164 		if (*c1 > *c2)
165 			return (1);
166 	}
167 
168 	return (0);
169 }
170 
171 /*
172  * Compare two v4 filehandles.  Return zero if they're the same, non-zero
173  * if they're not.  Like nfs4cmpfh(), but different filehandle
174  * representation, and doesn't provide information about greater than or
175  * less than.
176  */
177 
178 int
nfs4cmpfhandle(nfs4_fhandle_t * fh1,nfs4_fhandle_t * fh2)179 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
180 {
181 	if (fh1->fh_len == fh2->fh_len)
182 		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
183 
184 	return (1);
185 }
186 
187 int
stateid4_cmp(stateid4 * s1,stateid4 * s2)188 stateid4_cmp(stateid4 *s1, stateid4 *s2)
189 {
190 	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
191 		return (1);
192 	else
193 		return (0);
194 }
195 
196 nfsstat4
puterrno4(int error)197 puterrno4(int error)
198 {
199 	switch (error) {
200 	case 0:
201 		return (NFS4_OK);
202 	case EPERM:
203 		return (NFS4ERR_PERM);
204 	case ENOENT:
205 		return (NFS4ERR_NOENT);
206 	case EINTR:
207 		return (NFS4ERR_IO);
208 	case EIO:
209 		return (NFS4ERR_IO);
210 	case ENXIO:
211 		return (NFS4ERR_NXIO);
212 	case ENOMEM:
213 		return (NFS4ERR_RESOURCE);
214 	case EACCES:
215 		return (NFS4ERR_ACCESS);
216 	case EBUSY:
217 		return (NFS4ERR_IO);
218 	case EEXIST:
219 		return (NFS4ERR_EXIST);
220 	case EXDEV:
221 		return (NFS4ERR_XDEV);
222 	case ENODEV:
223 		return (NFS4ERR_IO);
224 	case ENOTDIR:
225 		return (NFS4ERR_NOTDIR);
226 	case EISDIR:
227 		return (NFS4ERR_ISDIR);
228 	case EINVAL:
229 		return (NFS4ERR_INVAL);
230 	case EMFILE:
231 		return (NFS4ERR_RESOURCE);
232 	case EFBIG:
233 		return (NFS4ERR_FBIG);
234 	case ENOSPC:
235 		return (NFS4ERR_NOSPC);
236 	case EROFS:
237 		return (NFS4ERR_ROFS);
238 	case EMLINK:
239 		return (NFS4ERR_MLINK);
240 	case EDEADLK:
241 		return (NFS4ERR_DEADLOCK);
242 	case ENOLCK:
243 		return (NFS4ERR_DENIED);
244 	case EREMOTE:
245 		return (NFS4ERR_SERVERFAULT);
246 	case ENOTSUP:
247 		return (NFS4ERR_NOTSUPP);
248 	case EDQUOT:
249 		return (NFS4ERR_DQUOT);
250 	case ENAMETOOLONG:
251 		return (NFS4ERR_NAMETOOLONG);
252 	case EOVERFLOW:
253 		return (NFS4ERR_INVAL);
254 	case ENOSYS:
255 		return (NFS4ERR_NOTSUPP);
256 	case ENOTEMPTY:
257 		return (NFS4ERR_NOTEMPTY);
258 	case EOPNOTSUPP:
259 		return (NFS4ERR_NOTSUPP);
260 	case ESTALE:
261 		return (NFS4ERR_STALE);
262 	case EAGAIN:
263 		if (curthread->t_flag & T_WOULDBLOCK) {
264 			curthread->t_flag &= ~T_WOULDBLOCK;
265 			return (NFS4ERR_DELAY);
266 		}
267 		return (NFS4ERR_LOCKED);
268 	default:
269 		return ((enum nfsstat4)error);
270 	}
271 }
272 
273 int
geterrno4(enum nfsstat4 status)274 geterrno4(enum nfsstat4 status)
275 {
276 	switch (status) {
277 	case NFS4_OK:
278 		return (0);
279 	case NFS4ERR_PERM:
280 		return (EPERM);
281 	case NFS4ERR_NOENT:
282 		return (ENOENT);
283 	case NFS4ERR_IO:
284 		return (EIO);
285 	case NFS4ERR_NXIO:
286 		return (ENXIO);
287 	case NFS4ERR_ACCESS:
288 		return (EACCES);
289 	case NFS4ERR_EXIST:
290 		return (EEXIST);
291 	case NFS4ERR_XDEV:
292 		return (EXDEV);
293 	case NFS4ERR_NOTDIR:
294 		return (ENOTDIR);
295 	case NFS4ERR_ISDIR:
296 		return (EISDIR);
297 	case NFS4ERR_INVAL:
298 		return (EINVAL);
299 	case NFS4ERR_FBIG:
300 		return (EFBIG);
301 	case NFS4ERR_NOSPC:
302 		return (ENOSPC);
303 	case NFS4ERR_ROFS:
304 		return (EROFS);
305 	case NFS4ERR_MLINK:
306 		return (EMLINK);
307 	case NFS4ERR_NAMETOOLONG:
308 		return (ENAMETOOLONG);
309 	case NFS4ERR_NOTEMPTY:
310 		return (ENOTEMPTY);
311 	case NFS4ERR_DQUOT:
312 		return (EDQUOT);
313 	case NFS4ERR_STALE:
314 		return (ESTALE);
315 	case NFS4ERR_BADHANDLE:
316 		return (ESTALE);
317 	case NFS4ERR_BAD_COOKIE:
318 		return (EINVAL);
319 	case NFS4ERR_NOTSUPP:
320 		return (EOPNOTSUPP);
321 	case NFS4ERR_TOOSMALL:
322 		return (EINVAL);
323 	case NFS4ERR_SERVERFAULT:
324 		return (EIO);
325 	case NFS4ERR_BADTYPE:
326 		return (EINVAL);
327 	case NFS4ERR_DELAY:
328 		return (ENXIO);
329 	case NFS4ERR_SAME:
330 		return (EPROTO);
331 	case NFS4ERR_DENIED:
332 		return (ENOLCK);
333 	case NFS4ERR_EXPIRED:
334 		return (EPROTO);
335 	case NFS4ERR_LOCKED:
336 		return (EACCES);
337 	case NFS4ERR_GRACE:
338 		return (EAGAIN);
339 	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
340 		return (ESTALE);
341 	case NFS4ERR_SHARE_DENIED:
342 		return (EACCES);
343 	case NFS4ERR_WRONGSEC:
344 		return (EPERM);
345 	case NFS4ERR_CLID_INUSE:
346 		return (EAGAIN);
347 	case NFS4ERR_RESOURCE:
348 		return (EAGAIN);
349 	case NFS4ERR_MOVED:
350 		return (EPROTO);
351 	case NFS4ERR_NOFILEHANDLE:
352 		return (EIO);
353 	case NFS4ERR_MINOR_VERS_MISMATCH:
354 		return (ENOTSUP);
355 	case NFS4ERR_STALE_CLIENTID:
356 		return (EIO);
357 	case NFS4ERR_STALE_STATEID:
358 		return (EIO);
359 	case NFS4ERR_OLD_STATEID:
360 		return (EIO);
361 	case NFS4ERR_BAD_STATEID:
362 		return (EIO);
363 	case NFS4ERR_BAD_SEQID:
364 		return (EIO);
365 	case NFS4ERR_NOT_SAME:
366 		return (EPROTO);
367 	case NFS4ERR_LOCK_RANGE:
368 		return (EPROTO);
369 	case NFS4ERR_SYMLINK:
370 		return (EPROTO);
371 	case NFS4ERR_RESTOREFH:
372 		return (EPROTO);
373 	case NFS4ERR_LEASE_MOVED:
374 		return (EPROTO);
375 	case NFS4ERR_ATTRNOTSUPP:
376 		return (ENOTSUP);
377 	case NFS4ERR_NO_GRACE:
378 		return (EPROTO);
379 	case NFS4ERR_RECLAIM_BAD:
380 		return (EPROTO);
381 	case NFS4ERR_RECLAIM_CONFLICT:
382 		return (EPROTO);
383 	case NFS4ERR_BADXDR:
384 		return (EINVAL);
385 	case NFS4ERR_LOCKS_HELD:
386 		return (EIO);
387 	case NFS4ERR_OPENMODE:
388 		return (EACCES);
389 	case NFS4ERR_BADOWNER:
390 		/*
391 		 * Client and server are in different DNS domains
392 		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
393 		 * doesn't match.  No good answer here.  Return
394 		 * EACCESS, which translates to "permission denied".
395 		 */
396 		return (EACCES);
397 	case NFS4ERR_BADCHAR:
398 		return (EINVAL);
399 	case NFS4ERR_BADNAME:
400 		return (EINVAL);
401 	case NFS4ERR_BAD_RANGE:
402 		return (EIO);
403 	case NFS4ERR_LOCK_NOTSUPP:
404 		return (ENOTSUP);
405 	case NFS4ERR_OP_ILLEGAL:
406 		return (EINVAL);
407 	case NFS4ERR_DEADLOCK:
408 		return (EDEADLK);
409 	case NFS4ERR_FILE_OPEN:
410 		return (EACCES);
411 	case NFS4ERR_ADMIN_REVOKED:
412 		return (EPROTO);
413 	case NFS4ERR_CB_PATH_DOWN:
414 		return (EPROTO);
415 	default:
416 #ifdef DEBUG
417 		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
418 		    status);
419 #endif
420 		return ((int)status);
421 	}
422 }
423 
424 void
nfs4_log_badowner(mntinfo4_t * mi,nfs_opnum4 op)425 nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
426 {
427 	nfs4_server_t *server;
428 
429 	/*
430 	 * Return if already printed/queued a msg
431 	 * for this mount point.
432 	 */
433 	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
434 		return;
435 	/*
436 	 * Happens once per client <-> server pair.
437 	 */
438 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
439 	    mi->mi_flags & MI4_INT))
440 		return;
441 
442 	server = find_nfs4_server(mi);
443 	if (server == NULL) {
444 		nfs_rw_exit(&mi->mi_recovlock);
445 		return;
446 	}
447 
448 	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
449 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
450 		    "!NFSMAPID_DOMAIN does not match"
451 		    " the server: %s domain.\n"
452 		    "Please check configuration",
453 		    mi->mi_curr_serv->sv_hostname);
454 		server->s_flags |= N4S_BADOWNER_DEBUG;
455 	}
456 	mutex_exit(&server->s_lock);
457 	nfs4_server_rele(server);
458 	nfs_rw_exit(&mi->mi_recovlock);
459 
460 	/*
461 	 * Happens once per mntinfo4_t.
462 	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
463 	 * queue this in the mesg queue for this mount_info. This message
464 	 * is not printed, meaning its absent from id_to_dump_solo_fact()
465 	 * but its there for inspection if the queue is ever dumped/inspected.
466 	 */
467 	mutex_enter(&mi->mi_lock);
468 	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
469 		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
470 		    FALSE, NULL, 0, NULL);
471 		mi->mi_flags |= MI4_BADOWNER_DEBUG;
472 	}
473 	mutex_exit(&mi->mi_lock);
474 }
475 
476 int
nfs4_time_ntov(nfstime4 * ntime,timestruc_t * vatime)477 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
478 {
479 	int64_t sec;
480 	int32_t nsec;
481 
482 	/*
483 	 * Here check that the nfsv4 time is valid for the system.
484 	 * nfsv4 time value is a signed 64-bit, and the system time
485 	 * may be either int64_t or int32_t (depends on the kernel),
486 	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
487 	 */
488 #ifndef _LP64
489 	if (! NFS4_TIME_OK(ntime->seconds)) {
490 		return (EOVERFLOW);
491 	}
492 #endif
493 
494 	/* Invalid to specify 1 billion (or more) nsecs */
495 	if (ntime->nseconds >= 1000000000)
496 		return (EINVAL);
497 
498 	if (ntime->seconds < 0) {
499 		sec = ntime->seconds + 1;
500 		nsec = -1000000000 + ntime->nseconds;
501 	} else {
502 		sec = ntime->seconds;
503 		nsec = ntime->nseconds;
504 	}
505 
506 	vatime->tv_sec = sec;
507 	vatime->tv_nsec = nsec;
508 
509 	return (0);
510 }
511 
512 int
nfs4_time_vton(timestruc_t * vatime,nfstime4 * ntime)513 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
514 {
515 	int64_t sec;
516 	uint32_t nsec;
517 
518 	/*
519 	 * nfsv4 time value is a signed 64-bit, and the system time
520 	 * may be either int64_t or int32_t (depends on the kernel),
521 	 * so all system time values will fit.
522 	 */
523 	if (vatime->tv_nsec >= 0) {
524 		sec = vatime->tv_sec;
525 		nsec = vatime->tv_nsec;
526 	} else {
527 		sec = vatime->tv_sec - 1;
528 		nsec = 1000000000 + vatime->tv_nsec;
529 	}
530 	ntime->seconds = sec;
531 	ntime->nseconds = nsec;
532 
533 	return (0);
534 }
535 
536 /*
537  * Converts a utf8 string to a valid null terminated filename string.
538  *
539  * XXX - Not actually translating the UTF-8 string as per RFC 2279.
540  *	 For now, just validate that the UTF-8 string off the wire
541  *	 does not have characters that will freak out UFS, and leave
542  *	 it at that.
543  */
544 char *
utf8_to_fn(utf8string * u8s,uint_t * lenp,char * s)545 utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
546 {
547 	ASSERT(lenp != NULL);
548 
549 	if (u8s == NULL || u8s->utf8string_len <= 0 ||
550 	    u8s->utf8string_val == NULL)
551 		return (NULL);
552 
553 	/*
554 	 * Check for obvious illegal filename chars
555 	 */
556 	if (utf8_strchr(u8s, '/') != NULL) {
557 #ifdef DEBUG
558 		if (nfs4_utf8_debug) {
559 			char *path;
560 			int len = u8s->utf8string_len;
561 
562 			path = kmem_alloc(len + 1, KM_SLEEP);
563 			bcopy(u8s->utf8string_val, path, len);
564 			path[len] = '\0';
565 
566 			zcmn_err(getzoneid(), CE_WARN,
567 			    "Invalid UTF-8 filename: %s", path);
568 
569 			kmem_free(path, len + 1);
570 		}
571 #endif
572 		return (NULL);
573 	}
574 
575 	return (utf8_to_str(u8s, lenp, s));
576 }
577 
578 /*
579  * Converts a utf8 string to a C string.
580  * kmem_allocs a new string if not supplied
581  */
582 char *
utf8_to_str(utf8string * str,uint_t * lenp,char * s)583 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
584 {
585 	char	*sp;
586 	char	*u8p;
587 	int	len;
588 	int	 i;
589 
590 	ASSERT(lenp != NULL);
591 
592 	if (str == NULL)
593 		return (NULL);
594 
595 	u8p = str->utf8string_val;
596 	len = str->utf8string_len;
597 	if (len <= 0 || u8p == NULL) {
598 		if (s)
599 			*s = '\0';
600 		return (NULL);
601 	}
602 
603 	sp = s;
604 	if (sp == NULL)
605 		sp = kmem_alloc(len + 1, KM_SLEEP);
606 
607 	/*
608 	 * At least check for embedded nulls
609 	 */
610 	for (i = 0; i < len; i++) {
611 		sp[i] = u8p[i];
612 		if (u8p[i] == '\0') {
613 #ifdef	DEBUG
614 			zcmn_err(getzoneid(), CE_WARN,
615 			    "Embedded NULL in UTF-8 string");
616 #endif
617 			if (s == NULL)
618 				kmem_free(sp, len + 1);
619 			return (NULL);
620 		}
621 	}
622 	sp[len] = '\0';
623 	*lenp = len + 1;
624 
625 	return (sp);
626 }
627 
628 /*
629  * str_to_utf8 - converts a null-terminated C string to a utf8 string
630  */
631 utf8string *
str_to_utf8(char * nm,utf8string * str)632 str_to_utf8(char *nm, utf8string *str)
633 {
634 	int len;
635 
636 	if (str == NULL)
637 		return (NULL);
638 
639 	if (nm == NULL || *nm == '\0') {
640 		str->utf8string_len = 0;
641 		str->utf8string_val = NULL;
642 	}
643 
644 	len = strlen(nm);
645 
646 	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
647 	str->utf8string_len = len;
648 	bcopy(nm, str->utf8string_val, len);
649 
650 	return (str);
651 }
652 
653 utf8string *
utf8_copy(utf8string * src,utf8string * dest)654 utf8_copy(utf8string *src, utf8string *dest)
655 {
656 	if (src == NULL)
657 		return (NULL);
658 	if (dest == NULL)
659 		return (NULL);
660 
661 	if (src->utf8string_len > 0) {
662 		dest->utf8string_val = kmem_alloc(src->utf8string_len,
663 		    KM_SLEEP);
664 		bcopy(src->utf8string_val, dest->utf8string_val,
665 		    src->utf8string_len);
666 		dest->utf8string_len = src->utf8string_len;
667 	} else {
668 		dest->utf8string_val = NULL;
669 		dest->utf8string_len = 0;
670 	}
671 
672 	return (dest);
673 }
674 
675 int
utf8_compare(const utf8string * a,const utf8string * b)676 utf8_compare(const utf8string *a, const utf8string *b)
677 {
678 	int mlen, cmp;
679 	int alen, blen;
680 	char *aval, *bval;
681 
682 	if ((a == NULL) && (b == NULL))
683 		return (0);
684 	else if (a == NULL)
685 		return (-1);
686 	else if (b == NULL)
687 		return (1);
688 
689 	alen = a->utf8string_len;
690 	blen = b->utf8string_len;
691 	aval = a->utf8string_val;
692 	bval = b->utf8string_val;
693 
694 	if (((alen == 0) || (aval == NULL)) &&
695 	    ((blen == 0) || (bval == NULL)))
696 		return (0);
697 	else if ((alen == 0) || (aval == NULL))
698 		return (-1);
699 	else if ((blen == 0) || (bval == NULL))
700 		return (1);
701 
702 	mlen = MIN(alen, blen);
703 	cmp = strncmp(aval, bval, mlen);
704 
705 	if ((cmp == 0) && (alen == blen))
706 		return (0);
707 	else if ((cmp == 0) && (alen < blen))
708 		return (-1);
709 	else if (cmp == 0)
710 		return (1);
711 	else if (cmp < 0)
712 		return (-1);
713 	return (1);
714 }
715 
716 /*
717  * utf8_dir_verify - checks that the utf8 string is valid
718  */
719 nfsstat4
utf8_dir_verify(utf8string * str)720 utf8_dir_verify(utf8string *str)
721 {
722 	char *nm;
723 	int len;
724 
725 	if (str == NULL)
726 		return (NFS4ERR_INVAL);
727 
728 	nm = str->utf8string_val;
729 	len = str->utf8string_len;
730 	if (nm == NULL || len == 0) {
731 		return (NFS4ERR_INVAL);
732 	}
733 
734 	if (len == 1 && nm[0] == '.')
735 		return (NFS4ERR_BADNAME);
736 	if (len == 2 && nm[0] == '.' && nm[1] == '.')
737 		return (NFS4ERR_BADNAME);
738 
739 	if (utf8_strchr(str, '/') != NULL)
740 		return (NFS4ERR_BADNAME);
741 
742 	if (utf8_strchr(str, '\0') != NULL)
743 		return (NFS4ERR_BADNAME);
744 
745 	return (NFS4_OK);
746 }
747 
748 /*
 * from rpcsec module (common/rpcsec)
 *
 * In this file sec_clnt_geth() is what authget() calls to obtain an AUTH
 * handle for a client handle, and sec_clnt_freeh() is what clfree4() calls
 * to release one.
 */
751 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
752 extern void sec_clnt_freeh(AUTH *);
753 extern void sec_clnt_freeinfo(struct sec_data *);
754 
755 /*
756  * authget() gets an auth handle based on the security
757  * information from the servinfo in mountinfo.
758  * The auth handle is stored in ch_client->cl_auth.
759  *
760  * First security flavor of choice is to use sv_secdata
761  * which is initiated by the client. If that fails, get
762  * secinfo from the server and then select one from the
763  * server secinfo list .
764  *
765  * For RPCSEC_GSS flavor, upon success, a secure context is
766  * established between client and server.
767  */
768 int
authget(servinfo4_t * svp,CLIENT * ch_client,cred_t * cr)769 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
770 {
771 	int error, i;
772 
773 	/*
774 	 * SV4_TRYSECINFO indicates to try the secinfo list from
775 	 * sv_secinfo until a successful one is reached. Point
776 	 * sv_currsec to the selected security mechanism for
777 	 * later sessions.
778 	 */
779 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
780 	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
781 		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
782 		    i++) {
783 			if (!(error = sec_clnt_geth(ch_client,
784 			    &svp->sv_secinfo->sdata[i],
785 			    cr, &ch_client->cl_auth))) {
786 
787 				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
788 				svp->sv_secinfo->index = i;
789 				/* done */
790 				svp->sv_flags &= ~SV4_TRYSECINFO;
791 				break;
792 			}
793 
794 			/*
795 			 * Allow the caller retry with the security flavor
796 			 * pointed by svp->sv_secinfo->index when
797 			 * ETIMEDOUT/ECONNRESET occurs.
798 			 */
799 			if (error == ETIMEDOUT || error == ECONNRESET) {
800 				svp->sv_secinfo->index = i;
801 				break;
802 			}
803 		}
804 	} else {
805 		/* sv_currsec points to one of the entries in sv_secinfo */
806 		if (svp->sv_currsec) {
807 			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
808 			    &ch_client->cl_auth);
809 		} else {
810 			/* If it's null, use sv_secdata. */
811 			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
812 			    &ch_client->cl_auth);
813 		}
814 	}
815 	nfs_rw_exit(&svp->sv_lock);
816 
817 	return (error);
818 }
819 
820 /*
821  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
822  */
823 int
clget4(clinfo_t * ci,servinfo4_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp,struct nfs4_clnt * nfscl)824 clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
825     struct chtab **chp, struct nfs4_clnt *nfscl)
826 {
827 	struct chhead *ch, *newch;
828 	struct chhead **plistp;
829 	struct chtab *cp;
830 	int error;
831 	k_sigset_t smask;
832 
833 	if (newcl == NULL || chp == NULL || ci == NULL)
834 		return (EINVAL);
835 
836 	*newcl = NULL;
837 	*chp = NULL;
838 
839 	/*
840 	 * Find an unused handle or create one
841 	 */
842 	newch = NULL;
843 	nfscl->nfscl_stat.clgets.value.ui64++;
844 top:
845 	/*
846 	 * Find the correct entry in the cache to check for free
847 	 * client handles.  The search is based on the RPC program
848 	 * number, program version number, dev_t for the transport
849 	 * device, and the protocol family.
850 	 */
851 	mutex_enter(&nfscl->nfscl_chtable4_lock);
852 	plistp = &nfscl->nfscl_chtable4;
853 	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
854 		if (ch->ch_prog == ci->cl_prog &&
855 		    ch->ch_vers == ci->cl_vers &&
856 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
857 		    (strcmp(ch->ch_protofmly,
858 		    svp->sv_knconf->knc_protofmly) == 0))
859 			break;
860 		plistp = &ch->ch_next;
861 	}
862 
863 	/*
864 	 * If we didn't find a cache entry for this quadruple, then
865 	 * create one.  If we don't have one already preallocated,
866 	 * then drop the cache lock, create one, and then start over.
867 	 * If we did have a preallocated entry, then just add it to
868 	 * the front of the list.
869 	 */
870 	if (ch == NULL) {
871 		if (newch == NULL) {
872 			mutex_exit(&nfscl->nfscl_chtable4_lock);
873 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
874 			newch->ch_timesused = 0;
875 			newch->ch_prog = ci->cl_prog;
876 			newch->ch_vers = ci->cl_vers;
877 			newch->ch_dev = svp->sv_knconf->knc_rdev;
878 			newch->ch_protofmly = kmem_alloc(
879 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
880 			    KM_SLEEP);
881 			(void) strcpy(newch->ch_protofmly,
882 			    svp->sv_knconf->knc_protofmly);
883 			newch->ch_list = NULL;
884 			goto top;
885 		}
886 		ch = newch;
887 		newch = NULL;
888 		ch->ch_next = nfscl->nfscl_chtable4;
889 		nfscl->nfscl_chtable4 = ch;
890 	/*
891 	 * We found a cache entry, but if it isn't on the front of the
892 	 * list, then move it to the front of the list to try to take
893 	 * advantage of locality of operations.
894 	 */
895 	} else if (ch != nfscl->nfscl_chtable4) {
896 		*plistp = ch->ch_next;
897 		ch->ch_next = nfscl->nfscl_chtable4;
898 		nfscl->nfscl_chtable4 = ch;
899 	}
900 
901 	/*
902 	 * If there was a free client handle cached, then remove it
903 	 * from the list, init it, and use it.
904 	 */
905 	if (ch->ch_list != NULL) {
906 		cp = ch->ch_list;
907 		ch->ch_list = cp->ch_list;
908 		mutex_exit(&nfscl->nfscl_chtable4_lock);
909 		if (newch != NULL) {
910 			kmem_free(newch->ch_protofmly,
911 			    strlen(newch->ch_protofmly) + 1);
912 			kmem_free(newch, sizeof (*newch));
913 		}
914 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
915 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
916 
917 		/*
918 		 * Get an auth handle.
919 		 */
920 		error = authget(svp, cp->ch_client, cr);
921 		if (error || cp->ch_client->cl_auth == NULL) {
922 			CLNT_DESTROY(cp->ch_client);
923 			kmem_cache_free(chtab4_cache, cp);
924 			return ((error != 0) ? error : EINTR);
925 		}
926 		ch->ch_timesused++;
927 		*newcl = cp->ch_client;
928 		*chp = cp;
929 		return (0);
930 	}
931 
932 	/*
933 	 * There weren't any free client handles which fit, so allocate
934 	 * a new one and use that.
935 	 */
936 #ifdef DEBUG
937 	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
938 #endif
939 	mutex_exit(&nfscl->nfscl_chtable4_lock);
940 
941 	nfscl->nfscl_stat.cltoomany.value.ui64++;
942 	if (newch != NULL) {
943 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
944 		kmem_free(newch, sizeof (*newch));
945 	}
946 
947 	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
948 	cp->ch_head = ch;
949 
950 	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
951 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
952 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
953 	sigunintr(&smask);
954 
955 	if (error != 0) {
956 		kmem_cache_free(chtab4_cache, cp);
957 #ifdef DEBUG
958 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
959 #endif
960 		/*
961 		 * Warning is unnecessary if error is EINTR.
962 		 */
963 		if (error != EINTR) {
964 			nfs_cmn_err(error, CE_WARN,
965 			    "clget: couldn't create handle: %m\n");
966 		}
967 		return (error);
968 	}
969 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
970 	auth_destroy(cp->ch_client->cl_auth);
971 
972 	/*
973 	 * Get an auth handle.
974 	 */
975 	error = authget(svp, cp->ch_client, cr);
976 	if (error || cp->ch_client->cl_auth == NULL) {
977 		CLNT_DESTROY(cp->ch_client);
978 		kmem_cache_free(chtab4_cache, cp);
979 #ifdef DEBUG
980 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
981 #endif
982 		return ((error != 0) ? error : EINTR);
983 	}
984 	ch->ch_timesused++;
985 	*newcl = cp->ch_client;
986 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
987 	*chp = cp;
988 	return (0);
989 }
990 
991 static int
nfs_clget4(mntinfo4_t * mi,servinfo4_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp,struct nfs4_clnt * nfscl)992 nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
993     struct chtab **chp, struct nfs4_clnt *nfscl)
994 {
995 	clinfo_t ci;
996 	bool_t is_recov;
997 	int firstcall, error = 0;
998 
999 	/*
1000 	 * Set read buffer size to rsize
1001 	 * and add room for RPC headers.
1002 	 */
1003 	ci.cl_readsize = mi->mi_tsize;
1004 	if (ci.cl_readsize != 0)
1005 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
1006 
1007 	/*
1008 	 * If soft mount and server is down just try once.
1009 	 * meaning: do not retransmit.
1010 	 */
1011 	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
1012 		ci.cl_retrans = 0;
1013 	else
1014 		ci.cl_retrans = mi->mi_retrans;
1015 
1016 	ci.cl_prog = mi->mi_prog;
1017 	ci.cl_vers = mi->mi_vers;
1018 	ci.cl_flags = mi->mi_flags;
1019 
1020 	/*
1021 	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1022 	 * security flavor, the client tries to establish a security context
1023 	 * by contacting the server. If the connection is timed out or reset,
1024 	 * e.g. server reboot, we will try again.
1025 	 */
1026 	is_recov = (curthread == mi->mi_recovthread);
1027 	firstcall = 1;
1028 
1029 	do {
1030 		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
1031 
1032 		if (error == 0)
1033 			break;
1034 
1035 		/*
1036 		 * For forced unmount and zone shutdown, bail out but
1037 		 * let the recovery thread do one more transmission.
1038 		 */
1039 		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
1040 		    (!is_recov || !firstcall)) {
1041 			error = EIO;
1042 			break;
1043 		}
1044 
1045 		/* do not retry for soft mount */
1046 		if (!(mi->mi_flags & MI4_HARD))
1047 			break;
1048 
1049 		/* let the caller deal with the failover case */
1050 		if (FAILOVER_MOUNT4(mi))
1051 			break;
1052 
1053 		firstcall = 0;
1054 
1055 	} while (error == ETIMEDOUT || error == ECONNRESET);
1056 
1057 	return (error);
1058 }
1059 
1060 void
clfree4(CLIENT * cl,struct chtab * cp,struct nfs4_clnt * nfscl)1061 clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
1062 {
1063 	if (cl->cl_auth != NULL) {
1064 		sec_clnt_freeh(cl->cl_auth);
1065 		cl->cl_auth = NULL;
1066 	}
1067 
1068 	/*
1069 	 * Timestamp this cache entry so that we know when it was last
1070 	 * used.
1071 	 */
1072 	cp->ch_freed = gethrestime_sec();
1073 
1074 	/*
1075 	 * Add the free client handle to the front of the list.
1076 	 * This way, the list will be sorted in youngest to oldest
1077 	 * order.
1078 	 */
1079 	mutex_enter(&nfscl->nfscl_chtable4_lock);
1080 	cp->ch_list = cp->ch_head->ch_list;
1081 	cp->ch_head->ch_list = cp;
1082 	mutex_exit(&nfscl->nfscl_chtable4_lock);
1083 }
1084 
1085 #define	CL_HOLDTIME	60	/* seconds a freed client handle is held before clreclaim4() may reclaim it */
1086 
1087 static void
clreclaim4_zone(struct nfs4_clnt * nfscl,uint_t cl_holdtime)1088 clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
1089 {
1090 	struct chhead *ch;
1091 	struct chtab *cp;	/* list of objects that can be reclaimed */
1092 	struct chtab *cpe;
1093 	struct chtab *cpl;
1094 	struct chtab **cpp;
1095 #ifdef DEBUG
1096 	int n = 0;
1097 	clstat4_debug.clreclaim.value.ui64++;
1098 #endif
1099 
1100 	/*
1101 	 * Need to reclaim some memory, so step through the cache
1102 	 * looking through the lists for entries which can be freed.
1103 	 */
1104 	cp = NULL;
1105 
1106 	mutex_enter(&nfscl->nfscl_chtable4_lock);
1107 
1108 	/*
1109 	 * Here we step through each non-NULL quadruple and start to
1110 	 * construct the reclaim list pointed to by cp.  Note that
1111 	 * cp will contain all eligible chtab entries.  When this traversal
1112 	 * completes, chtab entries from the last quadruple will be at the
1113 	 * front of cp and entries from previously inspected quadruples have
1114 	 * been appended to the rear of cp.
1115 	 */
1116 	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
1117 		if (ch->ch_list == NULL)
1118 			continue;
1119 		/*
1120 		 * Search each list for entries older then
1121 		 * cl_holdtime seconds.  The lists are maintained
1122 		 * in youngest to oldest order so that when the
1123 		 * first entry is found which is old enough, then
1124 		 * all of the rest of the entries on the list will
1125 		 * be old enough as well.
1126 		 */
1127 		cpl = ch->ch_list;
1128 		cpp = &ch->ch_list;
1129 		while (cpl != NULL &&
1130 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
1131 			cpp = &cpl->ch_list;
1132 			cpl = cpl->ch_list;
1133 		}
1134 		if (cpl != NULL) {
1135 			*cpp = NULL;
1136 			if (cp != NULL) {
1137 				cpe = cpl;
1138 				while (cpe->ch_list != NULL)
1139 					cpe = cpe->ch_list;
1140 				cpe->ch_list = cp;
1141 			}
1142 			cp = cpl;
1143 		}
1144 	}
1145 
1146 	mutex_exit(&nfscl->nfscl_chtable4_lock);
1147 
1148 	/*
1149 	 * If cp is empty, then there is nothing to reclaim here.
1150 	 */
1151 	if (cp == NULL)
1152 		return;
1153 
1154 	/*
1155 	 * Step through the list of entries to free, destroying each client
1156 	 * handle and kmem_free'ing the memory for each entry.
1157 	 */
1158 	while (cp != NULL) {
1159 #ifdef DEBUG
1160 		n++;
1161 #endif
1162 		CLNT_DESTROY(cp->ch_client);
1163 		cpl = cp->ch_list;
1164 		kmem_cache_free(chtab4_cache, cp);
1165 		cp = cpl;
1166 	}
1167 
1168 #ifdef DEBUG
1169 	/*
1170 	 * Update clalloc so that nfsstat shows the current number
1171 	 * of allocated client handles.
1172 	 */
1173 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
1174 #endif
1175 }
1176 
1177 /* ARGSUSED */
1178 static void
clreclaim4(void * all)1179 clreclaim4(void *all)
1180 {
1181 	struct nfs4_clnt *nfscl;
1182 
1183 	/*
1184 	 * The system is low on memory; go through and try to reclaim some from
1185 	 * every zone on the system.
1186 	 */
1187 	mutex_enter(&nfs4_clnt_list_lock);
1188 	nfscl = list_head(&nfs4_clnt_list);
1189 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1190 		clreclaim4_zone(nfscl, CL_HOLDTIME);
1191 	mutex_exit(&nfs4_clnt_list_lock);
1192 }
1193 
/*
 * Minimum time-out values indexed by call type.
 * The units are eighths of a second, which avoids multiplies.
 */
static unsigned int minimum_timeo[] = { 6, 7, 10 };

#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Retransmission back-off.  Timeouts are expressed in units of hz and
 * MAXTIMO (20*hz) is the ceiling.
 *
 * dobackoff(tim) doubles tim, clamping the result to MAXTIMO.
 * backoff(tim) applies dobackoff() only while tim is still below
 * MAXTIMO; a value already at or above the ceiling is returned as is.
 */
#define	MAXTIMO	(20*hz)
#define	dobackoff(tim)	((((tim) << 1) <= MAXTIMO) ? ((tim) << 1) : MAXTIMO)
#define	backoff(tim)	(((tim) >= MAXTIMO) ? (tim) : dobackoff(tim))
1210 
1211 static int
nfs4_rfscall(mntinfo4_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * icr,int * doqueue,enum clnt_stat * rpc_statusp,int flags,struct nfs4_clnt * nfscl)1212 nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1213     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1214     enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1215 {
1216 	CLIENT *client;
1217 	struct chtab *ch;
1218 	cred_t *cr = icr;
1219 	struct rpc_err rpcerr, rpcerr_tmp;
1220 	enum clnt_stat status;
1221 	int error;
1222 	struct timeval wait;
1223 	int timeo;		/* in units of hz */
1224 	bool_t tryagain, is_recov;
1225 	bool_t cred_cloned = FALSE;
1226 	k_sigset_t smask;
1227 	servinfo4_t *svp;
1228 #ifdef DEBUG
1229 	char *bufp;
1230 #endif
1231 	int firstcall;
1232 
1233 	rpcerr.re_status = RPC_SUCCESS;
1234 
1235 	/*
1236 	 * If we know that we are rebooting then let's
1237 	 * not bother with doing any over the wireness.
1238 	 */
1239 	mutex_enter(&mi->mi_lock);
1240 	if (mi->mi_flags & MI4_SHUTDOWN) {
1241 		mutex_exit(&mi->mi_lock);
1242 		return (EIO);
1243 	}
1244 	mutex_exit(&mi->mi_lock);
1245 
1246 	/* For TSOL, use a new cred which has net_mac_aware flag */
1247 	if (!cred_cloned && is_system_labeled()) {
1248 		cred_cloned = TRUE;
1249 		cr = crdup(icr);
1250 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1251 	}
1252 
1253 	/*
1254 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1255 	 * are guaranteed to reprocess the retry as a new request.
1256 	 */
1257 	svp = mi->mi_curr_serv;
1258 	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1259 	if (rpcerr.re_errno != 0)
1260 		return (rpcerr.re_errno);
1261 
1262 	timeo = (mi->mi_timeo * hz) / 10;
1263 
1264 	/*
1265 	 * If hard mounted fs, retry call forever unless hard error
1266 	 * occurs.
1267 	 *
1268 	 * For forced unmount, let the recovery thread through but return
1269 	 * an error for all others.  This is so that user processes can
1270 	 * exit quickly.  The recovery thread bails out after one
1271 	 * transmission so that it can tell if it needs to continue.
1272 	 *
1273 	 * For zone shutdown, behave as above to encourage quick
1274 	 * process exit, but also fail quickly when servers have
1275 	 * timed out before and reduce the timeouts.
1276 	 */
1277 	is_recov = (curthread == mi->mi_recovthread);
1278 	firstcall = 1;
1279 	do {
1280 		tryagain = FALSE;
1281 
1282 		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1283 		    "nfs4_rfscall: vfs_flag=0x%x, %s",
1284 		    mi->mi_vfsp->vfs_flag,
1285 		    is_recov ? "recov thread" : "not recov thread"));
1286 
1287 		/*
1288 		 * It's possible while we're retrying the admin
1289 		 * decided to reboot.
1290 		 */
1291 		mutex_enter(&mi->mi_lock);
1292 		if (mi->mi_flags & MI4_SHUTDOWN) {
1293 			mutex_exit(&mi->mi_lock);
1294 			clfree4(client, ch, nfscl);
1295 			if (cred_cloned)
1296 				crfree(cr);
1297 			return (EIO);
1298 		}
1299 		mutex_exit(&mi->mi_lock);
1300 
1301 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1302 		    (!is_recov || !firstcall)) {
1303 			clfree4(client, ch, nfscl);
1304 			if (cred_cloned)
1305 				crfree(cr);
1306 			return (EIO);
1307 		}
1308 
1309 		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1310 			mutex_enter(&mi->mi_lock);
1311 			if ((mi->mi_flags & MI4_TIMEDOUT) ||
1312 			    !is_recov || !firstcall) {
1313 				mutex_exit(&mi->mi_lock);
1314 				clfree4(client, ch, nfscl);
1315 				if (cred_cloned)
1316 					crfree(cr);
1317 				return (EIO);
1318 			}
1319 			mutex_exit(&mi->mi_lock);
1320 			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1321 		}
1322 
1323 		firstcall = 0;
1324 		TICK_TO_TIMEVAL(timeo, &wait);
1325 
1326 		/*
1327 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1328 		 * and SIGTERM. (Preserving the existing masks).
1329 		 * Mask out SIGINT if mount option nointr is specified.
1330 		 */
1331 		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1332 		if (!(mi->mi_flags & MI4_INT))
1333 			client->cl_nosignal = TRUE;
1334 
1335 		/*
1336 		 * If there is a current signal, then don't bother
1337 		 * even trying to send out the request because we
1338 		 * won't be able to block waiting for the response.
1339 		 * Simply assume RPC_INTR and get on with it.
1340 		 */
1341 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1342 			status = RPC_INTR;
1343 		else {
1344 			status = CLNT_CALL(client, which, xdrargs, argsp,
1345 			    xdrres, resp, wait);
1346 		}
1347 
1348 		if (!(mi->mi_flags & MI4_INT))
1349 			client->cl_nosignal = FALSE;
1350 		/*
1351 		 * restore original signal mask
1352 		 */
1353 		sigunintr(&smask);
1354 
1355 		switch (status) {
1356 		case RPC_SUCCESS:
1357 			break;
1358 
1359 		case RPC_INTR:
1360 			/*
1361 			 * There is no way to recover from this error,
1362 			 * even if mount option nointr is specified.
1363 			 * SIGKILL, for example, cannot be blocked.
1364 			 */
1365 			rpcerr.re_status = RPC_INTR;
1366 			rpcerr.re_errno = EINTR;
1367 			break;
1368 
1369 		case RPC_UDERROR:
1370 			/*
1371 			 * If the NFS server is local (vold) and
1372 			 * it goes away then we get RPC_UDERROR.
1373 			 * This is a retryable error, so we would
1374 			 * loop, so check to see if the specific
1375 			 * error was ECONNRESET, indicating that
1376 			 * target did not exist at all.  If so,
1377 			 * return with RPC_PROGUNAVAIL and
1378 			 * ECONNRESET to indicate why.
1379 			 */
1380 			CLNT_GETERR(client, &rpcerr);
1381 			if (rpcerr.re_errno == ECONNRESET) {
1382 				rpcerr.re_status = RPC_PROGUNAVAIL;
1383 				rpcerr.re_errno = ECONNRESET;
1384 				break;
1385 			}
1386 			/*FALLTHROUGH*/
1387 
1388 		default:		/* probably RPC_TIMEDOUT */
1389 
1390 			if (IS_UNRECOVERABLE_RPC(status))
1391 				break;
1392 
1393 			/*
1394 			 * increment server not responding count
1395 			 */
1396 			mutex_enter(&mi->mi_lock);
1397 			mi->mi_noresponse++;
1398 			mutex_exit(&mi->mi_lock);
1399 #ifdef DEBUG
1400 			nfscl->nfscl_stat.noresponse.value.ui64++;
1401 #endif
1402 			/*
1403 			 * On zone shutdown, mark server dead and move on.
1404 			 */
1405 			if (zone_status_get(curproc->p_zone) >=
1406 			    ZONE_IS_SHUTTING_DOWN) {
1407 				mutex_enter(&mi->mi_lock);
1408 				mi->mi_flags |= MI4_TIMEDOUT;
1409 				mutex_exit(&mi->mi_lock);
1410 				clfree4(client, ch, nfscl);
1411 				if (cred_cloned)
1412 					crfree(cr);
1413 				return (EIO);
1414 			}
1415 
1416 			/*
1417 			 * NFS client failover support:
1418 			 * return and let the caller take care of
1419 			 * failover.  We only return for failover mounts
1420 			 * because otherwise we want the "not responding"
1421 			 * message, the timer updates, etc.
1422 			 */
1423 			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1424 			    (error = try_failover(status)) != 0) {
1425 				clfree4(client, ch, nfscl);
1426 				if (cred_cloned)
1427 					crfree(cr);
1428 				*rpc_statusp = status;
1429 				return (error);
1430 			}
1431 
1432 			if (flags & RFSCALL_SOFT)
1433 				break;
1434 
1435 			tryagain = TRUE;
1436 
1437 			/*
1438 			 * The call is in progress (over COTS).
1439 			 * Try the CLNT_CALL again, but don't
1440 			 * print a noisy error message.
1441 			 */
1442 			if (status == RPC_INPROGRESS)
1443 				break;
1444 
1445 			timeo = backoff(timeo);
1446 			CLNT_GETERR(client, &rpcerr_tmp);
1447 
1448 			mutex_enter(&mi->mi_lock);
1449 			if (!(mi->mi_flags & MI4_PRINTED)) {
1450 				mi->mi_flags |= MI4_PRINTED;
1451 				mutex_exit(&mi->mi_lock);
1452 				if ((status == RPC_CANTSEND) &&
1453 				    (rpcerr_tmp.re_errno == ENOBUFS))
1454 					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
1455 					    0, 0, FALSE, NULL, 0, NULL);
1456 				else
1457 					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1458 					    0, 0, 0, FALSE, NULL, 0, NULL);
1459 			} else
1460 				mutex_exit(&mi->mi_lock);
1461 
1462 			if (*doqueue && nfs_has_ctty()) {
1463 				*doqueue = 0;
1464 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1465 					if ((status == RPC_CANTSEND) &&
1466 					    (rpcerr_tmp.re_errno == ENOBUFS))
1467 						nfs4_queue_fact(RF_SENDQ_FULL,
1468 						    mi, 0, 0, 0, FALSE, NULL,
1469 						    0, NULL);
1470 					else
1471 						nfs4_queue_fact(
1472 						    RF_SRV_NOT_RESPOND, mi, 0,
1473 						    0, 0, FALSE, NULL, 0, NULL);
1474 				}
1475 			}
1476 		}
1477 	} while (tryagain);
1478 
1479 	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1480 	    int, rpcerr.re_errno);
1481 
1482 	if (status != RPC_SUCCESS) {
1483 		zoneid_t zoneid = mi->mi_zone->zone_id;
1484 
1485 		/*
1486 		 * Let soft mounts use the timed out message.
1487 		 */
1488 		if (status == RPC_INPROGRESS)
1489 			status = RPC_TIMEDOUT;
1490 		nfscl->nfscl_stat.badcalls.value.ui64++;
1491 		if (status != RPC_INTR) {
1492 			mutex_enter(&mi->mi_lock);
1493 			mi->mi_flags |= MI4_DOWN;
1494 			mutex_exit(&mi->mi_lock);
1495 			CLNT_GETERR(client, &rpcerr);
1496 #ifdef DEBUG
1497 			bufp = clnt_sperror(client, svp->sv_hostname);
1498 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1499 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1500 			if (nfs_has_ctty()) {
1501 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1502 					uprintf("NFS%d %s failed for %s\n",
1503 					    mi->mi_vers, mi->mi_rfsnames[which],
1504 					    bufp);
1505 				}
1506 			}
1507 			kmem_free(bufp, MAXPATHLEN);
1508 #else
1509 			zprintf(zoneid,
1510 			    "NFS %s failed for server %s: error %d (%s)\n",
1511 			    mi->mi_rfsnames[which], svp->sv_hostname,
1512 			    status, clnt_sperrno(status));
1513 			if (nfs_has_ctty()) {
1514 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1515 					uprintf(
1516 				"NFS %s failed for server %s: error %d (%s)\n",
1517 					    mi->mi_rfsnames[which],
1518 					    svp->sv_hostname, status,
1519 					    clnt_sperrno(status));
1520 				}
1521 			}
1522 #endif
1523 			/*
1524 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1525 			 * re_errno is set appropriately depending on
1526 			 * the authentication error
1527 			 */
1528 			if (status == RPC_VERSMISMATCH ||
1529 			    status == RPC_PROGVERSMISMATCH)
1530 				rpcerr.re_errno = EIO;
1531 		}
1532 	} else {
1533 		/*
1534 		 * Test the value of mi_down and mi_printed without
1535 		 * holding the mi_lock mutex.  If they are both zero,
1536 		 * then it is okay to skip the down and printed
1537 		 * processing.  This saves on a mutex_enter and
1538 		 * mutex_exit pair for a normal, successful RPC.
1539 		 * This was just complete overhead.
1540 		 */
1541 		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1542 			mutex_enter(&mi->mi_lock);
1543 			mi->mi_flags &= ~MI4_DOWN;
1544 			if (mi->mi_flags & MI4_PRINTED) {
1545 				mi->mi_flags &= ~MI4_PRINTED;
1546 				mutex_exit(&mi->mi_lock);
1547 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1548 					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1549 					    0, FALSE, NULL, 0, NULL);
1550 			} else
1551 				mutex_exit(&mi->mi_lock);
1552 		}
1553 
1554 		if (*doqueue == 0) {
1555 			if (!(mi->mi_flags & MI4_NOPRINT) &&
1556 			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1557 				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1558 				    FALSE, NULL, 0, NULL);
1559 
1560 			*doqueue = 1;
1561 		}
1562 	}
1563 
1564 	clfree4(client, ch, nfscl);
1565 	if (cred_cloned)
1566 		crfree(cr);
1567 
1568 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1569 
1570 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1571 	    rpcerr.re_errno);
1572 
1573 	*rpc_statusp = status;
1574 	return (rpcerr.re_errno);
1575 }
1576 
1577 /*
1578  * rfs4call - general wrapper for RPC calls initiated by the client
1579  */
1580 void
rfs4call(mntinfo4_t * mi,COMPOUND4args_clnt * argsp,COMPOUND4res_clnt * resp,cred_t * cr,int * doqueue,int flags,nfs4_error_t * ep)1581 rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
1582     cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
1583 {
1584 	int i, error;
1585 	enum clnt_stat rpc_status = RPC_SUCCESS;
1586 	int num_resops;
1587 	struct nfs4_clnt *nfscl;
1588 
1589 	ASSERT(nfs_zone() == mi->mi_zone);
1590 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1591 	ASSERT(nfscl != NULL);
1592 
1593 	nfscl->nfscl_stat.calls.value.ui64++;
1594 	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1595 
1596 	/* Set up the results struct for XDR usage */
1597 	resp->argsp = argsp;
1598 	resp->array = NULL;
1599 	resp->status = 0;
1600 	resp->decode_len = 0;
1601 
1602 	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
1603 	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1604 	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1605 	    doqueue, &rpc_status, flags, nfscl);
1606 
1607 	/* Return now if it was an RPC error */
1608 	if (error) {
1609 		ep->error = error;
1610 		ep->stat = resp->status;
1611 		ep->rpc_status = rpc_status;
1612 		return;
1613 	}
1614 
1615 	/* else we'll count the processed operations */
1616 	num_resops = resp->decode_len;
1617 	for (i = 0; i < num_resops; i++) {
1618 		/*
1619 		 * Count the individual operations
1620 		 * processed by the server.
1621 		 */
1622 		if (resp->array[i].resop >= NFSPROC4_NULL &&
1623 		    resp->array[i].resop <= OP_WRITE)
1624 			mi->mi_reqs[resp->array[i].resop].value.ui64++;
1625 	}
1626 
1627 	ep->error = 0;
1628 	ep->stat = resp->status;
1629 	ep->rpc_status = rpc_status;
1630 }
1631 
1632 /*
1633  * nfs4rename_update - updates stored state after a rename.  Currently this
1634  * is the path of the object and anything under it, and the filehandle of
1635  * the renamed object.
1636  */
1637 void
nfs4rename_update(vnode_t * renvp,vnode_t * ndvp,nfs_fh4 * nfh4p,char * nnm)1638 nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
1639 {
1640 	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
1641 	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
1642 }
1643 
1644 /*
1645  * Routine to look up the filehandle for the given path and rootvp.
1646  *
1647  * Return values:
1648  * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1649  *   updated.
1650  * - error: return value (errno value) and/or *statp is set appropriately.
1651  */
1652 #define	RML_ORDINARY	1
1653 #define	RML_NAMED_ATTR	2
1654 #define	RML_ATTRDIR	3
1655 
1656 static void
remap_lookup(nfs4_fname_t * fname,vnode_t * rootvp,int filetype,cred_t * cr,nfs_fh4 * fhp,nfs4_ga_res_t * garp,nfs_fh4 * pfhp,nfs4_ga_res_t * pgarp,nfs4_error_t * ep)1657 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1658     int filetype, cred_t *cr,
1659     nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
1660     nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
1661     nfs4_error_t *ep)
1662 {
1663 	COMPOUND4args_clnt args;
1664 	COMPOUND4res_clnt res;
1665 	nfs_argop4 *argop;
1666 	nfs_resop4 *resop;
1667 	int num_argops;
1668 	lookup4_param_t lookuparg;
1669 	nfs_fh4 *tmpfhp;
1670 	int doqueue = 1;
1671 	char *path;
1672 	mntinfo4_t *mi;
1673 
1674 	ASSERT(fname != NULL);
1675 	ASSERT(rootvp->v_type == VDIR);
1676 
1677 	mi = VTOMI4(rootvp);
1678 	path = fn_path(fname);
1679 	switch (filetype) {
1680 	case RML_NAMED_ATTR:
1681 		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1682 		args.ctag = TAG_REMAP_LOOKUP_NA;
1683 		break;
1684 	case RML_ATTRDIR:
1685 		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1686 		args.ctag = TAG_REMAP_LOOKUP_AD;
1687 		break;
1688 	case RML_ORDINARY:
1689 		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1690 		args.ctag = TAG_REMAP_LOOKUP;
1691 		break;
1692 	default:
1693 		ep->error = EINVAL;
1694 		return;
1695 	}
1696 	lookuparg.argsp = &args;
1697 	lookuparg.resp = &res;
1698 	lookuparg.header_len = 1;	/* Putfh */
1699 	lookuparg.trailer_len = 0;
1700 	lookuparg.ga_bits = NFS4_VATTR_MASK;
1701 	lookuparg.mi = VTOMI4(rootvp);
1702 
1703 	(void) nfs4lookup_setup(path, &lookuparg, 1);
1704 
1705 	/* 0: putfh directory */
1706 	argop = args.array;
1707 	argop[0].argop = OP_CPUTFH;
1708 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1709 
1710 	num_argops = args.array_len;
1711 
1712 	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1713 
1714 	if (ep->error || res.status != NFS4_OK)
1715 		goto exit;
1716 
1717 	/* get the object filehandle */
1718 	resop = &res.array[res.array_len - 2];
1719 	if (resop->resop != OP_GETFH) {
1720 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1721 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1722 		ep->stat = NFS4ERR_SERVERFAULT;
1723 		goto exit;
1724 	}
1725 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1726 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1727 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1728 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1729 		    TAG_NONE, 0, 0);
1730 		ep->stat = NFS4ERR_SERVERFAULT;
1731 		goto exit;
1732 	}
1733 	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1734 	nfs_fh4_copy(tmpfhp, fhp);
1735 
1736 	/* get the object attributes */
1737 	resop = &res.array[res.array_len - 1];
1738 	if (garp && resop->resop == OP_GETATTR)
1739 		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
1740 
1741 	/* See if there are enough fields in the response for parent info */
1742 	if ((int)res.array_len - 5 <= 0)
1743 		goto exit;
1744 
1745 	/* get the parent filehandle */
1746 	resop = &res.array[res.array_len - 5];
1747 	if (resop->resop != OP_GETFH) {
1748 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1749 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1750 		ep->stat = NFS4ERR_SERVERFAULT;
1751 		goto exit;
1752 	}
1753 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1754 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1755 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1756 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1757 		    TAG_NONE, 0, 0);
1758 		ep->stat = NFS4ERR_SERVERFAULT;
1759 		goto exit;
1760 	}
1761 	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1762 	nfs_fh4_copy(tmpfhp, pfhp);
1763 
1764 	/* get the parent attributes */
1765 	resop = &res.array[res.array_len - 4];
1766 	if (pgarp && resop->resop == OP_GETATTR)
1767 		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
1768 
1769 exit:
1770 	/*
1771 	 * It is too hard to remember where all the OP_LOOKUPs are
1772 	 */
1773 	nfs4args_lookup_free(argop, num_argops);
1774 	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1775 
1776 	if (!ep->error)
1777 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1778 	kmem_free(path, strlen(path)+1);
1779 }
1780 
1781 /*
1782  * NFS client failover / volatile filehandle support
1783  *
1784  * Recover the filehandle for the given rnode.
1785  *
1786  * Errors are returned via the nfs4_error_t parameter.
1787  */
1788 
1789 void
nfs4_remap_file(mntinfo4_t * mi,vnode_t * vp,int flags,nfs4_error_t * ep)1790 nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
1791 {
1792 	int is_stub;
1793 	rnode4_t *rp = VTOR4(vp);
1794 	vnode_t *rootvp = NULL;
1795 	vnode_t *dvp = NULL;
1796 	cred_t *cr, *cred_otw;
1797 	nfs4_ga_res_t gar, pgar;
1798 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
1799 	int filetype = RML_ORDINARY;
1800 	nfs4_recov_state_t recov = {NULL, 0, 0};
1801 	int badfhcount = 0;
1802 	nfs4_open_stream_t *osp = NULL;
1803 	bool_t first_time = TRUE;	/* first time getting OTW cred */
1804 	bool_t last_time = FALSE;	/* last time getting OTW cred */
1805 
1806 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1807 	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
1808 	ASSERT(nfs4_consistent_type(vp));
1809 
1810 	if (vp->v_flag & VROOT) {
1811 		nfs4_remap_root(mi, ep, flags);
1812 		return;
1813 	}
1814 
1815 	/*
1816 	 * Given the root fh, use the path stored in
1817 	 * the rnode to find the fh for the new server.
1818 	 */
1819 	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1820 	if (ep->error != 0)
1821 		return;
1822 
1823 	cr = curthread->t_cred;
1824 	ASSERT(cr != NULL);
1825 get_remap_cred:
1826 	/*
1827 	 * Releases the osp, if it is provided.
1828 	 * Puts a hold on the cred_otw and the new osp (if found).
1829 	 */
1830 	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
1831 	    &first_time, &last_time);
1832 	ASSERT(cred_otw != NULL);
1833 
1834 	if (rp->r_flags & R4ISXATTR) {
1835 		filetype = RML_NAMED_ATTR;
1836 		(void) vtodv(vp, &dvp, cred_otw, FALSE);
1837 	}
1838 
1839 	if (vp->v_flag & V_XATTRDIR) {
1840 		filetype = RML_ATTRDIR;
1841 	}
1842 
1843 	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
1844 		/* file mount, doesn't need a remap */
1845 		goto done;
1846 	}
1847 
1848 again:
1849 	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
1850 	    &newfh, &gar, &newpfh, &pgar, ep);
1851 
1852 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1853 	    "nfs4_remap_file: remap_lookup returned %d/%d",
1854 	    ep->error, ep->stat));
1855 
1856 	if (last_time == FALSE && ep->error == EACCES) {
1857 		crfree(cred_otw);
1858 		if (dvp != NULL)
1859 			VN_RELE(dvp);
1860 		goto get_remap_cred;
1861 	}
1862 	if (ep->error != 0)
1863 		goto done;
1864 
1865 	switch (ep->stat) {
1866 	case NFS4_OK:
1867 		badfhcount = 0;
1868 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1869 			mutex_enter(&rp->r_statelock);
1870 			rp->r_delay_interval = 0;
1871 			mutex_exit(&rp->r_statelock);
1872 			uprintf("NFS File Available..\n");
1873 		}
1874 		break;
1875 	case NFS4ERR_FHEXPIRED:
1876 	case NFS4ERR_BADHANDLE:
1877 	case NFS4ERR_STALE:
1878 		/*
1879 		 * If we ran into filehandle problems, we should try to
1880 		 * remap the root vnode first and hope life gets better.
1881 		 * But we need to avoid loops.
1882 		 */
1883 		if (badfhcount++ > 0)
1884 			goto done;
1885 		if (newfh.nfs_fh4_len != 0) {
1886 			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1887 			newfh.nfs_fh4_len = 0;
1888 		}
1889 		if (newpfh.nfs_fh4_len != 0) {
1890 			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1891 			newpfh.nfs_fh4_len = 0;
1892 		}
1893 		/* relative path - remap rootvp then retry */
1894 		VN_RELE(rootvp);
1895 		rootvp = NULL;
1896 		nfs4_remap_root(mi, ep, flags);
1897 		if (ep->error != 0 || ep->stat != NFS4_OK)
1898 			goto done;
1899 		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1900 		if (ep->error != 0)
1901 			goto done;
1902 		goto again;
1903 	case NFS4ERR_DELAY:
1904 		badfhcount = 0;
1905 		nfs4_set_delay_wait(vp);
1906 		ep->error = nfs4_wait_for_delay(vp, &recov);
1907 		if (ep->error != 0)
1908 			goto done;
1909 		goto again;
1910 	case NFS4ERR_ACCESS:
1911 		/* get new cred, try again */
1912 		if (last_time == TRUE)
1913 			goto done;
1914 		if (dvp != NULL)
1915 			VN_RELE(dvp);
1916 		crfree(cred_otw);
1917 		goto get_remap_cred;
1918 	default:
1919 		goto done;
1920 	}
1921 
1922 	/*
1923 	 * Check on the new and old rnodes before updating;
1924 	 * if the vnode type or size changes, issue a warning
1925 	 * and mark the file dead.
1926 	 */
1927 	mutex_enter(&rp->r_statelock);
1928 	if (flags & NFS4_REMAP_CKATTRS) {
1929 		if (vp->v_type != gar.n4g_va.va_type ||
1930 		    (vp->v_type != VDIR &&
1931 		    rp->r_size != gar.n4g_va.va_size)) {
1932 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1933 			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
1934 			    (int)rp->r_size, (int)gar.n4g_va.va_size,
1935 			    vp->v_type, gar.n4g_va.va_type));
1936 			mutex_exit(&rp->r_statelock);
1937 			nfs4_queue_event(RE_FILE_DIFF, mi,
1938 			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
1939 			    TAG_NONE, TAG_NONE, 0, 0);
1940 			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
1941 			goto done;
1942 		}
1943 	}
1944 	ASSERT(gar.n4g_va.va_type != VNON);
1945 	rp->r_server = mi->mi_curr_serv;
1946 
1947 	/*
1948 	 * Turn this object into a "stub" object if we
1949 	 * crossed an underlying server fs boundary.
1950 	 *
1951 	 * This stub will be for a mirror-mount.
1952 	 * A referral would look like a boundary crossing
1953 	 * as well, but would not be the same type of object,
1954 	 * so we would expect to mark the object dead.
1955 	 *
1956 	 * See comment in r4_do_attrcache() for more details.
1957 	 */
1958 	is_stub = 0;
1959 	if (gar.n4g_fsid_valid) {
1960 		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
1961 		rp->r_srv_fsid = gar.n4g_fsid;
1962 		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
1963 			is_stub = 1;
1964 		nfs_rw_exit(&rp->r_server->sv_lock);
1965 #ifdef DEBUG
1966 	} else {
1967 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1968 		    "remap_file: fsid attr not provided by server.  rp=%p",
1969 		    (void *)rp));
1970 #endif
1971 	}
1972 	if (is_stub)
1973 		r4_stub_mirrormount(rp);
1974 	else
1975 		r4_stub_none(rp);
1976 	mutex_exit(&rp->r_statelock);
1977 	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
1978 	sfh4_update(rp->r_fh, &newfh);
1979 	ASSERT(nfs4_consistent_type(vp));
1980 
1981 	/*
1982 	 * If we got parent info, use it to update the parent
1983 	 */
1984 	if (newpfh.nfs_fh4_len != 0) {
1985 		if (rp->r_svnode.sv_dfh != NULL)
1986 			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
1987 		if (dvp != NULL) {
1988 			/* force update of attrs */
1989 			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
1990 		}
1991 	}
1992 done:
1993 	if (newfh.nfs_fh4_len != 0)
1994 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1995 	if (newpfh.nfs_fh4_len != 0)
1996 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1997 	if (cred_otw != NULL)
1998 		crfree(cred_otw);
1999 	if (rootvp != NULL)
2000 		VN_RELE(rootvp);
2001 	if (dvp != NULL)
2002 		VN_RELE(dvp);
2003 	if (osp != NULL)
2004 		open_stream_rele(osp, rp);
2005 }
2006 
2007 /*
2008  * Client-side failover support: remap the filehandle for vp if it appears
2009  * necessary.  errors are returned via the nfs4_error_t parameter; though,
2010  * if there is a problem, we will just try again later.
2011  */
2012 
2013 void
nfs4_check_remap(mntinfo4_t * mi,vnode_t * vp,int flags,nfs4_error_t * ep)2014 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2015 {
2016 	if (vp == NULL)
2017 		return;
2018 
2019 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
2020 		return;
2021 
2022 	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
2023 		return;
2024 
2025 	nfs4_remap_file(mi, vp, flags, ep);
2026 }
2027 
2028 /*
2029  * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2030  *
2031  * Our caller has a filehandle for ".." relative to a particular
2032  * directory object.  We want to find or create a parent vnode
2033  * with that filehandle and return it.  We can of course create
2034  * a vnode from this filehandle, but we need to also make sure
2035  * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2036  * that we have a parent FH for future reopens as well.  If
2037  * we have a remap failure, we won't be able to reopen this
2038  * file, but we won't treat that as fatal because a reopen
2039  * is at least unlikely.  Someday nfs4_reopen() should look
2040  * for a missing parent FH and try a remap to recover from it.
2041  *
2042  * need_start_op argument indicates whether this function should
2043  * do a start_op before calling remap_lookup().  This should
2044  * be FALSE, if you are the recovery thread or in an op; otherwise,
2045  * set it to TRUE.
2046  */
2047 int
nfs4_make_dotdot(nfs4_sharedfh_t * fhp,hrtime_t t,vnode_t * dvp,cred_t * cr,vnode_t ** vpp,int need_start_op)2048 nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
2049     cred_t *cr, vnode_t **vpp, int need_start_op)
2050 {
2051 	mntinfo4_t *mi = VTOMI4(dvp);
2052 	nfs4_fname_t *np = NULL, *pnp = NULL;
2053 	vnode_t *vp = NULL, *rootvp = NULL;
2054 	rnode4_t *rp;
2055 	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2056 	nfs4_ga_res_t gar, pgar;
2057 	vattr_t va, pva;
2058 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2059 	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
2060 	nfs4_recov_state_t recov_state;
2061 
2062 #ifdef DEBUG
2063 	/*
2064 	 * ensure need_start_op is correct
2065 	 */
2066 	{
2067 		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
2068 		    (curthread == mi->mi_recovthread));
2069 		/* C needs a ^^ operator! */
2070 		ASSERT(((need_start_op) && (!no_need_start_op)) ||
2071 		    ((! need_start_op) && (no_need_start_op)));
2072 	}
2073 #endif
2074 	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
2075 
2076 	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
2077 	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
2078 	    rnode4info(VTOR4(dvp))));
2079 
2080 	/*
2081 	 * rootvp might be needed eventually. Holding it now will
2082 	 * ensure that r4find_unlocked() will find it, if ".." is the root.
2083 	 */
2084 	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2085 	if (e.error != 0)
2086 		goto out;
2087 	rp = r4find_unlocked(fhp, mi->mi_vfsp);
2088 	if (rp != NULL) {
2089 		*vpp = RTOV4(rp);
2090 		VN_RELE(rootvp);
2091 		return (0);
2092 	}
2093 
2094 	/*
2095 	 * Since we don't have the rnode, we have to go over the wire.
2096 	 * remap_lookup() can get all of the filehandles and attributes
2097 	 * we need in one operation.
2098 	 */
2099 	np = fn_parent(VTOSV(dvp)->sv_name);
2100 	/* if a parent was not found return an error */
2101 	if (np == NULL) {
2102 		e.error = ENOENT;
2103 		goto out;
2104 	}
2105 
2106 	recov_state.rs_flags = 0;
2107 	recov_state.rs_num_retry_despite_err = 0;
2108 recov_retry:
2109 	if (need_start_op) {
2110 		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
2111 		    &recov_state, NULL);
2112 		if (e.error != 0) {
2113 			goto out;
2114 		}
2115 	}
2116 
2117 	pgar.n4g_va.va_type = VNON;
2118 	gar.n4g_va.va_type = VNON;
2119 
2120 	remap_lookup(np, rootvp, RML_ORDINARY, cr,
2121 	    &newfh, &gar, &newpfh, &pgar, &e);
2122 	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2123 		if (need_start_op) {
2124 			bool_t abort;
2125 
2126 			abort = nfs4_start_recovery(&e, mi,
2127 			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
2128 			    NULL);
2129 			if (abort) {
2130 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2131 				    &recov_state, FALSE);
2132 				if (e.error == 0)
2133 					e.error = EIO;
2134 				goto out;
2135 			}
2136 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2137 			    &recov_state, TRUE);
2138 			goto recov_retry;
2139 		}
2140 		if (e.error == 0)
2141 			e.error = EIO;
2142 		goto out;
2143 	}
2144 
2145 	va = gar.n4g_va;
2146 	pva = pgar.n4g_va;
2147 
2148 	if ((e.error != 0) ||
2149 	    (va.va_type != VDIR)) {
2150 		if (need_start_op)
2151 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2152 			    &recov_state, FALSE);
2153 		if (e.error == 0)
2154 			e.error = EIO;
2155 		goto out;
2156 	}
2157 
2158 	if (e.stat != NFS4_OK) {
2159 		if (need_start_op)
2160 			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2161 			    &recov_state, FALSE);
2162 		e.error = EIO;
2163 		goto out;
2164 	}
2165 
2166 	/*
2167 	 * It is possible for remap_lookup() to return with no error,
2168 	 * but without providing the parent filehandle and attrs.
2169 	 */
2170 	if (pva.va_type != VDIR) {
2171 		/*
2172 		 * Call remap_lookup() again, this time with the
2173 		 * newpfh and pgar args in the first position.
2174 		 */
2175 		pnp = fn_parent(np);
2176 		if (pnp != NULL) {
2177 			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
2178 			    &newpfh, &pgar, NULL, NULL, &e);
2179 			/*
2180 			 * This remap_lookup call modifies pgar. The following
2181 			 * line prevents trouble when checking the va_type of
2182 			 * pva later in this code.
2183 			 */
2184 			pva = pgar.n4g_va;
2185 
2186 			if (nfs4_needs_recovery(&e, FALSE,
2187 			    mi->mi_vfsp)) {
2188 				if (need_start_op) {
2189 					bool_t abort;
2190 
2191 					abort = nfs4_start_recovery(&e, mi,
2192 					    rootvp, NULL, NULL, NULL,
2193 					    OP_LOOKUP, NULL, NULL, NULL);
2194 					if (abort) {
2195 						nfs4_end_fop(mi, rootvp, NULL,
2196 						    OH_LOOKUP, &recov_state,
2197 						    FALSE);
2198 						if (e.error == 0)
2199 							e.error = EIO;
2200 						goto out;
2201 					}
2202 					nfs4_end_fop(mi, rootvp, NULL,
2203 					    OH_LOOKUP, &recov_state, TRUE);
2204 					goto recov_retry;
2205 				}
2206 				if (e.error == 0)
2207 					e.error = EIO;
2208 				goto out;
2209 			}
2210 
2211 			if (e.stat != NFS4_OK) {
2212 				if (need_start_op)
2213 					nfs4_end_fop(mi, rootvp, NULL,
2214 					    OH_LOOKUP, &recov_state, FALSE);
2215 				e.error = EIO;
2216 				goto out;
2217 			}
2218 		}
2219 		if ((pnp == NULL) ||
2220 		    (e.error != 0) ||
2221 		    (pva.va_type == VNON)) {
2222 			if (need_start_op)
2223 				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2224 				    &recov_state, FALSE);
2225 			if (e.error == 0)
2226 				e.error = EIO;
2227 			goto out;
2228 		}
2229 	}
2230 	ASSERT(newpfh.nfs_fh4_len != 0);
2231 	if (need_start_op)
2232 		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
2233 	psfh = sfh4_get(&newpfh, mi);
2234 
2235 	sfh = sfh4_get(&newfh, mi);
2236 	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
2237 
2238 out:
2239 	if (np != NULL)
2240 		fn_rele(&np);
2241 	if (pnp != NULL)
2242 		fn_rele(&pnp);
2243 	if (newfh.nfs_fh4_len != 0)
2244 		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2245 	if (newpfh.nfs_fh4_len != 0)
2246 		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2247 	if (sfh != NULL)
2248 		sfh4_rele(&sfh);
2249 	if (psfh != NULL)
2250 		sfh4_rele(&psfh);
2251 	if (rootvp != NULL)
2252 		VN_RELE(rootvp);
2253 	*vpp = vp;
2254 	return (e.error);
2255 }
2256 
#ifdef DEBUG
/*
 * DEBUG-only global counter, initialised to zero.  It is not referenced
 * anywhere in this part of the file.
 * NOTE(review): the name suggests it tracks memory consumed by rnode path
 * strings -- confirm against the code that updates it.
 */
size_t r_path_memuse = 0;
#endif
2260 
2261 /*
2262  * NFS client failover support
2263  *
2264  * sv4_free() frees the malloc'd portion of a "servinfo_t".
2265  */
2266 void
sv4_free(servinfo4_t * svp)2267 sv4_free(servinfo4_t *svp)
2268 {
2269 	servinfo4_t *next;
2270 	struct knetconfig *knconf;
2271 
2272 	while (svp != NULL) {
2273 		next = svp->sv_next;
2274 		if (svp->sv_dhsec)
2275 			sec_clnt_freeinfo(svp->sv_dhsec);
2276 		if (svp->sv_secdata)
2277 			sec_clnt_freeinfo(svp->sv_secdata);
2278 		if (svp->sv_save_secinfo &&
2279 		    svp->sv_save_secinfo != svp->sv_secinfo)
2280 			secinfo_free(svp->sv_save_secinfo);
2281 		if (svp->sv_secinfo)
2282 			secinfo_free(svp->sv_secinfo);
2283 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2284 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2285 		knconf = svp->sv_knconf;
2286 		if (knconf != NULL) {
2287 			if (knconf->knc_protofmly != NULL)
2288 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2289 			if (knconf->knc_proto != NULL)
2290 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2291 			kmem_free(knconf, sizeof (*knconf));
2292 		}
2293 		knconf = svp->sv_origknconf;
2294 		if (knconf != NULL) {
2295 			if (knconf->knc_protofmly != NULL)
2296 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2297 			if (knconf->knc_proto != NULL)
2298 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2299 			kmem_free(knconf, sizeof (*knconf));
2300 		}
2301 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2302 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2303 		if (svp->sv_path != NULL) {
2304 			kmem_free(svp->sv_path, svp->sv_pathlen);
2305 		}
2306 		nfs_rw_destroy(&svp->sv_lock);
2307 		kmem_free(svp, sizeof (*svp));
2308 		svp = next;
2309 	}
2310 }
2311 
2312 void
nfs4_printfhandle(nfs4_fhandle_t * fhp)2313 nfs4_printfhandle(nfs4_fhandle_t *fhp)
2314 {
2315 	int *ip;
2316 	char *buf;
2317 	size_t bufsize;
2318 	char *cp;
2319 
2320 	/*
2321 	 * 13 == "(file handle:"
2322 	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2323 	 *	1 == ' '
2324 	 *	8 == maximum strlen of "%x"
2325 	 * 3 == ")\n\0"
2326 	 */
2327 	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2328 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
2329 	if (buf == NULL)
2330 		return;
2331 
2332 	cp = buf;
2333 	(void) strcpy(cp, "(file handle:");
2334 	while (*cp != '\0')
2335 		cp++;
2336 	for (ip = (int *)fhp->fh_buf;
2337 	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2338 	    ip++) {
2339 		(void) sprintf(cp, " %x", *ip);
2340 		while (*cp != '\0')
2341 			cp++;
2342 	}
2343 	(void) strcpy(cp, ")\n");
2344 
2345 	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2346 
2347 	kmem_free(buf, bufsize);
2348 }
2349 
2350 /*
2351  * The NFSv4 readdir cache subsystem.
2352  *
2353  * We provide a set of interfaces to allow the rest of the system to utilize
2354  * a caching mechanism while encapsulating the details of the actual
2355  * implementation.  This should allow for better maintainability and
2356  * extensibility by consolidating the implementation details in one location.
2357  */
2358 
2359 /*
2360  * Comparator used by AVL routines.
2361  */
2362 static int
rddir4_cache_compar(const void * x,const void * y)2363 rddir4_cache_compar(const void *x, const void *y)
2364 {
2365 	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2366 	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2367 	rddir4_cache *a = &ai->rc;
2368 	rddir4_cache *b = &bi->rc;
2369 
2370 	if (a->nfs4_cookie == b->nfs4_cookie) {
2371 		if (a->buflen == b->buflen)
2372 			return (0);
2373 		if (a->buflen < b->buflen)
2374 			return (-1);
2375 		return (1);
2376 	}
2377 
2378 	if (a->nfs4_cookie < b->nfs4_cookie)
2379 			return (-1);
2380 
2381 	return (1);
2382 }
2383 
2384 /*
2385  * Allocate an opaque handle for the readdir cache.
2386  */
2387 void
rddir4_cache_create(rnode4_t * rp)2388 rddir4_cache_create(rnode4_t *rp)
2389 {
2390 	ASSERT(rp->r_dir == NULL);
2391 
2392 	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2393 
2394 	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2395 	    offsetof(rddir4_cache_impl, tree));
2396 }
2397 
2398 /*
2399  *  Purge the cache of all cached readdir responses.
2400  */
2401 void
rddir4_cache_purge(rnode4_t * rp)2402 rddir4_cache_purge(rnode4_t *rp)
2403 {
2404 	rddir4_cache_impl	*rdip;
2405 	rddir4_cache_impl	*nrdip;
2406 
2407 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2408 
2409 	if (rp->r_dir == NULL)
2410 		return;
2411 
2412 	rdip = avl_first(rp->r_dir);
2413 
2414 	while (rdip != NULL) {
2415 		nrdip = AVL_NEXT(rp->r_dir, rdip);
2416 		avl_remove(rp->r_dir, rdip);
2417 		rdip->rc.flags &= ~RDDIRCACHED;
2418 		rddir4_cache_rele(rp, &rdip->rc);
2419 		rdip = nrdip;
2420 	}
2421 	ASSERT(avl_numnodes(rp->r_dir) == 0);
2422 }
2423 
2424 /*
2425  * Destroy the readdir cache.
2426  */
2427 void
rddir4_cache_destroy(rnode4_t * rp)2428 rddir4_cache_destroy(rnode4_t *rp)
2429 {
2430 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2431 	if (rp->r_dir == NULL)
2432 		return;
2433 
2434 	rddir4_cache_purge(rp);
2435 	avl_destroy(rp->r_dir);
2436 	kmem_free(rp->r_dir, sizeof (avl_tree_t));
2437 	rp->r_dir = NULL;
2438 }
2439 
2440 /*
2441  * Locate a readdir response from the readdir cache.
2442  *
2443  * Return values:
2444  *
2445  * NULL - If there is an unrecoverable situation like the operation may have
2446  *	  been interrupted.
2447  *
2448  * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2449  *		    The flags are set approprately, such that the caller knows
2450  *		    what state the entry is in.
2451  */
2452 rddir4_cache *
rddir4_cache_lookup(rnode4_t * rp,offset_t cookie,int count)2453 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2454 {
2455 	rddir4_cache_impl	*rdip = NULL;
2456 	rddir4_cache_impl	srdip;
2457 	rddir4_cache		*srdc;
2458 	rddir4_cache		*rdc = NULL;
2459 	rddir4_cache		*nrdc = NULL;
2460 	avl_index_t		where;
2461 
2462 top:
2463 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2464 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2465 	/*
2466 	 * Check to see if the readdir cache has been disabled.  If so, then
2467 	 * simply allocate an rddir4_cache entry and return it, since caching
2468 	 * operations do not apply.
2469 	 */
2470 	if (rp->r_dir == NULL) {
2471 		if (nrdc == NULL) {
2472 			/*
2473 			 * Drop the lock because we are doing a sleeping
2474 			 * allocation.
2475 			 */
2476 			mutex_exit(&rp->r_statelock);
2477 			rdc = rddir4_cache_alloc(KM_SLEEP);
2478 			rdc->nfs4_cookie = cookie;
2479 			rdc->buflen = count;
2480 			mutex_enter(&rp->r_statelock);
2481 			return (rdc);
2482 		}
2483 		return (nrdc);
2484 	}
2485 
2486 	srdc = &srdip.rc;
2487 	srdc->nfs4_cookie = cookie;
2488 	srdc->buflen = count;
2489 
2490 	rdip = avl_find(rp->r_dir, &srdip, &where);
2491 
2492 	/*
2493 	 * If we didn't find an entry then create one and insert it
2494 	 * into the cache.
2495 	 */
2496 	if (rdip == NULL) {
2497 		/*
2498 		 * Check for the case where we have made a second pass through
2499 		 * the cache due to a lockless allocation.  If we find that no
2500 		 * thread has already inserted this entry, do the insert now
2501 		 * and return.
2502 		 */
2503 		if (nrdc != NULL) {
2504 			avl_insert(rp->r_dir, nrdc->data, where);
2505 			nrdc->flags |= RDDIRCACHED;
2506 			rddir4_cache_hold(nrdc);
2507 			return (nrdc);
2508 		}
2509 
2510 #ifdef DEBUG
2511 		nfs4_readdir_cache_misses++;
2512 #endif
2513 		/*
2514 		 * First, try to allocate an entry without sleeping.  If that
2515 		 * fails then drop the lock and do a sleeping allocation.
2516 		 */
2517 		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2518 		if (nrdc != NULL) {
2519 			nrdc->nfs4_cookie = cookie;
2520 			nrdc->buflen = count;
2521 			avl_insert(rp->r_dir, nrdc->data, where);
2522 			nrdc->flags |= RDDIRCACHED;
2523 			rddir4_cache_hold(nrdc);
2524 			return (nrdc);
2525 		}
2526 
2527 		/*
2528 		 * Drop the lock and do a sleeping allocation.	We incur
2529 		 * additional overhead by having to search the cache again,
2530 		 * but this case should be rare.
2531 		 */
2532 		mutex_exit(&rp->r_statelock);
2533 		nrdc = rddir4_cache_alloc(KM_SLEEP);
2534 		nrdc->nfs4_cookie = cookie;
2535 		nrdc->buflen = count;
2536 		mutex_enter(&rp->r_statelock);
2537 		/*
2538 		 * We need to take another pass through the cache
2539 		 * since we dropped our lock to perform the alloc.
2540 		 * Another thread may have come by and inserted the
2541 		 * entry we are interested in.
2542 		 */
2543 		goto top;
2544 	}
2545 
2546 	/*
2547 	 * Check to see if we need to free our entry.  This can happen if
2548 	 * another thread came along beat us to the insert.  We can
2549 	 * safely call rddir4_cache_free directly because no other thread
2550 	 * would have a reference to this entry.
2551 	 */
2552 	if (nrdc != NULL)
2553 		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2554 
2555 #ifdef DEBUG
2556 	nfs4_readdir_cache_hits++;
2557 #endif
2558 	/*
2559 	 * Found something.  Make sure it's ready to return.
2560 	 */
2561 	rdc = &rdip->rc;
2562 	rddir4_cache_hold(rdc);
2563 	/*
2564 	 * If the cache entry is in the process of being filled in, wait
2565 	 * until this completes.  The RDDIRWAIT bit is set to indicate that
2566 	 * someone is waiting and when the thread currently filling the entry
2567 	 * is done, it should do a cv_broadcast to wakeup all of the threads
2568 	 * waiting for it to finish. If the thread wakes up to find that
2569 	 * someone new is now trying to complete the the entry, go back
2570 	 * to sleep.
2571 	 */
2572 	while (rdc->flags & RDDIR) {
2573 		/*
2574 		 * The entry is not complete.
2575 		 */
2576 		nfs_rw_exit(&rp->r_rwlock);
2577 		rdc->flags |= RDDIRWAIT;
2578 #ifdef DEBUG
2579 		nfs4_readdir_cache_waits++;
2580 #endif
2581 		while (rdc->flags & RDDIRWAIT) {
2582 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2583 				/*
2584 				 * We got interrupted, probably the user
2585 				 * typed ^C or an alarm fired.  We free the
2586 				 * new entry if we allocated one.
2587 				 */
2588 				rddir4_cache_rele(rp, rdc);
2589 				mutex_exit(&rp->r_statelock);
2590 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2591 				    RW_READER, FALSE);
2592 				mutex_enter(&rp->r_statelock);
2593 				return (NULL);
2594 			}
2595 		}
2596 		mutex_exit(&rp->r_statelock);
2597 		(void) nfs_rw_enter_sig(&rp->r_rwlock,
2598 		    RW_READER, FALSE);
2599 		mutex_enter(&rp->r_statelock);
2600 	}
2601 
2602 	/*
2603 	 * The entry we were waiting on may have been purged from
2604 	 * the cache and should no longer be used, release it and
2605 	 * start over.
2606 	 */
2607 	if (!(rdc->flags & RDDIRCACHED)) {
2608 		rddir4_cache_rele(rp, rdc);
2609 		goto top;
2610 	}
2611 
2612 	/*
2613 	 * The entry is completed.  Return it.
2614 	 */
2615 	return (rdc);
2616 }
2617 
2618 /*
2619  * Allocate a cache element and return it.  Can return NULL if memory is
2620  * low.
2621  */
2622 static rddir4_cache *
rddir4_cache_alloc(int flags)2623 rddir4_cache_alloc(int flags)
2624 {
2625 	rddir4_cache_impl	*rdip = NULL;
2626 	rddir4_cache		*rc = NULL;
2627 
2628 	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2629 
2630 	if (rdip != NULL) {
2631 		rc = &rdip->rc;
2632 		rc->data = (void *)rdip;
2633 		rc->nfs4_cookie = 0;
2634 		rc->nfs4_ncookie = 0;
2635 		rc->entries = NULL;
2636 		rc->eof = 0;
2637 		rc->entlen = 0;
2638 		rc->buflen = 0;
2639 		rc->actlen = 0;
2640 		/*
2641 		 * A readdir is required so set the flag.
2642 		 */
2643 		rc->flags = RDDIRREQ;
2644 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2645 		rc->error = 0;
2646 		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2647 		rdip->count = 1;
2648 #ifdef DEBUG
2649 		atomic_inc_64(&clstat4_debug.dirent.value.ui64);
2650 #endif
2651 	}
2652 	return (rc);
2653 }
2654 
2655 /*
2656  * Increment the reference count to this cache element.
2657  */
2658 static void
rddir4_cache_hold(rddir4_cache * rc)2659 rddir4_cache_hold(rddir4_cache *rc)
2660 {
2661 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2662 
2663 	mutex_enter(&rdip->lock);
2664 	rdip->count++;
2665 	mutex_exit(&rdip->lock);
2666 }
2667 
2668 /*
2669  * Release a reference to this cache element.  If the count is zero then
2670  * free the element.
2671  */
2672 void
rddir4_cache_rele(rnode4_t * rp,rddir4_cache * rdc)2673 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2674 {
2675 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2676 
2677 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2678 
2679 	/*
2680 	 * Check to see if we have any waiters.  If so, we can wake them
2681 	 * so that they can proceed.
2682 	 */
2683 	if (rdc->flags & RDDIRWAIT) {
2684 		rdc->flags &= ~RDDIRWAIT;
2685 		cv_broadcast(&rdc->cv);
2686 	}
2687 
2688 	mutex_enter(&rdip->lock);
2689 	ASSERT(rdip->count > 0);
2690 	if (--rdip->count == 0) {
2691 		mutex_exit(&rdip->lock);
2692 		rddir4_cache_free(rdip);
2693 	} else
2694 		mutex_exit(&rdip->lock);
2695 }
2696 
2697 /*
2698  * Free a cache element.
2699  */
2700 static void
rddir4_cache_free(rddir4_cache_impl * rdip)2701 rddir4_cache_free(rddir4_cache_impl *rdip)
2702 {
2703 	rddir4_cache *rc = &rdip->rc;
2704 
2705 #ifdef DEBUG
2706 	atomic_dec_64(&clstat4_debug.dirent.value.ui64);
2707 #endif
2708 	if (rc->entries != NULL)
2709 		kmem_free(rc->entries, rc->buflen);
2710 	cv_destroy(&rc->cv);
2711 	mutex_destroy(&rdip->lock);
2712 	kmem_free(rdip, sizeof (*rdip));
2713 }
2714 
2715 /*
2716  * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2717  * framework.
2718  */
2719 static int
cl4_snapshot(kstat_t * ksp,void * buf,int rw)2720 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
2721 {
2722 	ksp->ks_snaptime = gethrtime();
2723 	if (rw == KSTAT_WRITE) {
2724 		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
2725 #ifdef DEBUG
2726 		/*
2727 		 * Currently only the global zone can write to kstats, but we
2728 		 * add the check just for paranoia.
2729 		 */
2730 		if (INGLOBALZONE(curproc))
2731 			bcopy((char *)buf + sizeof (clstat4_tmpl),
2732 			    &clstat4_debug, sizeof (clstat4_debug));
2733 #endif
2734 	} else {
2735 		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
2736 #ifdef DEBUG
2737 		/*
2738 		 * If we're displaying the "global" debug kstat values, we
2739 		 * display them as-is to all zones since in fact they apply to
2740 		 * the system as a whole.
2741 		 */
2742 		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
2743 		    sizeof (clstat4_debug));
2744 #endif
2745 	}
2746 	return (0);
2747 }
2748 
2749 
2750 
2751 /*
2752  * Zone support
2753  */
2754 static void *
clinit4_zone(zoneid_t zoneid)2755 clinit4_zone(zoneid_t zoneid)
2756 {
2757 	kstat_t *nfs4_client_kstat;
2758 	struct nfs4_clnt *nfscl;
2759 	uint_t ndata;
2760 
2761 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
2762 	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
2763 	nfscl->nfscl_chtable4 = NULL;
2764 	nfscl->nfscl_zoneid = zoneid;
2765 
2766 	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
2767 	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
2768 #ifdef DEBUG
2769 	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
2770 #endif
2771 	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
2772 	    "misc", KSTAT_TYPE_NAMED, ndata,
2773 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
2774 		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
2775 		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
2776 		kstat_install(nfs4_client_kstat);
2777 	}
2778 	mutex_enter(&nfs4_clnt_list_lock);
2779 	list_insert_head(&nfs4_clnt_list, nfscl);
2780 	mutex_exit(&nfs4_clnt_list_lock);
2781 
2782 	return (nfscl);
2783 }
2784 
2785 /*ARGSUSED*/
2786 static void
clfini4_zone(zoneid_t zoneid,void * arg)2787 clfini4_zone(zoneid_t zoneid, void *arg)
2788 {
2789 	struct nfs4_clnt *nfscl = arg;
2790 	chhead_t *chp, *next;
2791 
2792 	if (nfscl == NULL)
2793 		return;
2794 	mutex_enter(&nfs4_clnt_list_lock);
2795 	list_remove(&nfs4_clnt_list, nfscl);
2796 	mutex_exit(&nfs4_clnt_list_lock);
2797 	clreclaim4_zone(nfscl, 0);
2798 	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
2799 		ASSERT(chp->ch_list == NULL);
2800 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
2801 		next = chp->ch_next;
2802 		kmem_free(chp, sizeof (*chp));
2803 	}
2804 	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
2805 	mutex_destroy(&nfscl->nfscl_chtable4_lock);
2806 	kmem_free(nfscl, sizeof (*nfscl));
2807 }
2808 
2809 /*
2810  * Called by endpnt_destructor to make sure the client handles are
2811  * cleaned up before the RPC endpoints.  This becomes a no-op if
2812  * clfini_zone (above) is called first.  This function is needed
2813  * (rather than relying on clfini_zone to clean up) because the ZSD
2814  * callbacks have no ordering mechanism, so we have no way to ensure
2815  * that clfini_zone is called before endpnt_destructor.
2816  */
2817 void
clcleanup4_zone(zoneid_t zoneid)2818 clcleanup4_zone(zoneid_t zoneid)
2819 {
2820 	struct nfs4_clnt *nfscl;
2821 
2822 	mutex_enter(&nfs4_clnt_list_lock);
2823 	nfscl = list_head(&nfs4_clnt_list);
2824 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
2825 		if (nfscl->nfscl_zoneid == zoneid) {
2826 			clreclaim4_zone(nfscl, 0);
2827 			break;
2828 		}
2829 	}
2830 	mutex_exit(&nfs4_clnt_list_lock);
2831 }
2832 
2833 int
nfs4_subr_init(void)2834 nfs4_subr_init(void)
2835 {
2836 	/*
2837 	 * Allocate and initialize the client handle cache
2838 	 */
2839 	chtab4_cache = kmem_cache_create("client_handle4_cache",
2840 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
2841 	    NULL, 0);
2842 
2843 	/*
2844 	 * Initialize the list of per-zone client handles (and associated data).
2845 	 * This needs to be done before we call zone_key_create().
2846 	 */
2847 	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
2848 	    offsetof(struct nfs4_clnt, nfscl_node));
2849 
2850 	/*
2851 	 * Initialize the zone_key for per-zone client handle lists.
2852 	 */
2853 	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
2854 
2855 	if (nfs4err_delay_time == 0)
2856 		nfs4err_delay_time = NFS4ERR_DELAY_TIME;
2857 
2858 	return (0);
2859 }
2860 
2861 int
nfs4_subr_fini(void)2862 nfs4_subr_fini(void)
2863 {
2864 	/*
2865 	 * Deallocate the client handle cache
2866 	 */
2867 	kmem_cache_destroy(chtab4_cache);
2868 
2869 	/*
2870 	 * Destroy the zone_key
2871 	 */
2872 	(void) zone_key_delete(nfs4clnt_zone_key);
2873 
2874 	return (0);
2875 }
2876 /*
2877  * Set or Clear direct I/O flag
2878  * VOP_RWLOCK() is held for write access to prevent a race condition
2879  * which would occur if a process is in the middle of a write when
2880  * directio flag gets set. It is possible that all pages may not get flushed.
2881  *
2882  * This is a copy of nfs_directio, changes here may need to be made
2883  * there and vice versa.
2884  */
2885 
2886 int
nfs4_directio(vnode_t * vp,int cmd,cred_t * cr)2887 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
2888 {
2889 	int	error = 0;
2890 	rnode4_t *rp;
2891 
2892 	rp = VTOR4(vp);
2893 
2894 	if (cmd == DIRECTIO_ON) {
2895 
2896 		if (rp->r_flags & R4DIRECTIO)
2897 			return (0);
2898 
2899 		/*
2900 		 * Flush the page cache.
2901 		 */
2902 
2903 		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
2904 
2905 		if (rp->r_flags & R4DIRECTIO) {
2906 			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2907 			return (0);
2908 		}
2909 
2910 		if (nfs4_has_pages(vp) &&
2911 		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
2912 			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
2913 			    B_INVAL, cr, NULL);
2914 			if (error) {
2915 				if (error == ENOSPC || error == EDQUOT) {
2916 					mutex_enter(&rp->r_statelock);
2917 					if (!rp->r_error)
2918 						rp->r_error = error;
2919 					mutex_exit(&rp->r_statelock);
2920 				}
2921 				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2922 				return (error);
2923 			}
2924 		}
2925 
2926 		mutex_enter(&rp->r_statelock);
2927 		rp->r_flags |= R4DIRECTIO;
2928 		mutex_exit(&rp->r_statelock);
2929 		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2930 		return (0);
2931 	}
2932 
2933 	if (cmd == DIRECTIO_OFF) {
2934 		mutex_enter(&rp->r_statelock);
2935 		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
2936 		mutex_exit(&rp->r_statelock);
2937 		return (0);
2938 	}
2939 
2940 	return (EINVAL);
2941 }
2942 
2943 /*
2944  * Return TRUE if the file has any pages.  Always go back to
2945  * the master vnode to check v_pages since none of the shadows
2946  * can have pages.
2947  */
2948 
2949 bool_t
nfs4_has_pages(vnode_t * vp)2950 nfs4_has_pages(vnode_t *vp)
2951 {
2952 	rnode4_t *rp;
2953 
2954 	rp = VTOR4(vp);
2955 	if (IS_SHADOW(vp, rp))
2956 		vp = RTOV4(rp);	/* RTOV4 always gives the master */
2957 
2958 	return (vn_has_cached_data(vp));
2959 }
2960 
/*
 * Failover decision table, indexed directly by the clnt_stat value that
 * CLNT_CALL returned.  A non-zero error in the selected row is both the
 * error to hand back AND the signal that failover should be attempted; a
 * zero error means failover is not attempted for that status.
 *
 * Special note: each row also records the RPC_ value it is meant for.  If
 * the RPC_ values ever change, direct indexing stops being valid, and the
 * stored value lets the lookup code notice the mismatch and issue a
 * warning.  In that case the code always attempts failover as a defensive
 * measure.
 */

static struct try_failover_tab {
	enum clnt_stat	cstat;
	int		error;
} try_failover_table[] = {
	{ RPC_SUCCESS,			0 },
	{ RPC_CANTENCODEARGS,		0 },
	{ RPC_CANTDECODERES,		0 },
	{ RPC_CANTSEND,			ECOMM },
	{ RPC_CANTRECV,			ECOMM },
	{ RPC_TIMEDOUT,			ETIMEDOUT },
	{ RPC_VERSMISMATCH,		0 },
	{ RPC_AUTHERROR,		0 },
	{ RPC_PROGUNAVAIL,		0 },
	{ RPC_PROGVERSMISMATCH,		0 },
	{ RPC_PROCUNAVAIL,		0 },
	{ RPC_CANTDECODEARGS,		0 },
	{ RPC_SYSTEMERROR,		ENOSR },
	{ RPC_UNKNOWNHOST,		EHOSTUNREACH },
	{ RPC_RPCBFAILURE,		ENETUNREACH },
	{ RPC_PROGNOTREGISTERED,	ECONNREFUSED },
	{ RPC_FAILED,			ETIMEDOUT },
	{ RPC_UNKNOWNPROTO,		EHOSTUNREACH },
	{ RPC_INTR,			0 },
	{ RPC_UNKNOWNADDR,		EHOSTUNREACH },
	{ RPC_TLIERROR,			0 },
	{ RPC_NOBROADCAST,		EHOSTUNREACH },
	{ RPC_N2AXLATEFAILURE,		ECONNREFUSED },
	{ RPC_UDERROR,			0 },
	{ RPC_INPROGRESS,		0 },
	{ RPC_STALERACHANDLE,		EINVAL },
	{ RPC_CANTCONNECT,		ECONNREFUSED },
	{ RPC_XPRTFAILED,		ECONNABORTED },
	{ RPC_CANTCREATESTREAM,		ECONNREFUSED },
	{ RPC_CANTSTORE,		ENOBUFS }
};
3012 
3013 /*
3014  * nfs4_try_failover - determine whether the client should
3015  * attempt failover based on the values stored in the nfs4_error_t.
3016  */
3017 int
nfs4_try_failover(nfs4_error_t * ep)3018 nfs4_try_failover(nfs4_error_t *ep)
3019 {
3020 	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
3021 		return (TRUE);
3022 
3023 	if (ep->error && ep->rpc_status != RPC_SUCCESS)
3024 		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
3025 
3026 	return (FALSE);
3027 }
3028 
3029 /*
3030  * try_failover - internal version of nfs4_try_failover, called
3031  * only by rfscall and aclcall.  Determine if failover is warranted
3032  * based on the clnt_stat and return the error number if it is.
3033  */
3034 static int
try_failover(enum clnt_stat rpc_status)3035 try_failover(enum clnt_stat rpc_status)
3036 {
3037 	int err = 0;
3038 
3039 	if (rpc_status == RPC_SUCCESS)
3040 		return (0);
3041 
3042 #ifdef	DEBUG
3043 	if (rpc_status != 0 && nfs4_try_failover_any) {
3044 		err = ETIMEDOUT;
3045 		goto done;
3046 	}
3047 #endif
3048 	/*
3049 	 * The rpc status is used as an index into the table.
3050 	 * If the rpc status is outside of the range of the
3051 	 * table or if the rpc error numbers have been changed
3052 	 * since the table was constructed, then print a warning
3053 	 * (DEBUG only) and try failover anyway.  Otherwise, just
3054 	 * grab the resulting error number out of the table.
3055 	 */
3056 	if (rpc_status < RPC_SUCCESS || rpc_status >=
3057 	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3058 	    try_failover_table[rpc_status].cstat != rpc_status) {
3059 
3060 		err = ETIMEDOUT;
3061 #ifdef	DEBUG
3062 		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3063 		    rpc_status);
3064 #endif
3065 	} else
3066 		err = try_failover_table[rpc_status].error;
3067 
3068 done:
3069 	if (rpc_status)
3070 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3071 		    "nfs4_try_failover: %strying failover on error %d",
3072 		    err ? "" : "NOT ", rpc_status));
3073 
3074 	return (err);
3075 }
3076 
3077 void
nfs4_error_zinit(nfs4_error_t * ep)3078 nfs4_error_zinit(nfs4_error_t *ep)
3079 {
3080 	ep->error = 0;
3081 	ep->stat = NFS4_OK;
3082 	ep->rpc_status = RPC_SUCCESS;
3083 }
3084 
3085 void
nfs4_error_init(nfs4_error_t * ep,int error)3086 nfs4_error_init(nfs4_error_t *ep, int error)
3087 {
3088 	ep->error = error;
3089 	ep->stat = NFS4_OK;
3090 	ep->rpc_status = RPC_SUCCESS;
3091 }
3092 
3093 
3094 #ifdef DEBUG
3095 
/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 *
 * p points at len bytes of data.  The whole 32-bit words are folded
 * together (each word XORed with its own upper half), then the 0-3 bytes
 * that follow the last whole word are XORed in, and the low 16 bits of
 * the result are returned.
 *
 * p is read through a word pointer, so it is expected to be suitably
 * aligned for that.
 */
int
hash16(void *p, int len)
{
	int i, rem;
	unsigned int *wp;
	unsigned char *tail;
	unsigned int key = 0;

	/* split len into whole words plus 0-3 trailing bytes */
	if ((rem = len & 3) != 0)
		len &= ~3;

	for (i = 0, wp = (unsigned int *)p; i < len; i += 4, wp++) {
		key ^= (*wp >> 16) ^ *wp;
	}

	/*
	 * Hash the left-over bytes.  They sit directly after the last whole
	 * word, which is exactly where wp points once the loop is done;
	 * reading from the start of p again would fold the first bytes in
	 * twice and ignore the tail altogether.
	 */
	tail = (unsigned char *)wp;
	for (i = 0; i < rem; i++)
		key ^= tail[i];

	return (key & 0xffff);
}
3122 
3123 /*
3124  * rnode4info - return filehandle and path information for an rnode.
3125  * XXX MT issues: uses a single static buffer, no locking of path.
3126  */
3127 char *
rnode4info(rnode4_t * rp)3128 rnode4info(rnode4_t *rp)
3129 {
3130 	static char buf[80];
3131 	nfs4_fhandle_t fhandle;
3132 	char *path;
3133 	char *type;
3134 
3135 	if (rp == NULL)
3136 		return ("null");
3137 	if (rp->r_flags & R4ISXATTR)
3138 		type = "attr";
3139 	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3140 		type = "attrdir";
3141 	else if (RTOV4(rp)->v_flag & VROOT)
3142 		type = "root";
3143 	else if (RTOV4(rp)->v_type == VDIR)
3144 		type = "dir";
3145 	else if (RTOV4(rp)->v_type == VREG)
3146 		type = "file";
3147 	else
3148 		type = "other";
3149 	sfh4_copyval(rp->r_fh, &fhandle);
3150 	path = fn_path(rp->r_svnode.sv_name);
3151 	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3152 	    (void *)rp, path, type, rp->r_flags,
3153 	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3154 	kmem_free(path, strlen(path)+1);
3155 	return (buf);
3156 }
3157 #endif
3158