1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <rpc/auth.h>
30 #include <rpc/clnt.h>
31 #include <nfs/nfs4_kprot.h>
32 #include <nfs/nfs4.h>
33 #include <nfs/lm.h>
34 #include <sys/cmn_err.h>
35 #include <sys/disp.h>
36 
37 #include <sys/pathname.h>
38 
39 #include <sys/strsubr.h>
40 #include <sys/ddi.h>
41 
42 #include <sys/vnode.h>
43 #include <sys/sdt.h>
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/ip6.h>
47 
/*
 * NOTE(review): MAX_READ_DELEGATIONS is not referenced in the part of the
 * file visible here; presumably an upper bound on read delegations per
 * file -- confirm against its users.
 */
#define	MAX_READ_DELEGATIONS 5

/*
 * rfs4_deleg_policy_lock protects rfs4_deleg_policy: it is taken as
 * writer by rfs4_set_deleg_policy() and as reader by
 * rfs4_hold_deleg_policy().  The policy starts out as "never delegate".
 */
krwlock_t rfs4_deleg_policy_lock;
srv_deleg_policy_t rfs4_deleg_policy = SRV_NEVER_DELEGATE;
/* NOTE(review): tunable; its use is outside this section -- confirm meaning */
static int rfs4_deleg_wlp = 5;
/* NOTE(review): presumably guards rfs4_deleg_disabled -- confirm */
kmutex_t rfs4_deleg_lock;
static int rfs4_deleg_disabled;

#ifdef DEBUG

/* Debug-only knobs and counters. */
static int rfs4_test_cbgetattr_fail = 0;
/* Incremented each time a CB_NULL call to a client succeeds. */
int rfs4_cb_null;
int rfs4_cb_debug;
int rfs4_deleg_debug;

#endif
64 
/*
 * Forward declarations for file-local (static) routines.
 */
static void rfs4_recall_file(rfs4_file_t *,
			    void (*recall)(rfs4_deleg_state_t *, bool_t),
			    bool_t, rfs4_client_t *);
static	void		rfs4_revoke_deleg(rfs4_deleg_state_t *);
static	void		rfs4_revoke_file(rfs4_file_t *);
static	void		rfs4_cb_chflush(rfs4_cbinfo_t *);
static	CLIENT		*rfs4_cb_getch(rfs4_cbinfo_t *);
static	void		rfs4_cb_freech(rfs4_cbinfo_t *, CLIENT *, bool_t);
static rfs4_deleg_state_t *rfs4_deleg_state(rfs4_state_t *,
				open_delegation_type4, int *);
75 
/*
 * Convert a universal address to a transport specific address using
 * inet_pton.
 *
 * The universal address is the textual host address followed by two
 * more dot-separated decimal fields, "<host>.<p1>.<p2>", where p1 and
 * p2 are the high and low bytes of the port.
 *
 *	af	address family passed to inet_pton; AF_INET selects the
 *		32-bit address fixup, anything else is treated as a
 *		struct in6_addr
 *	ua	NUL-terminated universal address; it is modified in
 *		place while the host part is parsed but is always
 *		restored before returning
 *	ap	receives the binary address
 *	pp	receives the port (network byte order)
 *
 * Returns 0 on success.  Returns EINVAL if there are fewer than two
 * dots, if inet_pton rejects the host part, if the port fields contain
 * a non-digit, or if a port field is larger than 255.  On an EINVAL
 * return after inet_pton succeeded, *ap may already have been written.
 */
static int
uaddr2sockaddr(int af, char *ua, void *ap, in_port_t *pp)
{
	int dots = 0, i, j, len, rc;
	unsigned int octet = 0;
	in_port_t port = 0;

	len = strlen(ua);

	/*
	 * Scan backwards for the second dot from the end; everything
	 * before it is the host address, everything after is the port.
	 */
	for (i = len - 1; i >= 0; i--) {

		if (ua[i] != '.' || ++dots != 2)
			continue;

		/*
		 * Temporarily terminate the host part so inet_pton sees
		 * only the address, then put the '.' straight back since
		 * the caller owns the string.
		 */
		ua[i] = '\0';
		rc = inet_pton(af, ua, ap);
		ua[i] = '.';

		if (rc != 1)
			return (EINVAL);

		for (j = i + 1; j < len; j++) {
			if (ua[j] == '.') {
				/* end of the high byte of the port */
				port = (in_port_t)(octet << 8);
				octet = 0;
			} else if (ua[j] >= '0' && ua[j] <= '9') {
				octet = octet * 10 + (ua[j] - '0');
				/*
				 * Each port field is a single byte; refuse
				 * larger values rather than letting them
				 * wrap into a different port.
				 */
				if (octet > 255)
					return (EINVAL);
			} else {
				return (EINVAL);
			}
		}
		port += (in_port_t)octet;

		/*
		 * Reset to network order.
		 * NOTE(review): this assumes the inet_pton in use hands
		 * back host-ordered data -- confirm for the kernel
		 * implementation.
		 */
		if (af == AF_INET) {
			*(uint32_t *)ap = htonl(*(uint32_t *)ap);
		} else {
			int ix;
			uint16_t *sap = ap;

			for (ix = 0; ix <
			    sizeof (struct in6_addr) / sizeof (uint16_t);
			    ix++)
				sap[ix] = htons(sap[ix]);
		}
		*pp = htons(port);

		return (0);
	}

	/* fewer than two dots: no port information present */
	return (EINVAL);
}
150 
/*
 * Update the delegation policy with the value of "new_policy".
 * The store is done while holding rfs4_deleg_policy_lock as writer,
 * so it waits for readers that entered via rfs4_hold_deleg_policy().
 */
void
rfs4_set_deleg_policy(srv_deleg_policy_t new_policy)
{
	rw_enter(&rfs4_deleg_policy_lock, RW_WRITER);
	rfs4_deleg_policy = new_policy;
	rw_exit(&rfs4_deleg_policy_lock);
}
162 
/*
 * Take rfs4_deleg_policy_lock as reader.  While it is held,
 * rfs4_set_deleg_policy() cannot change the policy.  Must be paired
 * with rfs4_rele_deleg_policy().
 */
void
rfs4_hold_deleg_policy(void)
{
	rw_enter(&rfs4_deleg_policy_lock, RW_READER);
}
168 
/*
 * Drop the hold on rfs4_deleg_policy_lock taken by
 * rfs4_hold_deleg_policy().
 */
void
rfs4_rele_deleg_policy(void)
{
	rw_exit(&rfs4_deleg_policy_lock);
}
174 
175 
176 /*
177  * This free function is to be used when the client struct is being
178  * released and nothing at all is needed of the callback info any
179  * longer.
180  */
181 void
182 rfs4_cbinfo_free(rfs4_cbinfo_t *cbp)
183 {
184 	char *addr = cbp->cb_callback.cb_location.r_addr;
185 	char *netid = cbp->cb_callback.cb_location.r_netid;
186 
187 	/* Free old address if any */
188 
189 	if (addr)
190 		kmem_free(addr, strlen(addr) + 1);
191 	if (netid)
192 		kmem_free(netid, strlen(netid) + 1);
193 
194 	addr = cbp->cb_newer.cb_callback.cb_location.r_addr;
195 	netid = cbp->cb_newer.cb_callback.cb_location.r_netid;
196 
197 	if (addr)
198 		kmem_free(addr, strlen(addr) + 1);
199 	if (netid)
200 		kmem_free(netid, strlen(netid) + 1);
201 
202 	if (cbp->cb_chc_free) {
203 		rfs4_cb_chflush(cbp);
204 	}
205 }
206 
/*
 * The server uses this to check the callback path supplied by the
 * client.  The callback connection is marked "in progress" while this
 * work is going on and then eventually marked either OK or FAILED.
 * This work can be done as part of a separate thread and at the end
 * of this the thread will exit or it may be done such that the caller
 * will continue with other work.
 *
 * The caller must have taken a hold on "cp" on behalf of this routine;
 * that hold is dropped with rfs4_client_rele() on every return path.
 *
 * Only one thread at a time performs the CB_NULL for a given client;
 * cb_nullcaller is the flag that enforces this.
 */
static void
rfs4_do_cb_null(rfs4_client_t *cp)
{
	struct timeval tv;
	CLIENT *ch;
	rfs4_cbstate_t newstate;
	rfs4_cbinfo_t *cbp = &cp->cbinfo;

	mutex_enter(cbp->cb_lock);
	/* If another thread is doing CB_NULL RPC then return */
	if (cbp->cb_nullcaller == TRUE) {
		mutex_exit(cbp->cb_lock);
		rfs4_client_rele(cp);
		return;
	}

	/* Mark the cbinfo as having a thread in the NULL callback */
	cbp->cb_nullcaller = TRUE;

	/*
	 * Are there other threads still using the cbinfo client
	 * handles?  If so, this thread must wait before going and
	 * mucking around with the callback information.
	 * rfs4_cbinfo_rele() broadcasts on cb_cv_nullcaller when the
	 * reference count drops to zero.
	 */
	while (cbp->cb_refcnt != 0)
		cv_wait(cbp->cb_cv_nullcaller, cbp->cb_lock);

	/*
	 * This thread itself may find that new callback info has
	 * arrived and is set up to handle this case and redrive the
	 * call to the client's callback server.
	 *
	 * cb_lock is held whenever control reaches "retry".
	 */
retry:
	if (cbp->cb_newer.cb_new == TRUE &&
	    cbp->cb_newer.cb_confirmed == TRUE) {
		char *addr = cbp->cb_callback.cb_location.r_addr;
		char *netid = cbp->cb_callback.cb_location.r_netid;

		/*
		 * Free the old stuff if it exists; may be the first
		 * time through this path
		 */
		if (addr)
			kmem_free(addr, strlen(addr) + 1);
		if (netid)
			kmem_free(netid, strlen(netid) + 1);

		/* Move over the addr/netid (ownership moves with them) */
		cbp->cb_callback.cb_location.r_addr =
		    cbp->cb_newer.cb_callback.cb_location.r_addr;
		cbp->cb_newer.cb_callback.cb_location.r_addr = NULL;
		cbp->cb_callback.cb_location.r_netid =
		    cbp->cb_newer.cb_callback.cb_location.r_netid;
		cbp->cb_newer.cb_callback.cb_location.r_netid = NULL;

		/* Get the program number */
		cbp->cb_callback.cb_program =
		    cbp->cb_newer.cb_callback.cb_program;
		cbp->cb_newer.cb_callback.cb_program = 0;

		/* Don't forget the protocol's "cb_ident" field */
		cbp->cb_ident = cbp->cb_newer.cb_ident;
		cbp->cb_newer.cb_ident = 0;

		/* no longer new */
		cbp->cb_newer.cb_new = FALSE;
		cbp->cb_newer.cb_confirmed = FALSE;

		/* get rid of the old client handles that may exist */
		rfs4_cb_chflush(cbp);

		/* CB_NONE makes the code below (re)test the new path */
		cbp->cb_state = CB_NONE;
		cbp->cb_timefailed = 0; /* reset the clock */
		cbp->cb_notified_of_cb_path_down = TRUE;
	}

	/*
	 * Nothing to test: the path already has a verdict (including
	 * the CB_BAD set below when no client handle could be had).
	 */
	if (cbp->cb_state != CB_NONE) {
		cv_broadcast(cbp->cb_cv);	/* let the others know */
		cbp->cb_nullcaller = FALSE;
		mutex_exit(cbp->cb_lock);
		rfs4_client_rele(cp);
		return;
	}

	/* mark rfs4_client_t as CALLBACK NULL in progress */
	cbp->cb_state = CB_INPROG;
	mutex_exit(cbp->cb_lock);

	/* get/generate a client handle */
	if ((ch = rfs4_cb_getch(cbp)) == NULL) {
		/* retake the lock; "retry" expects it to be held */
		mutex_enter(cbp->cb_lock);
		cbp->cb_state = CB_BAD;
		cbp->cb_timefailed = gethrestime_sec(); /* observability */
		goto retry;
	}


	/* 30 second timeout for the CB_NULL call itself */
	tv.tv_sec = 30;
	tv.tv_usec = 0;
	if (clnt_call(ch, CB_NULL, xdr_void, NULL, xdr_void, NULL, tv) != 0) {
		newstate = CB_BAD;
	} else {
		newstate = CB_OK;
#ifdef	DEBUG
		rfs4_cb_null++;
#endif
	}

	/* Check to see if the client has specified new callback info */
	mutex_enter(cbp->cb_lock);
	/* cb_lock is held here, hence lockheld == TRUE */
	rfs4_cb_freech(cbp, ch, TRUE);
	if (cbp->cb_newer.cb_new == TRUE &&
	    cbp->cb_newer.cb_confirmed == TRUE) {
		goto retry;	/* give the CB_NULL another chance */
	}

	cbp->cb_state = newstate;
	if (cbp->cb_state == CB_BAD)
		cbp->cb_timefailed = gethrestime_sec(); /* observability */

	cv_broadcast(cbp->cb_cv);	/* start up the other threads */
	cbp->cb_nullcaller = FALSE;
	mutex_exit(cbp->cb_lock);

	rfs4_client_rele(cp);
}
341 
/*
 * Given a client struct, inspect the callback info to see if the
 * callback path is up and available.  If it is being initialized,
 * then wait for the CB_NULL RPC call to occur.
 *
 * Returns the client's cbinfo with cb_refcnt incremented when the
 * callback state is CB_OK; the caller must then pair this with
 * rfs4_cbinfo_rele().  Returns NULL (no reference taken) for any
 * other state.
 */
static rfs4_cbinfo_t *
rfs4_cbinfo_hold(rfs4_client_t *cp)
{
	rfs4_cbinfo_t *cbp = &cp->cbinfo;

retry:
	mutex_enter(cbp->cb_lock);

	if (cbp->cb_newer.cb_new == TRUE && cbp->cb_nullcaller == FALSE) {
		/*
		 * Looks like a new callback path may be available and
		 * no one has set it up.
		 */
		mutex_exit(cbp->cb_lock);
		/*
		 * Take a hold for rfs4_do_cb_null(), which drops it
		 * via rfs4_client_rele() on all of its return paths.
		 */
		rfs4_dbe_hold(cp->dbe);
		rfs4_do_cb_null(cp);
		goto retry;
	}

	/*
	 * Is there a thread working on doing the CB_NULL RPC?
	 * NOTE(review): this is a single cv_wait, not a loop; after the
	 * wakeup only cb_state is examined.
	 */
	if (cbp->cb_nullcaller == TRUE)
		cv_wait(cbp->cb_cv, cbp->cb_lock);  /* if so, wait on it */

	/* If the callback path is not okay (up and running), just quit */
	if (cbp->cb_state != CB_OK) {
		mutex_exit(cbp->cb_lock);
		return (NULL);
	}

	/* Let someone know we are using the current callback info */
	cbp->cb_refcnt++;
	mutex_exit(cbp->cb_lock);
	return (cbp);
}
381 
382 /*
383  * The caller is done with the callback info.  It may be that the
384  * caller's RPC failed and the NFSv4 client has actually provided new
385  * callback information.  If so, let the caller know so they can
386  * advantage of this and maybe retry the RPC that originally failed.
387  */
388 static int
389 rfs4_cbinfo_rele(rfs4_cbinfo_t *cbp, rfs4_cbstate_t newstate)
390 {
391 	int cb_new = FALSE;
392 
393 	mutex_enter(cbp->cb_lock);
394 
395 	/* The caller gets a chance to mark the callback info as bad */
396 	if (newstate != CB_NOCHANGE)
397 		cbp->cb_state = newstate;
398 	if (newstate == CB_FAILED) {
399 		cbp->cb_timefailed = gethrestime_sec(); /* observability */
400 		cbp->cb_notified_of_cb_path_down = FALSE;
401 	}
402 
403 	cbp->cb_refcnt--;	/* no longer using the information */
404 
405 	/*
406 	 * A thread may be waiting on this one to finish and if so,
407 	 * let it know that it is okay to do the CB_NULL to the
408 	 * client's callback server.
409 	 */
410 	if (cbp->cb_refcnt == 0 && cbp->cb_nullcaller)
411 		cv_broadcast(cbp->cb_cv_nullcaller);
412 
413 	/*
414 	 * If this is the last thread to use the callback info and
415 	 * there is new callback information to try and no thread is
416 	 * there ready to do the CB_NULL, then return true to teh
417 	 * caller so they can do the CB_NULL
418 	 */
419 	if (cbp->cb_refcnt == 0 &&
420 	    cbp->cb_nullcaller == FALSE &&
421 	    cbp->cb_newer.cb_new == TRUE &&
422 	    cbp->cb_newer.cb_confirmed == TRUE)
423 		cb_new = TRUE;
424 
425 	mutex_exit(cbp->cb_lock);
426 
427 	return (cb_new);
428 }
429 
430 /*
431  * Given the information in the callback info struct, create a client
432  * handle that can be used by the server for its callback path.
433  */
434 static CLIENT *
435 rfs4_cbch_init(rfs4_cbinfo_t *cbp)
436 {
437 	struct knetconfig knc;
438 	vnode_t *vp;
439 	struct sockaddr_in addr4;
440 	struct sockaddr_in6 addr6;
441 	void *addr, *taddr;
442 	in_port_t *pp;
443 	int af;
444 	char *devnam;
445 	int err;
446 	struct netbuf nb;
447 	int size;
448 	CLIENT *ch = NULL;
449 	int useresvport = 0;
450 
451 	mutex_enter(cbp->cb_lock);
452 
453 	if (cbp->cb_callback.cb_location.r_netid == NULL ||
454 	    cbp->cb_callback.cb_location.r_addr == NULL) {
455 		goto cb_init_out;
456 	}
457 
458 	if (strcmp(cbp->cb_callback.cb_location.r_netid, "tcp") == 0) {
459 		knc.knc_semantics = NC_TPI_COTS;
460 		knc.knc_protofmly = "inet";
461 		knc.knc_proto = "tcp";
462 		devnam = "/dev/tcp";
463 		af = AF_INET;
464 	} else if (strcmp(cbp->cb_callback.cb_location.r_netid, "udp")
465 	    == 0) {
466 		knc.knc_semantics = NC_TPI_CLTS;
467 		knc.knc_protofmly = "inet";
468 		knc.knc_proto = "udp";
469 		devnam = "/dev/udp";
470 		af = AF_INET;
471 	} else if (strcmp(cbp->cb_callback.cb_location.r_netid, "tcp6")
472 	    == 0) {
473 		knc.knc_semantics = NC_TPI_COTS;
474 		knc.knc_protofmly = "inet6";
475 		knc.knc_proto = "tcp";
476 		devnam = "/dev/tcp6";
477 		af = AF_INET6;
478 	} else if (strcmp(cbp->cb_callback.cb_location.r_netid, "udp6")
479 	    == 0) {
480 		knc.knc_semantics = NC_TPI_CLTS;
481 		knc.knc_protofmly = "inet6";
482 		knc.knc_proto = "udp";
483 		devnam = "/dev/udp6";
484 		af = AF_INET6;
485 	} else {
486 		goto cb_init_out;
487 	}
488 
489 	if ((err = lookupname(devnam, UIO_SYSSPACE, FOLLOW,
490 	    NULLVPP, &vp)) != 0) {
491 
492 		goto cb_init_out;
493 	}
494 
495 	if (vp->v_type != VCHR) {
496 		VN_RELE(vp);
497 		goto cb_init_out;
498 	}
499 
500 	knc.knc_rdev = vp->v_rdev;
501 
502 	VN_RELE(vp);
503 
504 	if (af == AF_INET) {
505 		size = sizeof (addr4);
506 		bzero(&addr4, size);
507 		addr4.sin_family = (sa_family_t)af;
508 		addr = &addr4.sin_addr;
509 		pp = &addr4.sin_port;
510 		taddr = &addr4;
511 	} else /* AF_INET6 */ {
512 		size = sizeof (addr6);
513 		bzero(&addr6, size);
514 		addr6.sin6_family = (sa_family_t)af;
515 		addr = &addr6.sin6_addr;
516 		pp = &addr6.sin6_port;
517 		taddr = &addr6;
518 	}
519 
520 	if (uaddr2sockaddr(af,
521 	    cbp->cb_callback.cb_location.r_addr, addr, pp)) {
522 
523 		goto cb_init_out;
524 	}
525 
526 
527 	nb.maxlen = nb.len = size;
528 	nb.buf = (char *)taddr;
529 
530 	if (err = clnt_tli_kcreate(&knc, &nb, cbp->cb_callback.cb_program,
531 	    NFS_CB, 0, 0, curthread->t_cred, &ch)) {
532 
533 		ch = NULL;
534 	}
535 
536 	/* turn off reserved port usage */
537 	(void) CLNT_CONTROL(ch, CLSET_BINDRESVPORT, (char *)&useresvport);
538 
539 cb_init_out:
540 	mutex_exit(cbp->cb_lock);
541 	return (ch);
542 }
543 
544 /*
545  * Iterate over the client handle cache and
546  * destroy it.
547  */
548 static void
549 rfs4_cb_chflush(rfs4_cbinfo_t *cbp)
550 {
551 	CLIENT *ch;
552 
553 	while (cbp->cb_chc_free) {
554 		cbp->cb_chc_free--;
555 		ch = cbp->cb_chc[cbp->cb_chc_free];
556 		cbp->cb_chc[cbp->cb_chc_free] = NULL;
557 		if (ch) {
558 			if (ch->cl_auth)
559 				auth_destroy(ch->cl_auth);
560 			clnt_destroy(ch);
561 		}
562 	}
563 }
564 
565 /*
566  * Return a client handle, either from a the small
567  * rfs4_client_t cache or one that we just created.
568  */
569 static CLIENT *
570 rfs4_cb_getch(rfs4_cbinfo_t *cbp)
571 {
572 	CLIENT *cbch = NULL;
573 	uint32_t zilch = 0;
574 
575 	mutex_enter(cbp->cb_lock);
576 
577 	if (cbp->cb_chc_free) {
578 		cbp->cb_chc_free--;
579 		cbch = cbp->cb_chc[ cbp->cb_chc_free ];
580 		mutex_exit(cbp->cb_lock);
581 		(void) CLNT_CONTROL(cbch, CLSET_XID, (char *)&zilch);
582 		return (cbch);
583 	}
584 
585 	mutex_exit(cbp->cb_lock);
586 
587 	/* none free so make it now */
588 	cbch = rfs4_cbch_init(cbp);
589 
590 	return (cbch);
591 }
592 
593 /*
594  * Return the client handle to the small cache or
595  * destroy it.
596  */
597 static void
598 rfs4_cb_freech(rfs4_cbinfo_t *cbp, CLIENT *ch, bool_t lockheld)
599 {
600 	if (lockheld == FALSE)
601 		mutex_enter(cbp->cb_lock);
602 
603 	if (cbp->cb_chc_free < RFS4_CBCH_MAX) {
604 		cbp->cb_chc[ cbp->cb_chc_free++ ] = ch;
605 		if (lockheld == FALSE)
606 			mutex_exit(cbp->cb_lock);
607 		return;
608 	}
609 	if (lockheld == FALSE)
610 		mutex_exit(cbp->cb_lock);
611 
612 	/*
613 	 * cache maxed out of free entries, obliterate
614 	 * this client handle, destroy it, throw it away.
615 	 */
616 	if (ch->cl_auth)
617 		auth_destroy(ch->cl_auth);
618 	clnt_destroy(ch);
619 }
620 
621 /*
622  * With the supplied callback information - initialize the client
623  * callback data.  If there is a callback in progress, save the
624  * callback info so that a thread can pick it up in the future.
625  */
626 void
627 rfs4_client_setcb(rfs4_client_t *cp, cb_client4 *cb, uint32_t cb_ident)
628 {
629 	char *addr = NULL;
630 	char *netid = NULL;
631 	rfs4_cbinfo_t *cbp = &cp->cbinfo;
632 	size_t len;
633 
634 	/* Set the call back for the client */
635 	if (cb->cb_location.r_addr && cb->cb_location.r_addr[0] != '\0' &&
636 	    cb->cb_location.r_netid && cb->cb_location.r_netid[0] != '\0') {
637 		len = strlen(cb->cb_location.r_addr) + 1;
638 		addr = kmem_alloc(len, KM_SLEEP);
639 		bcopy(cb->cb_location.r_addr, addr, len);
640 		len = strlen(cb->cb_location.r_netid) + 1;
641 		netid = kmem_alloc(len, KM_SLEEP);
642 		bcopy(cb->cb_location.r_netid, netid, len);
643 	}
644 	/* ready to save the new information but first free old, if exists */
645 	mutex_enter(cbp->cb_lock);
646 
647 	cbp->cb_newer.cb_callback.cb_program = cb->cb_program;
648 
649 	if (cbp->cb_newer.cb_callback.cb_location.r_addr != NULL)
650 		kmem_free(cbp->cb_newer.cb_callback.cb_location.r_addr,
651 		    strlen(cbp->cb_newer.cb_callback.cb_location.r_addr) + 1);
652 	cbp->cb_newer.cb_callback.cb_location.r_addr = addr;
653 
654 	if (cbp->cb_newer.cb_callback.cb_location.r_netid != NULL)
655 		kmem_free(cbp->cb_newer.cb_callback.cb_location.r_netid,
656 		    strlen(cbp->cb_newer.cb_callback.cb_location.r_netid) + 1);
657 	cbp->cb_newer.cb_callback.cb_location.r_netid = netid;
658 
659 	cbp->cb_newer.cb_ident = cb_ident;
660 
661 	if (addr && *addr && netid && *netid) {
662 		cbp->cb_newer.cb_new = TRUE;
663 		cbp->cb_newer.cb_confirmed = FALSE;
664 	} else {
665 		cbp->cb_newer.cb_new = FALSE;
666 		cbp->cb_newer.cb_confirmed = FALSE;
667 	}
668 
669 	mutex_exit(cbp->cb_lock);
670 }
671 
672 /*
673  * The server uses this when processing SETCLIENTID_CONFIRM.  Callback
674  * information may have been provided on SETCLIENTID and this call
675  * marks that information as confirmed and then starts a thread to
676  * test the callback path.
677  */
678 void
679 rfs4_deleg_cb_check(rfs4_client_t *cp)
680 {
681 	if (cp->cbinfo.cb_newer.cb_new == FALSE)
682 		return;
683 
684 	cp->cbinfo.cb_newer.cb_confirmed = TRUE;
685 
686 	rfs4_dbe_hold(cp->dbe); /* hold the client struct for thread */
687 
688 	(void) thread_create(NULL, 0, rfs4_do_cb_null, cp, 0, &p0, TS_RUN,
689 	    minclsyspri);
690 }
691 
692 static void
693 rfs4args_cb_recall_free(nfs_cb_argop4 *argop)
694 {
695 	CB_RECALL4args	*rec_argp;
696 
697 	rec_argp = &argop->nfs_cb_argop4_u.opcbrecall;
698 	if (rec_argp->fh.nfs_fh4_val)
699 		kmem_free(rec_argp->fh.nfs_fh4_val, rec_argp->fh.nfs_fh4_len);
700 }
701 
702 /* ARGSUSED */
703 static void
704 rfs4args_cb_getattr_free(nfs_cb_argop4 *argop)
705 {
706 	CB_GETATTR4args *argp;
707 
708 	argp = &argop->nfs_cb_argop4_u.opcbgetattr;
709 	if (argp->fh.nfs_fh4_val)
710 		kmem_free(argp->fh.nfs_fh4_val, argp->fh.nfs_fh4_len);
711 }
712 
713 static void
714 rfs4freeargres(CB_COMPOUND4args *args, CB_COMPOUND4res *resp)
715 {
716 	int i, arglen;
717 	nfs_cb_argop4 *argop;
718 
719 	/*
720 	 * First free any special args alloc'd for specific ops.
721 	 */
722 	arglen = args->array_len;
723 	argop = args->array;
724 	for (i = 0; i < arglen; i++, argop++) {
725 
726 		switch (argop->argop) {
727 		case OP_CB_RECALL:
728 			rfs4args_cb_recall_free(argop);
729 			break;
730 
731 		case OP_CB_GETATTR:
732 			rfs4args_cb_getattr_free(argop);
733 			break;
734 
735 		default:
736 			return;
737 		}
738 	}
739 
740 	if (args->tag.utf8string_len > 0)
741 		UTF8STRING_FREE(args->tag)
742 
743 	kmem_free(args->array, arglen * sizeof (nfs_cb_argop4));
744 	if (resp)
745 		(void) xdr_free(xdr_CB_COMPOUND4res, (caddr_t)resp);
746 }
747 
748 /*
749  * General callback routine for the server to the client.
750  */
751 static enum clnt_stat
752 rfs4_do_callback(rfs4_client_t	*cp, CB_COMPOUND4args *args,
753 		CB_COMPOUND4res *res, struct timeval timeout)
754 {
755 	rfs4_cbinfo_t *cbp;
756 	CLIENT *ch;
757 	/* start with this in case cb_getch() fails */
758 	enum clnt_stat	stat = RPC_FAILED;
759 
760 	res->tag.utf8string_val = NULL;
761 	res->array = NULL;
762 
763 retry:
764 	cbp = rfs4_cbinfo_hold(cp);
765 	if (cbp == NULL)
766 		return (stat);
767 
768 	/* get a client handle */
769 	if ((ch = rfs4_cb_getch(cbp)) != NULL) {
770 		/*
771 		 * reset the cb_ident since it may have changed in
772 		 * rfs4_cbinfo_hold()
773 		 */
774 		args->callback_ident = cbp->cb_ident;
775 
776 		stat = clnt_call(ch, CB_COMPOUND, xdr_CB_COMPOUND4args_srv,
777 		    (caddr_t)args, xdr_CB_COMPOUND4res,
778 		    (caddr_t)res, timeout);
779 
780 		/* free client handle */
781 		rfs4_cb_freech(cbp, ch, FALSE);
782 	}
783 
784 	/*
785 	 * If the rele says that there may be new callback info then
786 	 * retry this sequence and it may succeed as a result of the
787 	 * new callback path
788 	 */
789 	if (rfs4_cbinfo_rele(cbp,
790 	    (stat == RPC_SUCCESS ? CB_NOCHANGE : CB_FAILED)) == TRUE)
791 		goto retry;
792 
793 	return (stat);
794 }
795 
796 /*
797  * Used by the NFSv4 server to get attributes for a file while
798  * handling the case where a file has been write delegated.  For the
799  * time being, VOP_GETATTR() is called and CB_GETATTR processing is
800  * not undertaken.  This call site is maintained in case the server is
801  * updated in the future to handle write delegation space guarantees.
802  */
803 nfsstat4
804 rfs4_vop_getattr(vnode_t *vp, vattr_t *vap, int flag, cred_t *cr)
805 {
806 	uint_t mask;
807 	int error;
808 
809 	mask = vap->va_mask;
810 	error = VOP_GETATTR(vp, vap, flag, cr);
811 	/*
812 	 * Some file systems clobber va_mask. it is probably wrong of
813 	 * them to do so, nonethless we practice defensive coding.
814 	 * See bug id 4276830.
815 	 */
816 	vap->va_mask = mask;
817 	return (puterrno4(error));
818 }
819 
820 /*
821  * This is used everywhere in the v2/v3 server to allow the
822  * integration of all NFS versions and the support of delegation.  For
823  * now, just call the VOP_GETATTR().  If the NFSv4 server is enhanced
824  * in the future to provide space guarantees for write delegations
825  * then this call site should be expanded to interact with the client.
826  */
827 int
828 rfs4_delegated_getattr(vnode_t *vp, vattr_t *vap, int flag, cred_t *cr)
829 {
830 	return (VOP_GETATTR(vp, vap, flag, cr));
831 }
832 
833 /*
834  * Place the actual cb_recall otw call to client.
835  */
836 static void
837 rfs4_do_cb_recall(rfs4_deleg_state_t *dsp, bool_t trunc)
838 {
839 	CB_COMPOUND4args	cb4_args;
840 	CB_COMPOUND4res		cb4_res;
841 	CB_RECALL4args		*rec_argp;
842 	nfs_cb_argop4		*argop;
843 	int			numops;
844 	int			argoplist_size;
845 	struct timeval		timeout;
846 	nfs_fh4			*fhp;
847 	enum clnt_stat		call_stat;
848 
849 	/*
850 	 * set up the compound args
851 	 */
852 	numops = 1;	/* CB_RECALL only */
853 
854 	argoplist_size = numops * sizeof (nfs_cb_argop4);
855 	argop = kmem_zalloc(argoplist_size, KM_SLEEP);
856 	argop->argop = OP_CB_RECALL;
857 	rec_argp = &argop->nfs_cb_argop4_u.opcbrecall;
858 
859 	(void) str_to_utf8("cb_recall", &cb4_args.tag);
860 	cb4_args.minorversion = CB4_MINORVERSION;
861 	/* cb4_args.callback_ident is set in rfs4_do_callback() */
862 	cb4_args.array_len = numops;
863 	cb4_args.array = argop;
864 
865 	/*
866 	 * fill in the args struct
867 	 */
868 	bcopy(&dsp->delegid.stateid, &rec_argp->stateid, sizeof (stateid4));
869 	rec_argp->truncate = trunc;
870 
871 	fhp = &dsp->finfo->filehandle;
872 	rec_argp->fh.nfs_fh4_val = kmem_alloc(sizeof (char) *
873 	    fhp->nfs_fh4_len, KM_SLEEP);
874 	nfs_fh4_copy(fhp, &rec_argp->fh);
875 
876 	/* Keep track of when we did this for observability */
877 	dsp->time_recalled = gethrestime_sec();
878 
879 	/*
880 	 * Set up the timeout for the callback and make the actual call.
881 	 * Timeout will be 80% of the lease period for this server.
882 	 */
883 	timeout.tv_sec = (rfs4_lease_time * 80) / 100;
884 	timeout.tv_usec = 0;
885 
886 	call_stat = rfs4_do_callback(dsp->client, &cb4_args, &cb4_res, timeout);
887 
888 	if (call_stat != RPC_SUCCESS || cb4_res.status != NFS4_OK) {
889 		rfs4_revoke_deleg(dsp);
890 	}
891 
892 	rfs4freeargres(&cb4_args, &cb4_res);
893 }
894 
/*
 * Argument handed to each do_recall() thread.  It is allocated by
 * do_recall_file() and freed by do_recall().
 */
struct recall_arg {
	rfs4_deleg_state_t *dsp;	/* delegation to recall (held) */
	void (*recall)(rfs4_deleg_state_t *, bool_t trunc); /* may be NULL */
	bool_t trunc;			/* passed through to recall() */
};
900 
/*
 * Thread body that recalls a single delegation.  One such thread is
 * created by do_recall_file() for each delegation on the file.
 *
 * The thread owns "arg" and a hold on arg->dsp; both are released
 * before returning.  On completion the file's recall_count is
 * decremented and recall_cv is signalled when it reaches zero, which
 * is what do_recall_file() waits for.
 */
static void
do_recall(struct recall_arg *arg)
{
	rfs4_deleg_state_t *dsp = arg->dsp;
	rfs4_file_t *fp = dsp->finfo;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	/* register this thread with the CPR callback framework */
	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recall");

	/*
	 * It is possible that before this thread starts
	 * the client has sent us a return_delegation, and
	 * if that is the case we do not need to send the
	 * recall callback.
	 */
	if (dsp->dtype != OPEN_DELEGATE_NONE) {
		DTRACE_PROBE3(nfss__i__recall,
		    struct recall_arg *, arg,
		    struct rfs4_deleg_state_t *, dsp,
		    struct rfs4_file_t *, fp);

		if (arg->recall)
			(void) (*arg->recall)(dsp, arg->trunc);
	}

	mutex_enter(fp->dinfo->recall_lock);
	/*
	 * Recall count may go negative if the parent thread that is
	 * creating the individual callback threads does not modify
	 * the recall_count field before the callback thread actually
	 * gets a response from the CB_RECALL
	 */
	fp->dinfo->recall_count--;
	if (fp->dinfo->recall_count == 0)
		cv_signal(fp->dinfo->recall_cv);
	mutex_exit(fp->dinfo->recall_lock);

	/*
	 * NOTE(review): CALLB_CPR_EXIT is entered with cpr_lock held and
	 * presumably drops it, since the lock is destroyed right after --
	 * confirm against the macro definition.
	 */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);

	rfs4_deleg_state_rele(dsp); /* release the hold for this thread */

	kmem_free(arg, sizeof (struct recall_arg));
}
948 
/*
 * Argument handed to the do_recall_file() thread.  It is allocated by
 * rfs4_recall_file() and freed by do_recall_file().
 */
struct master_recall_args {
    rfs4_file_t *fp;	/* file whose delegations are recalled (held) */
    void (*recall)(rfs4_deleg_state_t *, bool_t); /* per-delegation op */
    bool_t trunc;	/* passed through to recall() */
};
954 
/*
 * Thread body that recalls every delegation on a file.
 *
 * The thread owns "map" and the hold on map->fp taken by
 * rfs4_recall_file(); both are released on every return path.
 *
 * If a recall is already in progress for the file (recall_count is
 * non-zero) nothing is done.  Otherwise one do_recall() thread is
 * started per valid delegation on the file's delegation list, and this
 * thread then waits until all of them have finished, i.e. until
 * recall_count is back to zero.
 */
static void
do_recall_file(struct master_recall_args *map)
{
	rfs4_file_t *fp = map->fp;
	rfs4_deleg_state_t *dsp;
	struct recall_arg *arg;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;
	int32_t recall_count;

	rfs4_dbe_lock(fp->dbe);

	/* Recall already in progress ? */
	mutex_enter(fp->dinfo->recall_lock);
	if (fp->dinfo->recall_count != 0) {
		mutex_exit(fp->dinfo->recall_lock);
		/* drop this thread's hold; the dbe lock is still held */
		rfs4_dbe_rele_nolock(fp->dbe);
		rfs4_dbe_unlock(fp->dbe);
		kmem_free(map, sizeof (struct master_recall_args));
		return;
	}

	mutex_exit(fp->dinfo->recall_lock);

	/* register this thread with the CPR callback framework */
	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,	"v4RecallFile");

	/*
	 * Walk the file's delegation list with the file's dbe lock held.
	 * NOTE(review): the walk ends at the first entry whose dsp is
	 * NULL, presumably the list head -- confirm.
	 */
	recall_count = 0;
	for (dsp = fp->delegationlist.next->dsp; dsp != NULL;
	    dsp = dsp->delegationlist.next->dsp) {

		rfs4_dbe_lock(dsp->dbe);
		/*
		 * if this delegation state
		 * is being reaped skip it
		 */
		if (rfs4_dbe_is_invalid(dsp->dbe)) {
			rfs4_dbe_unlock(dsp->dbe);
			continue;
		}

		/* hold for receiving thread */
		rfs4_dbe_hold(dsp->dbe);
		rfs4_dbe_unlock(dsp->dbe);

		/* freed by do_recall() */
		arg = kmem_alloc(sizeof (struct recall_arg), KM_SLEEP);
		arg->recall = map->recall;
		arg->trunc = map->trunc;
		arg->dsp = dsp;

		/* counted locally; added to the shared count below */
		recall_count++;

		(void) thread_create(NULL, 0, do_recall, arg, 0, &p0, TS_RUN,
		    minclsyspri);
	}

	rfs4_dbe_unlock(fp->dbe);

	mutex_enter(fp->dinfo->recall_lock);
	/*
	 * Recall count may go negative if the parent thread that is
	 * creating the individual callback threads does not modify
	 * the recall_count field before the callback thread actually
	 * gets a response from the CB_RECALL
	 */
	fp->dinfo->recall_count += recall_count;
	/* wait for every do_recall() thread to check in */
	while (fp->dinfo->recall_count)
		cv_wait(fp->dinfo->recall_cv, fp->dinfo->recall_lock);

	mutex_exit(fp->dinfo->recall_lock);

	DTRACE_PROBE1(nfss__i__recall_done, rfs4_file_t *, fp);
	/* release the hold taken for this thread */
	rfs4_file_rele(fp);
	kmem_free(map, sizeof (struct master_recall_args));
	/*
	 * NOTE(review): CALLB_CPR_EXIT is entered with cpr_lock held and
	 * presumably drops it, since the lock is destroyed right after --
	 * confirm against the macro definition.
	 */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
}
1033 
1034 static void
1035 rfs4_recall_file(rfs4_file_t *fp,
1036 	void (*recall)(rfs4_deleg_state_t *, bool_t trunc),
1037 	bool_t trunc, rfs4_client_t *cp)
1038 {
1039 	struct master_recall_args *args;
1040 
1041 	rfs4_dbe_lock(fp->dbe);
1042 	if (fp->dinfo->dtype == OPEN_DELEGATE_NONE) {
1043 		rfs4_dbe_unlock(fp->dbe);
1044 		return;
1045 	}
1046 	rfs4_dbe_hold(fp->dbe);	/* hold for new thread */
1047 
1048 	/*
1049 	 * Mark the time we started the recall processing.
1050 	 * If it has been previously recalled, do not reset the
1051 	 * timer since this is used for the revocation decision.
1052 	 */
1053 	if (fp->dinfo->time_recalled == 0)
1054 		fp->dinfo->time_recalled = gethrestime_sec();
1055 	fp->dinfo->ever_recalled = TRUE; /* used for policy decision */
1056 	/* Client causing recall not always available */
1057 	if (cp)
1058 		fp->dinfo->conflicted_client = cp->clientid;
1059 
1060 	rfs4_dbe_unlock(fp->dbe);
1061 
1062 	args = kmem_alloc(sizeof (struct master_recall_args), KM_SLEEP);
1063 	args->fp = fp;
1064 	args->recall = recall;
1065 	args->trunc = trunc;
1066 
1067 	(void) thread_create(NULL, 0, do_recall_file, args, 0, &p0, TS_RUN,
1068 	    minclsyspri);
1069 }
1070 
1071 void
1072 rfs4_recall_deleg(rfs4_file_t *fp, bool_t trunc, rfs4_client_t *cp)
1073 {
1074 	time_t elapsed1, elapsed2;
1075 
1076 	if (fp->dinfo->time_recalled != 0) {
1077 		elapsed1 = gethrestime_sec() - fp->dinfo->time_recalled;
1078 		elapsed2 = gethrestime_sec() - fp->dinfo->time_lastwrite;
1079 		/* First check to see if a revocation should occur */
1080 		if (elapsed1 > rfs4_lease_time &&
1081 		    elapsed2 > rfs4_lease_time) {
1082 			rfs4_revoke_file(fp);
1083 			return;
1084 		}
1085 		/*
1086 		 * Next check to see if a recall should be done again
1087 		 * so quickly.
1088 		 */
1089 		if (elapsed1 <= ((rfs4_lease_time * 20) / 100))
1090 			return;
1091 	}
1092 	rfs4_recall_file(fp, rfs4_do_cb_recall, trunc, cp);
1093 }
1094 
1095 /*
1096  * rfs4_check_recall is called from rfs4_do_open to determine if the current
1097  * open conflicts with the delegation.
1098  * Return true if we need recall otherwise false.
1099  * Assumes entry locks for sp and sp->finfo are held.
1100  */
1101 bool_t
1102 rfs4_check_recall(rfs4_state_t *sp, uint32_t access)
1103 {
1104 	open_delegation_type4 dtype = sp->finfo->dinfo->dtype;
1105 
1106 	switch (dtype) {
1107 	case OPEN_DELEGATE_NONE:
1108 		/* Not currently delegated so there is nothing to do */
1109 		return (FALSE);
1110 	case OPEN_DELEGATE_READ:
1111 		/*
1112 		 * If the access is only asking for READ then there is
1113 		 * no conflict and nothing to do.  If it is asking
1114 		 * for write, then there will be conflict and the read
1115 		 * delegation should be recalled.
1116 		 */
1117 		if (access == OPEN4_SHARE_ACCESS_READ)
1118 			return (FALSE);
1119 		else
1120 			return (TRUE);
1121 	case OPEN_DELEGATE_WRITE:
1122 		/* Check to see if this client has the delegation */
1123 		return (rfs4_is_deleg(sp));
1124 	}
1125 
1126 	return (FALSE);
1127 }
1128 
1129 /*
1130  * Return the "best" allowable delegation available given the current
1131  * delegation type and the desired access and deny modes on the file.
1132  * At the point that this routine is called we know that the access and
1133  * deny modes are consistent with the file modes.
1134  */
1135 static open_delegation_type4
1136 rfs4_check_delegation(rfs4_state_t *sp, rfs4_file_t *fp)
1137 {
1138 	open_delegation_type4 dtype = fp->dinfo->dtype;
1139 	uint32_t access = sp->share_access;
1140 	uint32_t deny = sp->share_deny;
1141 	int readcnt = 0;
1142 	int writecnt = 0;
1143 
1144 	switch (dtype) {
1145 	case OPEN_DELEGATE_NONE:
1146 		/*
1147 		 * Determine if more than just this OPEN have the file
1148 		 * open and if so, no delegation may be provided to
1149 		 * the client.
1150 		 */
1151 		if (access & OPEN4_SHARE_ACCESS_WRITE)
1152 			writecnt++;
1153 		if (access & OPEN4_SHARE_ACCESS_READ)
1154 			readcnt++;
1155 
1156 		if (fp->access_read > readcnt || fp->access_write > writecnt)
1157 			return (OPEN_DELEGATE_NONE);
1158 
1159 		/*
1160 		 * If the client is going to write, or if the client
1161 		 * has exclusive access, return a write delegation.
1162 		 */
1163 		if ((access & OPEN4_SHARE_ACCESS_WRITE) ||
1164 		    (deny & (OPEN4_SHARE_DENY_READ | OPEN4_SHARE_DENY_WRITE)))
1165 			return (OPEN_DELEGATE_WRITE);
1166 		/*
1167 		 * If we don't want to write or we've haven't denied read
1168 		 * access to others, return a read delegation.
1169 		 */
1170 		if ((access & ~OPEN4_SHARE_ACCESS_WRITE) ||
1171 		    (deny & ~OPEN4_SHARE_DENY_READ))
1172 			return (OPEN_DELEGATE_READ);
1173 
1174 		/* Shouldn't get here */
1175 		return (OPEN_DELEGATE_NONE);
1176 
1177 	case OPEN_DELEGATE_READ:
1178 		/*
1179 		 * If the file is delegated for read but we wan't to
1180 		 * write or deny others to read then we can't delegate
1181 		 * the file. We shouldn't get here since the delegation should
1182 		 * have been recalled already.
1183 		 */
1184 		if ((access & OPEN4_SHARE_ACCESS_WRITE) ||
1185 		    (deny & OPEN4_SHARE_DENY_READ))
1186 			return (OPEN_DELEGATE_NONE);
1187 		return (OPEN_DELEGATE_READ);
1188 
1189 	case OPEN_DELEGATE_WRITE:
1190 		return (OPEN_DELEGATE_WRITE);
1191 	}
1192 
1193 	/* Shouldn't get here */
1194 	return (OPEN_DELEGATE_NONE);
1195 }
1196 
1197 /*
1198  * Given the desired delegation type and the "history" of the file
1199  * determine the actual delegation type to return.
1200  */
1201 static open_delegation_type4
1202 rfs4_delegation_policy(open_delegation_type4 dtype,
1203 	rfs4_dinfo_t *dinfo, clientid4 cid)
1204 {
1205 	time_t elapsed;
1206 
1207 	if (rfs4_deleg_policy != SRV_NORMAL_DELEGATE)
1208 		return (OPEN_DELEGATE_NONE);
1209 
1210 	/*
1211 	 * Has this file/delegation ever been recalled?  If not then
1212 	 * no furhter checks for a delegation race need to be done.
1213 	 * However if a recall has occurred, then check to see if a
1214 	 * client has caused its own delegation recall to occur.  If
1215 	 * not, then has a delegation for this file been returned
1216 	 * recently?  If so, then do not assign a new delegation to
1217 	 * avoid a "delegation race" between the original client and
1218 	 * the new/conflicting client.
1219 	 */
1220 	if (dinfo->ever_recalled == TRUE) {
1221 		if (dinfo->conflicted_client != cid) {
1222 			elapsed = gethrestime_sec() - dinfo->time_returned;
1223 			if (elapsed < rfs4_lease_time)
1224 				return (OPEN_DELEGATE_NONE);
1225 		}
1226 	}
1227 
1228 	/* Limit the number of read grants */
1229 	if (dtype == OPEN_DELEGATE_READ &&
1230 	    dinfo->rdgrants > MAX_READ_DELEGATIONS)
1231 		return (OPEN_DELEGATE_NONE);
1232 
1233 	/*
1234 	 * Should consider limiting total number of read/write
1235 	 * delegations the server will permit.
1236 	 */
1237 
1238 	return (dtype);
1239 }
1240 
1241 /*
1242  * Try and grant a delegation for an open give the state. The routine
1243  * returns the delegation type granted. This could be OPEN_DELEGATE_NONE.
1244  *
1245  * The state and associate file entry must be locked
1246  */
1247 rfs4_deleg_state_t *
1248 rfs4_grant_delegation(delegreq_t dreq, rfs4_state_t *sp, int *recall)
1249 {
1250 	rfs4_file_t *fp = sp->finfo;
1251 	open_delegation_type4 dtype;
1252 	int no_delegation;
1253 
1254 	ASSERT(rfs4_dbe_islocked(sp->dbe));
1255 	ASSERT(rfs4_dbe_islocked(fp->dbe));
1256 
1257 	/* Is the server even providing delegations? */
1258 	if (rfs4_deleg_policy == SRV_NEVER_DELEGATE || dreq == DELEG_NONE)
1259 		return (NULL);
1260 
1261 	/* Check to see if delegations have been temporarily disabled */
1262 	mutex_enter(&rfs4_deleg_lock);
1263 	no_delegation = rfs4_deleg_disabled;
1264 	mutex_exit(&rfs4_deleg_lock);
1265 
1266 	if (no_delegation)
1267 		return (NULL);
1268 
1269 	/* Don't grant a delegation if a deletion is impending. */
1270 	if (fp->dinfo->hold_grant > 0) {
1271 		return (NULL);
1272 	}
1273 
1274 	/*
1275 	 * Don't grant a delegation if there are any lock manager
1276 	 * (NFSv2/v3) locks for the file.  This is a bit of a hack (e.g.,
1277 	 * if there are only read locks we should be able to grant a
1278 	 * read-only delegation), but it's good enough for now.
1279 	 *
1280 	 * MT safety: the lock manager checks for conflicting delegations
1281 	 * before processing a lock request.  That check will block until
1282 	 * we are done here.  So if the lock manager acquires a lock after
1283 	 * we decide to grant the delegation, the delegation will get
1284 	 * immediately recalled (if there's a conflict), so we're safe.
1285 	 */
1286 	if (lm_vp_active(fp->vp)) {
1287 		return (NULL);
1288 	}
1289 
1290 	/*
1291 	 * Based on the type of delegation request passed in, take the
1292 	 * appropriate action (DELEG_NONE is handled above)
1293 	 */
1294 	switch (dreq) {
1295 
1296 	case DELEG_READ:
1297 	case DELEG_WRITE:
1298 		/*
1299 		 * The server "must" grant the delegation in this case.
1300 		 * Client is using open previous
1301 		 */
1302 		dtype = (open_delegation_type4)dreq;
1303 		*recall = 1;
1304 		break;
1305 	case DELEG_ANY:
1306 		/*
1307 		 * If a valid callback path does not exist, no delegation may
1308 		 * be granted.
1309 		 */
1310 		if (sp->owner->client->cbinfo.cb_state != CB_OK)
1311 			return (NULL);
1312 
1313 		/*
1314 		 * If the original operation which caused time_rm_delayed
1315 		 * to be set hasn't been retried and completed for one
1316 		 * full lease period, clear it and allow delegations to
1317 		 * get granted again.
1318 		 */
1319 		if (fp->dinfo->time_rm_delayed > 0 &&
1320 		    gethrestime_sec() >
1321 		    fp->dinfo->time_rm_delayed + rfs4_lease_time)
1322 			fp->dinfo->time_rm_delayed = 0;
1323 
1324 		/*
1325 		 * If we are waiting for a delegation to be returned then
1326 		 * don't delegate this file. We do this for correctness as
1327 		 * well as if the file is being recalled we would likely
1328 		 * recall this file again.
1329 		 */
1330 
1331 		if (fp->dinfo->time_recalled != 0 ||
1332 		    fp->dinfo->time_rm_delayed != 0)
1333 			return (NULL);
1334 
1335 		/* Get the "best" delegation candidate */
1336 		dtype = rfs4_check_delegation(sp, fp);
1337 
1338 		if (dtype == OPEN_DELEGATE_NONE)
1339 			return (NULL);
1340 
1341 		/*
1342 		 * Based on policy and the history of the file get the
1343 		 * actual delegation.
1344 		 */
1345 		dtype = rfs4_delegation_policy(dtype, fp->dinfo,
1346 		    sp->owner->client->clientid);
1347 
1348 		if (dtype == OPEN_DELEGATE_NONE)
1349 			return (NULL);
1350 		break;
1351 	default:
1352 		return (NULL);
1353 	}
1354 
1355 	/* set the delegation for the state */
1356 	return (rfs4_deleg_state(sp, dtype, recall));
1357 }
1358 
1359 void
1360 rfs4_set_deleg_response(rfs4_deleg_state_t *dsp, open_delegation4 *dp,
1361 			nfsace4 *ace,  int recall)
1362 {
1363 	open_write_delegation4 *wp;
1364 	open_read_delegation4 *rp;
1365 	nfs_space_limit4 *spl;
1366 	nfsace4 nace;
1367 
1368 	/*
1369 	 * We need to allocate a new copy of the who string.
1370 	 * this string will be freed by the rfs4_op_open dis_resfree
1371 	 * routine. We need to do this allocation since replays will
1372 	 * be allocated and rfs4_compound can't tell the difference from
1373 	 * a replay and an inital open. N.B. if an ace is passed in, it
1374 	 * the caller's responsibility to free it.
1375 	 */
1376 
1377 	if (ace == NULL) {
1378 		/*
1379 		 * Default is to deny all access, the client will have
1380 		 * to contact the server.  XXX Do we want to actually
1381 		 * set a deny for every one, or do we simply want to
1382 		 * construct an entity that will match no one?
1383 		 */
1384 		nace.type = ACE4_ACCESS_DENIED_ACE_TYPE;
1385 		nace.flag = 0;
1386 		nace.access_mask = ACE4_VALID_MASK_BITS;
1387 		(void) str_to_utf8(ACE4_WHO_EVERYONE, &nace.who);
1388 	} else {
1389 		nace.type = ace->type;
1390 		nace.flag = ace->flag;
1391 		nace.access_mask = ace->access_mask;
1392 		(void) utf8_copy(&ace->who, &nace.who);
1393 	}
1394 
1395 	dp->delegation_type = dsp->dtype;
1396 
1397 	switch (dsp->dtype) {
1398 	case OPEN_DELEGATE_NONE:
1399 		break;
1400 	case OPEN_DELEGATE_READ:
1401 		rp = &dp->open_delegation4_u.read;
1402 		rp->stateid = dsp->delegid.stateid;
1403 		rp->recall = (bool_t)recall;
1404 		rp->permissions = nace;
1405 		break;
1406 	case OPEN_DELEGATE_WRITE:
1407 		wp = &dp->open_delegation4_u.write;
1408 		wp->stateid = dsp->delegid.stateid;
1409 		wp->recall = (bool_t)recall;
1410 		spl = &wp->space_limit;
1411 		spl->limitby = NFS_LIMIT_SIZE;
1412 		spl->nfs_space_limit4_u.filesize = 0;
1413 		wp->permissions = nace;
1414 		break;
1415 	}
1416 }
1417 
1418 /*
1419  * Check if the file is delegated via the provided file struct.
1420  * Return TRUE if it is delegated.  This is intended for use by
1421  * the v4 server.  The v2/v3 server code should use rfs4_check_delegated().
1422  *
1423  * Note that if the file is found to have a delegation, it is
1424  * recalled, unless the clientid of the caller matches the clientid of the
1425  * delegation. If the caller has specified, there is a slight delay
1426  * inserted in the hopes that the delegation will be returned quickly.
1427  */
1428 bool_t
1429 rfs4_check_delegated_byfp(int mode, rfs4_file_t *fp,
1430 	bool_t trunc, bool_t do_delay, bool_t is_rm, clientid4 *cp)
1431 {
1432 	rfs4_deleg_state_t *dsp;
1433 
1434 	/* Is delegation enabled? */
1435 	if (rfs4_deleg_policy == SRV_NEVER_DELEGATE)
1436 		return (FALSE);
1437 
1438 	/* do we have a delegation on this file? */
1439 	rfs4_dbe_lock(fp->dbe);
1440 	if (fp->dinfo->dtype == OPEN_DELEGATE_NONE) {
1441 		if (is_rm)
1442 			fp->dinfo->hold_grant++;
1443 		rfs4_dbe_unlock(fp->dbe);
1444 		return (FALSE);
1445 	}
1446 	/*
1447 	 * do we have a write delegation on this file or are we
1448 	 * requesting write access to a file with any type of existing
1449 	 * delegation?
1450 	 */
1451 	if (mode == FWRITE || fp->dinfo->dtype == OPEN_DELEGATE_WRITE) {
1452 		if (cp != NULL) {
1453 			dsp = fp->delegationlist.next->dsp;
1454 			if (dsp == NULL) {
1455 				rfs4_dbe_unlock(fp->dbe);
1456 				return (FALSE);
1457 			}
1458 			/*
1459 			 * Does the requestor already own the delegation?
1460 			 */
1461 			if (dsp->client->clientid == *(cp)) {
1462 				rfs4_dbe_unlock(fp->dbe);
1463 				return (FALSE);
1464 			}
1465 		}
1466 
1467 		rfs4_dbe_unlock(fp->dbe);
1468 		rfs4_recall_deleg(fp, trunc, NULL);
1469 
1470 		if (!do_delay) {
1471 			rfs4_dbe_lock(fp->dbe);
1472 			fp->dinfo->time_rm_delayed = gethrestime_sec();
1473 			rfs4_dbe_unlock(fp->dbe);
1474 			return (TRUE);
1475 		}
1476 
1477 		delay(NFS4_DELEGATION_CONFLICT_DELAY);
1478 
1479 		rfs4_dbe_lock(fp->dbe);
1480 		if (fp->dinfo->dtype != OPEN_DELEGATE_NONE) {
1481 			fp->dinfo->time_rm_delayed = gethrestime_sec();
1482 			rfs4_dbe_unlock(fp->dbe);
1483 			return (TRUE);
1484 		}
1485 	}
1486 	if (is_rm)
1487 		fp->dinfo->hold_grant++;
1488 	rfs4_dbe_unlock(fp->dbe);
1489 	return (FALSE);
1490 }
1491 
1492 /*
1493  * Check if the file is delegated in the case of a v2 or v3 access.
1494  * Return TRUE if it is delegated which in turn means that v2 should
1495  * drop the request and in the case of v3 JUKEBOX should be returned.
1496  */
1497 bool_t
1498 rfs4_check_delegated(int mode, vnode_t *vp, bool_t trunc)
1499 {
1500 	rfs4_file_t *fp;
1501 	bool_t create = FALSE;
1502 	bool_t rc = FALSE;
1503 
1504 	rfs4_hold_deleg_policy();
1505 
1506 	/* Is delegation enabled? */
1507 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE) {
1508 		fp = rfs4_findfile(vp, NULL, &create);
1509 		if (fp != NULL) {
1510 			if (rfs4_check_delegated_byfp(mode, fp, trunc,
1511 			    TRUE, FALSE, NULL)) {
1512 				rc = TRUE;
1513 			}
1514 			rfs4_file_rele(fp);
1515 		}
1516 	}
1517 	rfs4_rele_deleg_policy();
1518 	return (rc);
1519 }
1520 
1521 /*
1522  * Release a hold on the hold_grant counter which
1523  * prevents delegation from being granted while a remove
1524  * or a rename is in progress.
1525  */
1526 void
1527 rfs4_clear_dont_grant(rfs4_file_t *fp)
1528 {
1529 	if (rfs4_deleg_policy == SRV_NEVER_DELEGATE)
1530 		return;
1531 	rfs4_dbe_lock(fp->dbe);
1532 	ASSERT(fp->dinfo->hold_grant > 0);
1533 	fp->dinfo->hold_grant--;
1534 	fp->dinfo->time_rm_delayed = 0;
1535 	rfs4_dbe_unlock(fp->dbe);
1536 }
1537 
1538 /*
1539  * State support for delegation.
1540  * Set the state delegation type for this state;
1541  * This routine is called from open via rfs4_grant_delegation and the entry
1542  * locks on sp and sp->finfo are assumed.
1543  */
1544 static rfs4_deleg_state_t *
1545 rfs4_deleg_state(rfs4_state_t *sp, open_delegation_type4 dtype, int *recall)
1546 {
1547 	rfs4_file_t *fp = sp->finfo;
1548 	bool_t create = TRUE;
1549 	rfs4_deleg_state_t *dsp;
1550 	vnode_t *vp;
1551 	int open_prev = *recall;
1552 
1553 	ASSERT(rfs4_dbe_islocked(sp->dbe));
1554 	ASSERT(rfs4_dbe_islocked(fp->dbe));
1555 
1556 	/* Shouldn't happen */
1557 	if (fp->dinfo->recall_count != 0 ||
1558 	    (fp->dinfo->dtype == OPEN_DELEGATE_READ &&
1559 	    dtype != OPEN_DELEGATE_READ)) {
1560 		return (NULL);
1561 	}
1562 
1563 	/* Unlock to avoid deadlock */
1564 	rfs4_dbe_unlock(fp->dbe);
1565 	rfs4_dbe_unlock(sp->dbe);
1566 
1567 	dsp = rfs4_finddeleg(sp, &create);
1568 
1569 	rfs4_dbe_lock(sp->dbe);
1570 	rfs4_dbe_lock(fp->dbe);
1571 
1572 	if (dsp == NULL)
1573 		return (NULL);
1574 
1575 	/*
1576 	 * It is possible that since we dropped the lock
1577 	 * in order to call finddeleg, the rfs4_file_t
1578 	 * was marked such that we should not grant a
1579 	 * delegation, if so bail out.
1580 	 */
1581 	if (fp->dinfo->hold_grant > 0) {
1582 		rfs4_deleg_state_rele(dsp);
1583 		return (NULL);
1584 	}
1585 
1586 	if (create == FALSE) {
1587 		if (sp->owner->client == dsp->client &&
1588 		    dsp->dtype == dtype) {
1589 			return (dsp);
1590 		} else {
1591 			rfs4_deleg_state_rele(dsp);
1592 			return (NULL);
1593 		}
1594 	}
1595 
1596 	/*
1597 	 * Check that this file has not been delegated to another
1598 	 * client
1599 	 */
1600 	if (fp->dinfo->recall_count != 0 ||
1601 	    fp->dinfo->dtype == OPEN_DELEGATE_WRITE ||
1602 	    (fp->dinfo->dtype == OPEN_DELEGATE_READ &&
1603 	    dtype != OPEN_DELEGATE_READ)) {
1604 		rfs4_deleg_state_rele(dsp);
1605 		return (NULL);
1606 	}
1607 
1608 	vp = fp->vp;
1609 	/* vnevent_support returns 0 if file system supports vnevents */
1610 	if (vnevent_support(vp)) {
1611 		rfs4_deleg_state_rele(dsp);
1612 		return (NULL);
1613 	}
1614 
1615 	*recall = 0;
1616 	if (dtype == OPEN_DELEGATE_READ) {
1617 		if (vn_is_opened(vp, V_WRITE) || vn_is_mapped(vp, V_WRITE)) {
1618 			if (open_prev) {
1619 				*recall = 1;
1620 			} else {
1621 				rfs4_deleg_state_rele(dsp);
1622 				return (NULL);
1623 			}
1624 		}
1625 		(void) fem_install(vp, deleg_rdops, (void *)fp, OPUNIQ,
1626 		    rfs4_mon_hold, rfs4_mon_rele);
1627 		if (vn_is_opened(vp, V_WRITE) || vn_is_mapped(vp, V_WRITE)) {
1628 			if (open_prev) {
1629 				*recall = 1;
1630 			} else {
1631 				(void) fem_uninstall(vp, deleg_rdops,
1632 				    (void *)fp);
1633 				rfs4_deleg_state_rele(dsp);
1634 				return (NULL);
1635 			}
1636 		}
1637 	} else { /* WRITE */
1638 		if (vn_is_opened(vp, V_RDORWR) || vn_is_mapped(vp, V_RDORWR)) {
1639 			if (open_prev) {
1640 				*recall = 1;
1641 			} else {
1642 				rfs4_deleg_state_rele(dsp);
1643 				return (NULL);
1644 			}
1645 		}
1646 		(void) fem_install(vp, deleg_wrops, (void *)fp, OPUNIQ,
1647 		    rfs4_mon_hold, rfs4_mon_rele);
1648 		if (vn_is_opened(vp, V_RDORWR) || vn_is_mapped(vp, V_RDORWR)) {
1649 			if (open_prev) {
1650 				*recall = 1;
1651 			} else {
1652 				(void) fem_uninstall(vp, deleg_wrops,
1653 				    (void *)fp);
1654 				rfs4_deleg_state_rele(dsp);
1655 				return (NULL);
1656 			}
1657 		}
1658 	}
1659 	/* Place on delegation list for file */
1660 	insque(&dsp->delegationlist, fp->delegationlist.prev);
1661 
1662 	dsp->dtype = fp->dinfo->dtype = dtype;
1663 
1664 	/* Update delegation stats for this file */
1665 	fp->dinfo->time_lastgrant = gethrestime_sec();
1666 
1667 	/* reset since this is a new delegation */
1668 	fp->dinfo->conflicted_client = 0;
1669 	fp->dinfo->ever_recalled = FALSE;
1670 
1671 	if (dtype == OPEN_DELEGATE_READ)
1672 		fp->dinfo->rdgrants++;
1673 	else
1674 		fp->dinfo->wrgrants++;
1675 
1676 	return (dsp);
1677 }
1678 
1679 /*
1680  * State routine for the server when a delegation is returned.
1681  */
1682 void
1683 rfs4_return_deleg(rfs4_deleg_state_t *dsp, bool_t revoked)
1684 {
1685 	rfs4_file_t *fp = dsp->finfo;
1686 	open_delegation_type4 dtypewas;
1687 
1688 	rfs4_dbe_lock(fp->dbe);
1689 	/* Remove state from recall list */
1690 
1691 	remque(&dsp->delegationlist);
1692 	dsp->delegationlist.next = dsp->delegationlist.prev =
1693 	    &dsp->delegationlist;
1694 
1695 	if (&fp->delegationlist == fp->delegationlist.next) {
1696 		dtypewas = fp->dinfo->dtype;
1697 		fp->dinfo->dtype = OPEN_DELEGATE_NONE;
1698 		rfs4_dbe_cv_broadcast(fp->dbe);
1699 
1700 		/* if file system was unshared, the vp will be NULL */
1701 		if (fp->vp != NULL) {
1702 			if (dtypewas == OPEN_DELEGATE_READ)
1703 				(void) fem_uninstall(fp->vp, deleg_rdops,
1704 				    (void *)fp);
1705 			else
1706 				(void) fem_uninstall(fp->vp, deleg_wrops,
1707 				    (void *)fp);
1708 		}
1709 	}
1710 
1711 	switch (dsp->dtype) {
1712 	case OPEN_DELEGATE_READ:
1713 		fp->dinfo->rdgrants--;
1714 		break;
1715 	case OPEN_DELEGATE_WRITE:
1716 		fp->dinfo->wrgrants--;
1717 		break;
1718 	default:
1719 		break;
1720 	}
1721 
1722 	/* used in the policy decision */
1723 	fp->dinfo->time_returned = gethrestime_sec();
1724 
1725 	/*
1726 	 * reset the time_recalled field so future delegations are not
1727 	 * accidentally revoked
1728 	 */
1729 	if ((fp->dinfo->rdgrants + fp->dinfo->wrgrants) == 0)
1730 		fp->dinfo->time_recalled = 0;
1731 
1732 	rfs4_dbe_unlock(fp->dbe);
1733 
1734 	rfs4_dbe_lock(dsp->dbe);
1735 
1736 	dsp->dtype = OPEN_DELEGATE_NONE;
1737 
1738 	if (revoked == TRUE)
1739 		dsp->time_revoked = gethrestime_sec();
1740 
1741 	rfs4_dbe_invalidate(dsp->dbe);
1742 
1743 	rfs4_dbe_unlock(dsp->dbe);
1744 
1745 	if (revoked == TRUE) {
1746 		rfs4_dbe_lock(dsp->client->dbe);
1747 		dsp->client->deleg_revoked++;	/* observability */
1748 		rfs4_dbe_unlock(dsp->client->dbe);
1749 	}
1750 }
1751 
1752 static void
1753 rfs4_revoke_deleg(rfs4_deleg_state_t *dsp)
1754 {
1755 	rfs4_return_deleg(dsp, TRUE);
1756 }
1757 
1758 static void
1759 rfs4_revoke_file(rfs4_file_t *fp)
1760 {
1761 	rfs4_deleg_state_t *dsp;
1762 
1763 	/*
1764 	 * The lock for rfs4_file_t must be held when traversing the
1765 	 * delegation list but that lock needs to be released to call
1766 	 * rfs4_revoke_deleg()
1767 	 * This for loop is set up to check the list for being empty,
1768 	 * and locking the rfs4_file_t struct on init and end
1769 	 */
1770 	for (rfs4_dbe_lock(fp->dbe);
1771 	    &fp->delegationlist != fp->delegationlist.next;
1772 	    rfs4_dbe_lock(fp->dbe)) {
1773 
1774 		dsp = fp->delegationlist.next->dsp;
1775 		rfs4_dbe_hold(dsp->dbe);
1776 		rfs4_dbe_unlock(fp->dbe);
1777 		rfs4_revoke_deleg(dsp);
1778 		rfs4_deleg_state_rele(dsp);
1779 	}
1780 	rfs4_dbe_unlock(fp->dbe);
1781 }
1782 
1783 /*
1784  * A delegation is assumed to be present on the file associated with
1785  * "state".  Check to see if the delegation matches is associated with
1786  * the same client as referenced by "state".  If it is not, TRUE is
1787  * returned.  If the delegation DOES match the client (or no
1788  * delegation is present), return FALSE.
1789  * Assume the state entry and file entry are locked.
1790  */
1791 bool_t
1792 rfs4_is_deleg(rfs4_state_t *state)
1793 {
1794 	rfs4_deleg_state_t *dsp;
1795 	rfs4_file_t *fp = state->finfo;
1796 	rfs4_client_t *cp = state->owner->client;
1797 
1798 	ASSERT(rfs4_dbe_islocked(fp->dbe));
1799 	for (dsp = fp->delegationlist.next->dsp; dsp != NULL;
1800 	    dsp = dsp->delegationlist.next->dsp) {
1801 		if (cp != dsp->client) {
1802 			return (TRUE);
1803 		}
1804 	}
1805 	return (FALSE);
1806 }
1807 
1808 void
1809 rfs4_disable_delegation(void)
1810 {
1811 	mutex_enter(&rfs4_deleg_lock);
1812 	rfs4_deleg_disabled++;
1813 	mutex_exit(&rfs4_deleg_lock);
1814 }
1815 
1816 void
1817 rfs4_enable_delegation(void)
1818 {
1819 	mutex_enter(&rfs4_deleg_lock);
1820 	ASSERT(rfs4_deleg_disabled > 0);
1821 	rfs4_deleg_disabled--;
1822 	mutex_exit(&rfs4_deleg_lock);
1823 }
1824 
1825 void
1826 rfs4_mon_hold(void *arg)
1827 {
1828 	rfs4_file_t *fp = arg;
1829 
1830 	rfs4_dbe_hold(fp->dbe);
1831 }
1832 
1833 void
1834 rfs4_mon_rele(void *arg)
1835 {
1836 	rfs4_file_t *fp = arg;
1837 
1838 	rfs4_dbe_rele_nolock(fp->dbe);
1839 }
1840