1 /*
2  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
3  * Authors: Doug Rabson <dfr@rabson.org>
4  * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Copyright (c) 2012 by Delphix. All rights reserved.
30  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
31  * Copyright 2014 Joyent, Inc.  All rights reserved.
32  */
33 
34 /*
35  * NFS Lock Manager service functions (nlm_do_...)
36  * Called from nlm_rpc_svc.c wrappers.
37  *
38  * Source code derived from FreeBSD nlm_prot_impl.c
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/thread.h>
44 #include <sys/fcntl.h>
45 #include <sys/flock.h>
46 #include <sys/mount.h>
47 #include <sys/priv.h>
48 #include <sys/proc.h>
49 #include <sys/share.h>
50 #include <sys/socket.h>
51 #include <sys/syscall.h>
52 #include <sys/syslog.h>
53 #include <sys/systm.h>
54 #include <sys/taskq.h>
55 #include <sys/unistd.h>
56 #include <sys/vnode.h>
57 #include <sys/vfs.h>
58 #include <sys/queue.h>
59 #include <sys/sdt.h>
60 #include <netinet/in.h>
61 
62 #include <rpc/rpc.h>
63 #include <rpc/xdr.h>
64 #include <rpc/pmap_prot.h>
65 #include <rpc/pmap_clnt.h>
66 #include <rpc/rpcb_prot.h>
67 
68 #include <rpcsvc/nlm_prot.h>
69 #include <rpcsvc/sm_inter.h>
70 
71 #include <nfs/nfs.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/export.h>
74 #include <nfs/rnode.h>
75 
76 #include "nlm_impl.h"
77 
/*
 * True while the current lbolt tick count is still below the
 * grace_threshold recorded in the given NLM globals.
 */
#define	NLM_IN_GRACE(g) ((g)->grace_threshold > ddi_get_lbolt())
79 
/*
 * Argument bundle that nlm_block() hands to nlm_block_callback()
 * via flk_init_callback().  It names the sleeping lock request
 * the callback passes to nlm_slreq_unregister().
 */
struct nlm_block_cb_data {
	struct nlm_host *hostp;		/* host that requested the lock */
	struct nlm_vhold *nvp;		/* vhold of the file being locked */
	struct flock64 *flp;		/* local form of the lock request */
};
85 
/*
 * Invoke an asynchronous RPC callback
 * (used when the NLM server needs to reply to a MSG NLM procedure).
 *
 * The callback is called with the response, a NULL result pointer
 * and the client handle taken from rpcp.  Any outcome other than
 * RPC_SUCCESS or RPC_TIMEDOUT is logged together with the error
 * number fetched from the handle; nothing is reported to the caller.
 */
#define	NLM_INVOKE_CALLBACK(descr, rpcp, resp, callb)			\
	do {								\
		enum clnt_stat _cb_stat;				\
									\
		_cb_stat = (*(callb))(resp, NULL, (rpcp)->nr_handle);	\
		if (_cb_stat != RPC_SUCCESS &&				\
		    _cb_stat != RPC_TIMEDOUT) {				\
			struct rpc_err _cb_err;				\
									\
			CLNT_GETERR((rpcp)->nr_handle, &_cb_err);	\
			NLM_ERR("NLM: %s callback failed: "		\
			    "stat %d, err %d\n", descr, _cb_stat,	\
			    _cb_err.re_errno);				\
		}							\
									\
	_NOTE(CONSTCOND) } while (0)
105 
106 static void nlm_block(
107 	nlm4_lockargs *lockargs,
108 	struct nlm_host *host,
109 	struct nlm_vhold *nvp,
110 	nlm_rpc_t *rpcp,
111 	struct flock64 *fl,
112 	nlm_testargs_cb grant_cb);
113 
114 static vnode_t *nlm_fh_to_vp(struct netobj *);
115 static struct nlm_vhold *nlm_fh_to_vhold(struct nlm_host *, struct netobj *);
116 static void nlm_init_shrlock(struct shrlock *, nlm4_share *, struct nlm_host *);
117 static callb_cpr_t *nlm_block_callback(flk_cb_when_t, void *);
118 static int nlm_vop_frlock(vnode_t *, int, flock64_t *, int, offset_t,
119     struct flk_callback *, cred_t *, caller_context_t *);
120 
121 /*
122  * Convert a lock from network to local form, and
123  * check for valid range (no overflow).
124  */
125 static int
126 nlm_init_flock(struct flock64 *fl, struct nlm4_lock *nl,
127 	struct nlm_host *host, rpcvers_t vers, short type)
128 {
129 	uint64_t off, len;
130 
131 	bzero(fl, sizeof (*fl));
132 	off = nl->l_offset;
133 	len = nl->l_len;
134 
135 	if (vers < NLM4_VERS) {
136 		if (off > MAX_UOFF32 || len > MAX_UOFF32)
137 			return (EINVAL);
138 		if (off + len > MAX_UOFF32 + 1)
139 			return (EINVAL);
140 	} else {
141 		/*
142 		 * Check range for 64-bit client (no overflow).
143 		 * Again allow len == ~0 to mean lock to EOF.
144 		 */
145 		if (len == MAX_U_OFFSET_T)
146 			len = 0;
147 		if (len != 0 && off + (len - 1) < off)
148 			return (EINVAL);
149 	}
150 
151 	fl->l_type = type;
152 	fl->l_whence = SEEK_SET;
153 	fl->l_start = off;
154 	fl->l_len = len;
155 	fl->l_sysid = host->nh_sysid;
156 	fl->l_pid = nl->svid;
157 	/* l_pad */
158 
159 	return (0);
160 }
161 
162 /*
163  * Convert an fhandle into a vnode.
164  * Uses the file id (fh_len + fh_data) in the fhandle to get the vnode.
165  * WARNING: users of this routine must do a VN_RELE on the vnode when they
166  * are done with it.
167  * This is just like nfs_fhtovp() but without the exportinfo argument.
168  */
169 static vnode_t *
170 lm_fhtovp(fhandle3_t *fh)
171 {
172 	vfs_t *vfsp;
173 	vnode_t *vp;
174 	int error;
175 
176 	vfsp = getvfs(&fh->_fh3_fsid);
177 	if (vfsp == NULL)
178 		return (NULL);
179 
180 	/* LINTED E_BAD_PTR_CAST_ALIGN */
181 	error = VFS_VGET(vfsp, &vp, (fid_t *)&(fh->_fh3_len));
182 	VFS_RELE(vfsp);
183 	if (error || vp == NULL)
184 		return (NULL);
185 
186 	return (vp);
187 }
188 
189 /*
190  * Gets vnode from client's filehandle
191  * NOTE: Holds vnode, it _must_ be explicitly
192  * released by VN_RELE().
193  */
194 static vnode_t *
195 nlm_fh_to_vp(struct netobj *fh)
196 {
197 	fhandle3_t *fhp;
198 
199 	/*
200 	 * Get a vnode pointer for the given NFS file handle.
201 	 * Note that it could be an NFSv2 or NFSv3 handle,
202 	 * which means the size might vary.  (don't copy)
203 	 */
204 	if (fh->n_len < sizeof (fhandle_t))
205 		return (NULL);
206 
207 	/* We know this is aligned (kmem_alloc) */
208 	/* LINTED E_BAD_PTR_CAST_ALIGN */
209 	fhp = (fhandle3_t *)fh->n_bytes;
210 
211 	/*
212 	 * See the comment for NFS_FH3MAXDATA in uts/common/nfs/nfs.h for
213 	 * converting fhandles. Check the NFSv3 file handle size. The lockmgr
214 	 * is not used for NFS v4.
215 	 */
216 	if (fhp->_fh3_len > NFS_FH3MAXDATA || fhp->_fh3_len == 0)
217 		return (NULL);
218 
219 	return (lm_fhtovp(fhp));
220 }
221 
222 /*
223  * Get vhold from client's filehandle, but in contrast to
224  * The function tries to check some access rights as well.
225  *
226  * NOTE: vhold object _must_ be explicitly released by
227  * nlm_vhold_release().
228  */
229 static struct nlm_vhold *
230 nlm_fh_to_vhold(struct nlm_host *hostp, struct netobj *fh)
231 {
232 	vnode_t *vp;
233 	struct nlm_vhold *nvp;
234 
235 	vp = nlm_fh_to_vp(fh);
236 	if (vp == NULL)
237 		return (NULL);
238 
239 
240 	nvp = nlm_vhold_get(hostp, vp);
241 
242 	/*
243 	 * Both nlm_fh_to_vp() and nlm_vhold_get()
244 	 * do VN_HOLD(), so we need to drop one
245 	 * reference on vnode.
246 	 */
247 	VN_RELE(vp);
248 	return (nvp);
249 }
250 
251 /* ******************************************************************* */
252 
253 /*
254  * NLM implementation details, called from the RPC svc code.
255  */
256 
257 /*
258  * Call-back from NFS statd, used to notify that one of our
259  * hosts had a status change. The host can be either an
260  * NFS client, NFS server or both.
261  * According to NSM protocol description, the state is a
262  * number that is increases monotonically each time the
263  * state of host changes. An even number indicates that
264  * the host is down, while an odd number indicates that
265  * the host is up.
266  *
267  * Here we ignore this even/odd difference of status number
268  * reported by the NSM, we launch notification handlers
269  * every time the state is changed. The reason we why do so
270  * is that client and server can talk to each other using
271  * connectionless transport and it's easy to lose packet
272  * containing NSM notification with status number update.
273  *
274  * In nlm_host_monitor(), we put the sysid in the private data
275  * that statd carries in this callback, so we can easliy find
276  * the host this call applies to.
277  */
278 /* ARGSUSED */
279 void
280 nlm_do_notify1(nlm_sm_status *argp, void *res, struct svc_req *sr)
281 {
282 	struct nlm_globals *g;
283 	struct nlm_host *host;
284 	uint16_t sysid;
285 
286 	g = zone_getspecific(nlm_zone_key, curzone);
287 	bcopy(&argp->priv, &sysid, sizeof (sysid));
288 
289 	DTRACE_PROBE2(nsm__notify, uint16_t, sysid,
290 	    int, argp->state);
291 
292 	host = nlm_host_find_by_sysid(g, (sysid_t)sysid);
293 	if (host == NULL)
294 		return;
295 
296 	nlm_host_notify_server(host, argp->state);
297 	nlm_host_notify_client(host, argp->state);
298 	nlm_host_release(g, host);
299 }
300 
301 /*
302  * Another available call-back for NFS statd.
303  * Not currently used.
304  */
305 /* ARGSUSED */
306 void
307 nlm_do_notify2(nlm_sm_status *argp, void *res, struct svc_req *sr)
308 {
309 	ASSERT(0);
310 }
311 
312 
313 /*
314  * NLM_TEST, NLM_TEST_MSG,
315  * NLM4_TEST, NLM4_TEST_MSG,
316  * Client inquiry about locks, non-blocking.
317  */
318 void
319 nlm_do_test(nlm4_testargs *argp, nlm4_testres *resp,
320     struct svc_req *sr, nlm_testres_cb cb)
321 {
322 	struct nlm_globals *g;
323 	struct nlm_host *host;
324 	struct nlm4_holder *lh;
325 	struct nlm_owner_handle *oh;
326 	nlm_rpc_t *rpcp = NULL;
327 	vnode_t *vp = NULL;
328 	struct netbuf *addr;
329 	char *netid;
330 	char *name;
331 	int error;
332 	struct flock64 fl;
333 
334 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
335 
336 	name = argp->alock.caller_name;
337 	netid = svc_getnetid(sr->rq_xprt);
338 	addr = svc_getrpccaller(sr->rq_xprt);
339 
340 	g = zone_getspecific(nlm_zone_key, curzone);
341 	host = nlm_host_findcreate(g, name, netid, addr);
342 	if (host == NULL) {
343 		resp->stat.stat = nlm4_denied_nolocks;
344 		return;
345 	}
346 	if (cb != NULL) {
347 		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
348 		if (error != 0) {
349 			resp->stat.stat = nlm4_denied_nolocks;
350 			goto out;
351 		}
352 	}
353 
354 	vp = nlm_fh_to_vp(&argp->alock.fh);
355 	if (vp == NULL) {
356 		resp->stat.stat = nlm4_stale_fh;
357 		goto out;
358 	}
359 
360 	if (NLM_IN_GRACE(g)) {
361 		resp->stat.stat = nlm4_denied_grace_period;
362 		goto out;
363 	}
364 
365 	/* Convert to local form. */
366 	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers,
367 	    (argp->exclusive) ? F_WRLCK : F_RDLCK);
368 	if (error) {
369 		resp->stat.stat = nlm4_failed;
370 		goto out;
371 	}
372 
373 	/* BSD: VOP_ADVLOCK(nv->nv_vp, NULL, F_GETLK, &fl, F_REMOTE); */
374 	error = nlm_vop_frlock(vp, F_GETLK, &fl,
375 	    F_REMOTELOCK | FREAD | FWRITE,
376 	    (u_offset_t)0, NULL, CRED(), NULL);
377 	if (error) {
378 		resp->stat.stat = nlm4_failed;
379 		goto out;
380 	}
381 
382 	if (fl.l_type == F_UNLCK) {
383 		resp->stat.stat = nlm4_granted;
384 		goto out;
385 	}
386 	resp->stat.stat = nlm4_denied;
387 
388 	/*
389 	 * This lock "test" fails due to a conflicting lock.
390 	 *
391 	 * If this is a v1 client, make sure the conflicting
392 	 * lock range we report can be expressed with 32-bit
393 	 * offsets.  The lock range requested was expressed
394 	 * as 32-bit offset and length, so at least part of
395 	 * the conflicting lock should lie below MAX_UOFF32.
396 	 * If the conflicting lock extends past that, we'll
397 	 * trim the range to end at MAX_UOFF32 so this lock
398 	 * can be represented in a 32-bit response.  Check
399 	 * the start also (paranoid, but a low cost check).
400 	 */
401 	if (sr->rq_vers < NLM4_VERS) {
402 		uint64 maxlen;
403 		if (fl.l_start > MAX_UOFF32)
404 			fl.l_start = MAX_UOFF32;
405 		maxlen = MAX_UOFF32 + 1 - fl.l_start;
406 		if (fl.l_len > maxlen)
407 			fl.l_len = maxlen;
408 	}
409 
410 	/*
411 	 * Build the nlm4_holder result structure.
412 	 *
413 	 * Note that lh->oh is freed via xdr_free,
414 	 * xdr_nlm4_holder, xdr_netobj, xdr_bytes.
415 	 */
416 	oh = kmem_zalloc(sizeof (*oh), KM_SLEEP);
417 	oh->oh_sysid = (sysid_t)fl.l_sysid;
418 	lh = &resp->stat.nlm4_testrply_u.holder;
419 	lh->exclusive = (fl.l_type == F_WRLCK);
420 	lh->svid = fl.l_pid;
421 	lh->oh.n_len = sizeof (*oh);
422 	lh->oh.n_bytes = (void *)oh;
423 	lh->l_offset = fl.l_start;
424 	lh->l_len = fl.l_len;
425 
426 out:
427 	/*
428 	 * If we have a callback function, use that to
429 	 * deliver the response via another RPC call.
430 	 */
431 	if (cb != NULL && rpcp != NULL)
432 		NLM_INVOKE_CALLBACK("test", rpcp, resp, cb);
433 
434 	if (vp != NULL)
435 		VN_RELE(vp);
436 	if (rpcp != NULL)
437 		nlm_host_rele_rpc(host, rpcp);
438 
439 	nlm_host_release(g, host);
440 }
441 
442 /*
443  * NLM_LOCK, NLM_LOCK_MSG, NLM_NM_LOCK
444  * NLM4_LOCK, NLM4_LOCK_MSG, NLM4_NM_LOCK
445  *
446  * Client request to set a lock, possibly blocking.
447  *
448  * If the lock needs to block, we return status blocked to
449  * this RPC call, and then later call back the client with
450  * a "granted" callback.  Tricky aspects of this include:
451  * sending a reply before this function returns, and then
452  * borrowing this thread from the RPC service pool for the
453  * wait on the lock and doing the later granted callback.
454  *
455  * We also have to keep a list of locks (pending + granted)
456  * both to handle retransmitted requests, and to keep the
457  * vnodes for those locks active.
458  */
459 void
460 nlm_do_lock(nlm4_lockargs *argp, nlm4_res *resp, struct svc_req *sr,
461     nlm_reply_cb reply_cb, nlm_res_cb res_cb, nlm_testargs_cb grant_cb)
462 {
463 	struct nlm_globals *g;
464 	struct flock64 fl;
465 	struct nlm_host *host = NULL;
466 	struct netbuf *addr;
467 	struct nlm_vhold *nvp = NULL;
468 	nlm_rpc_t *rpcp = NULL;
469 	char *netid;
470 	char *name;
471 	int error, flags;
472 	bool_t do_blocking = FALSE;
473 	bool_t do_mon_req = FALSE;
474 	enum nlm4_stats status;
475 
476 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
477 
478 	name = argp->alock.caller_name;
479 	netid = svc_getnetid(sr->rq_xprt);
480 	addr = svc_getrpccaller(sr->rq_xprt);
481 
482 	g = zone_getspecific(nlm_zone_key, curzone);
483 	host = nlm_host_findcreate(g, name, netid, addr);
484 	if (host == NULL) {
485 		DTRACE_PROBE4(no__host, struct nlm_globals *, g,
486 		    char *, name, char *, netid, struct netbuf *, addr);
487 		status = nlm4_denied_nolocks;
488 		goto doreply;
489 	}
490 
491 	DTRACE_PROBE3(start, struct nlm_globals *, g,
492 	    struct nlm_host *, host, nlm4_lockargs *, argp);
493 
494 	/*
495 	 * If we may need to do _msg_ call needing an RPC
496 	 * callback, get the RPC client handle now,
497 	 * so we know if we can bind to the NLM service on
498 	 * this client.
499 	 *
500 	 * Note: host object carries transport type.
501 	 * One client using multiple transports gets
502 	 * separate sysids for each of its transports.
503 	 */
504 	if (res_cb != NULL || (grant_cb != NULL && argp->block == TRUE)) {
505 		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
506 		if (error != 0) {
507 			status = nlm4_denied_nolocks;
508 			goto doreply;
509 		}
510 	}
511 
512 	/*
513 	 * During the "grace period", only allow reclaim.
514 	 */
515 	if (argp->reclaim == 0 && NLM_IN_GRACE(g)) {
516 		status = nlm4_denied_grace_period;
517 		goto doreply;
518 	}
519 
520 	/*
521 	 * Check whether we missed host shutdown event
522 	 */
523 	if (nlm_host_get_state(host) != argp->state)
524 		nlm_host_notify_server(host, argp->state);
525 
526 	/*
527 	 * Get a hold on the vnode for a lock operation.
528 	 * Only lock() and share() need vhold objects.
529 	 */
530 	nvp = nlm_fh_to_vhold(host, &argp->alock.fh);
531 	if (nvp == NULL) {
532 		status = nlm4_stale_fh;
533 		goto doreply;
534 	}
535 
536 	/* Convert to local form. */
537 	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers,
538 	    (argp->exclusive) ? F_WRLCK : F_RDLCK);
539 	if (error) {
540 		status = nlm4_failed;
541 		goto doreply;
542 	}
543 
544 	/*
545 	 * Try to lock non-blocking first.  If we succeed
546 	 * getting the lock, we can reply with the granted
547 	 * status directly and avoid the complications of
548 	 * making the "granted" RPC callback later.
549 	 *
550 	 * This also let's us find out now about some
551 	 * possible errors like EROFS, etc.
552 	 */
553 	flags = F_REMOTELOCK | FREAD | FWRITE;
554 	error = nlm_vop_frlock(nvp->nv_vp, F_SETLK, &fl, flags,
555 	    (u_offset_t)0, NULL, CRED(), NULL);
556 
557 	DTRACE_PROBE3(setlk__res, struct flock64 *, &fl,
558 	    int, flags, int, error);
559 
560 	switch (error) {
561 	case 0:
562 		/* Got it without waiting! */
563 		status = nlm4_granted;
564 		do_mon_req = TRUE;
565 		break;
566 
567 	/* EINPROGRESS too? */
568 	case EAGAIN:
569 		/* We did not get the lock. Should we block? */
570 		if (argp->block == FALSE || grant_cb == NULL) {
571 			status = nlm4_denied;
572 			break;
573 		}
574 		/*
575 		 * Should block.  Try to reserve this thread
576 		 * so we can use it to wait for the lock and
577 		 * later send the granted message.  If this
578 		 * reservation fails, say "no resources".
579 		 */
580 		if (!svc_reserve_thread(sr->rq_xprt)) {
581 			status = nlm4_denied_nolocks;
582 			break;
583 		}
584 		/*
585 		 * OK, can detach this thread, so this call
586 		 * will block below (after we reply).
587 		 */
588 		status = nlm4_blocked;
589 		do_blocking = TRUE;
590 		do_mon_req = TRUE;
591 		break;
592 
593 	case ENOLCK:
594 		/* Failed for lack of resources. */
595 		status = nlm4_denied_nolocks;
596 		break;
597 
598 	case EROFS:
599 		/* read-only file system */
600 		status = nlm4_rofs;
601 		break;
602 
603 	case EFBIG:
604 		/* file too big */
605 		status = nlm4_fbig;
606 		break;
607 
608 	case EDEADLK:
609 		/* dead lock condition */
610 		status = nlm4_deadlck;
611 		break;
612 
613 	default:
614 		status = nlm4_denied;
615 		break;
616 	}
617 
618 doreply:
619 	resp->stat.stat = status;
620 
621 	/*
622 	 * We get one of two function pointers; one for a
623 	 * normal RPC reply, and another for doing an RPC
624 	 * "callback" _res reply for a _msg function.
625 	 * Use either of those to send the reply now.
626 	 *
627 	 * If sending this reply fails, just leave the
628 	 * lock in the list for retransmitted requests.
629 	 * Cleanup is via unlock or host rele (statmon).
630 	 */
631 	if (reply_cb != NULL) {
632 		/* i.e. nlm_lock_1_reply */
633 		if (!(*reply_cb)(sr->rq_xprt, resp))
634 			svcerr_systemerr(sr->rq_xprt);
635 	}
636 	if (res_cb != NULL && rpcp != NULL)
637 		NLM_INVOKE_CALLBACK("lock", rpcp, resp, res_cb);
638 
639 	/*
640 	 * The reply has been sent to the client.
641 	 * Start monitoring this client (maybe).
642 	 *
643 	 * Note that the non-monitored (NM) calls pass grant_cb=NULL
644 	 * indicating that the client doesn't support RPC callbacks.
645 	 * No monitoring for these (lame) clients.
646 	 */
647 	if (do_mon_req && grant_cb != NULL)
648 		nlm_host_monitor(g, host, argp->state);
649 
650 	if (do_blocking) {
651 		/*
652 		 * We need to block on this lock, and when that
653 		 * completes, do the granted RPC call. Note that
654 		 * we "reserved" this thread above, so we can now
655 		 * "detach" it from the RPC SVC pool, allowing it
656 		 * to block indefinitely if needed.
657 		 */
658 		ASSERT(rpcp != NULL);
659 		(void) svc_detach_thread(sr->rq_xprt);
660 		nlm_block(argp, host, nvp, rpcp, &fl, grant_cb);
661 	}
662 
663 	DTRACE_PROBE3(lock__end, struct nlm_globals *, g,
664 	    struct nlm_host *, host, nlm4_res *, resp);
665 
666 	if (rpcp != NULL)
667 		nlm_host_rele_rpc(host, rpcp);
668 
669 	nlm_vhold_release(host, nvp);
670 	nlm_host_release(g, host);
671 }
672 
673 /*
674  * Helper for nlm_do_lock(), partly for observability,
675  * (we'll see a call blocked in this function) and
676  * because nlm_do_lock() was getting quite long.
677  */
678 static void
679 nlm_block(nlm4_lockargs *lockargs,
680     struct nlm_host *host,
681     struct nlm_vhold *nvp,
682     nlm_rpc_t *rpcp,
683     struct flock64 *flp,
684     nlm_testargs_cb grant_cb)
685 {
686 	nlm4_testargs args;
687 	int error;
688 	flk_callback_t flk_cb;
689 	struct nlm_block_cb_data cb_data;
690 
691 	/*
692 	 * Keep a list of blocked locks on nh_pending, and use it
693 	 * to cancel these threads in nlm_destroy_client_pending.
694 	 *
695 	 * Check to see if this lock is already in the list
696 	 * and if not, add an entry for it.  Allocate first,
697 	 * then if we don't insert, free the new one.
698 	 * Caller already has vp held.
699 	 */
700 
701 	error = nlm_slreq_register(host, nvp, flp);
702 	if (error != 0) {
703 		/*
704 		 * Sleeping lock request with given fl is already
705 		 * registered by someone else. This means that
706 		 * some other thread is handling the request, let
707 		 * him to do its work.
708 		 */
709 		ASSERT(error == EEXIST);
710 		return;
711 	}
712 
713 	cb_data.hostp = host;
714 	cb_data.nvp = nvp;
715 	cb_data.flp = flp;
716 	flk_init_callback(&flk_cb, nlm_block_callback, &cb_data);
717 
718 	/* BSD: VOP_ADVLOCK(vp, NULL, F_SETLK, fl, F_REMOTE); */
719 	error = nlm_vop_frlock(nvp->nv_vp, F_SETLKW, flp,
720 	    F_REMOTELOCK | FREAD | FWRITE,
721 	    (u_offset_t)0, &flk_cb, CRED(), NULL);
722 
723 	if (error != 0) {
724 		/*
725 		 * We failed getting the lock, but have no way to
726 		 * tell the client about that.  Let 'em time out.
727 		 */
728 		(void) nlm_slreq_unregister(host, nvp, flp);
729 		return;
730 	}
731 
732 	/*
733 	 * Do the "granted" call-back to the client.
734 	 */
735 	args.cookie	= lockargs->cookie;
736 	args.exclusive	= lockargs->exclusive;
737 	args.alock	= lockargs->alock;
738 
739 	NLM_INVOKE_CALLBACK("grant", rpcp, &args, grant_cb);
740 }
741 
742 /*
743  * The function that is used as flk callback when NLM server
744  * sets new sleeping lock. The function unregisters NLM
745  * sleeping lock request (nlm_slreq) associated with the
746  * sleeping lock _before_ lock becomes active. It prevents
747  * potential race condition between nlm_block() and
748  * nlm_do_cancel().
749  */
750 static callb_cpr_t *
751 nlm_block_callback(flk_cb_when_t when, void *data)
752 {
753 	struct nlm_block_cb_data *cb_data;
754 
755 	cb_data = (struct nlm_block_cb_data *)data;
756 	if (when == FLK_AFTER_SLEEP) {
757 		(void) nlm_slreq_unregister(cb_data->hostp,
758 		    cb_data->nvp, cb_data->flp);
759 	}
760 
761 	return (0);
762 }
763 
764 /*
765  * NLM_CANCEL, NLM_CANCEL_MSG,
766  * NLM4_CANCEL, NLM4_CANCEL_MSG,
767  * Client gives up waiting for a blocking lock.
768  */
769 void
770 nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *resp,
771     struct svc_req *sr, nlm_res_cb cb)
772 {
773 	struct nlm_globals *g;
774 	struct nlm_host *host;
775 	struct netbuf *addr;
776 	struct nlm_vhold *nvp = NULL;
777 	nlm_rpc_t *rpcp = NULL;
778 	char *netid;
779 	char *name;
780 	int error;
781 	struct flock64 fl;
782 
783 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
784 	netid = svc_getnetid(sr->rq_xprt);
785 	addr = svc_getrpccaller(sr->rq_xprt);
786 	name = argp->alock.caller_name;
787 
788 	g = zone_getspecific(nlm_zone_key, curzone);
789 	host = nlm_host_findcreate(g, name, netid, addr);
790 	if (host == NULL) {
791 		resp->stat.stat = nlm4_denied_nolocks;
792 		return;
793 	}
794 	if (cb != NULL) {
795 		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
796 		if (error != 0) {
797 			resp->stat.stat = nlm4_denied_nolocks;
798 			return;
799 		}
800 	}
801 
802 	DTRACE_PROBE3(start, struct nlm_globals *, g,
803 	    struct nlm_host *, host, nlm4_cancargs *, argp);
804 
805 	if (NLM_IN_GRACE(g)) {
806 		resp->stat.stat = nlm4_denied_grace_period;
807 		goto out;
808 	}
809 
810 	nvp = nlm_fh_to_vhold(host, &argp->alock.fh);
811 	if (nvp == NULL) {
812 		resp->stat.stat = nlm4_stale_fh;
813 		goto out;
814 	}
815 
816 	/* Convert to local form. */
817 	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers,
818 	    (argp->exclusive) ? F_WRLCK : F_RDLCK);
819 	if (error) {
820 		resp->stat.stat = nlm4_failed;
821 		goto out;
822 	}
823 
824 	error = nlm_slreq_unregister(host, nvp, &fl);
825 	if (error != 0) {
826 		/*
827 		 * There's no sleeping lock request corresponding
828 		 * to the lock. Then requested sleeping lock
829 		 * doesn't exist.
830 		 */
831 		resp->stat.stat = nlm4_denied;
832 		goto out;
833 	}
834 
835 	fl.l_type = F_UNLCK;
836 	error = nlm_vop_frlock(nvp->nv_vp, F_SETLK, &fl,
837 	    F_REMOTELOCK | FREAD | FWRITE,
838 	    (u_offset_t)0, NULL, CRED(), NULL);
839 
840 	resp->stat.stat = (error == 0) ?
841 	    nlm4_granted : nlm4_denied;
842 
843 out:
844 	/*
845 	 * If we have a callback function, use that to
846 	 * deliver the response via another RPC call.
847 	 */
848 	if (cb != NULL && rpcp != NULL)
849 		NLM_INVOKE_CALLBACK("cancel", rpcp, resp, cb);
850 
851 	DTRACE_PROBE3(cancel__end, struct nlm_globals *, g,
852 	    struct nlm_host *, host, nlm4_res *, resp);
853 
854 	if (rpcp != NULL)
855 		nlm_host_rele_rpc(host, rpcp);
856 
857 	nlm_vhold_release(host, nvp);
858 	nlm_host_release(g, host);
859 }
860 
861 /*
862  * NLM_UNLOCK, NLM_UNLOCK_MSG,
863  * NLM4_UNLOCK, NLM4_UNLOCK_MSG,
864  * Client removes one of their locks.
865  */
866 void
867 nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *resp,
868     struct svc_req *sr, nlm_res_cb cb)
869 {
870 	struct nlm_globals *g;
871 	struct nlm_host *host;
872 	struct netbuf *addr;
873 	nlm_rpc_t *rpcp = NULL;
874 	vnode_t *vp = NULL;
875 	char *netid;
876 	char *name;
877 	int error;
878 	struct flock64 fl;
879 
880 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
881 
882 	netid = svc_getnetid(sr->rq_xprt);
883 	addr = svc_getrpccaller(sr->rq_xprt);
884 	name = argp->alock.caller_name;
885 
886 	/*
887 	 * NLM_UNLOCK operation doesn't have an error code
888 	 * denoting that operation failed, so we always
889 	 * return nlm4_granted except when the server is
890 	 * in a grace period.
891 	 */
892 	resp->stat.stat = nlm4_granted;
893 
894 	g = zone_getspecific(nlm_zone_key, curzone);
895 	host = nlm_host_findcreate(g, name, netid, addr);
896 	if (host == NULL)
897 		return;
898 
899 	if (cb != NULL) {
900 		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
901 		if (error != 0)
902 			goto out;
903 	}
904 
905 	DTRACE_PROBE3(start, struct nlm_globals *, g,
906 	    struct nlm_host *, host, nlm4_unlockargs *, argp);
907 
908 	if (NLM_IN_GRACE(g)) {
909 		resp->stat.stat = nlm4_denied_grace_period;
910 		goto out;
911 	}
912 
913 	vp = nlm_fh_to_vp(&argp->alock.fh);
914 	if (vp == NULL)
915 		goto out;
916 
917 	/* Convert to local form. */
918 	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers, F_UNLCK);
919 	if (error)
920 		goto out;
921 
922 	/* BSD: VOP_ADVLOCK(nv->nv_vp, NULL, F_UNLCK, &fl, F_REMOTE); */
923 	error = nlm_vop_frlock(vp, F_SETLK, &fl,
924 	    F_REMOTELOCK | FREAD | FWRITE,
925 	    (u_offset_t)0, NULL, CRED(), NULL);
926 
927 	DTRACE_PROBE1(unlock__res, int, error);
928 out:
929 	/*
930 	 * If we have a callback function, use that to
931 	 * deliver the response via another RPC call.
932 	 */
933 	if (cb != NULL && rpcp != NULL)
934 		NLM_INVOKE_CALLBACK("unlock", rpcp, resp, cb);
935 
936 	DTRACE_PROBE3(unlock__end, struct nlm_globals *, g,
937 	    struct nlm_host *, host, nlm4_res *, resp);
938 
939 	if (vp != NULL)
940 		VN_RELE(vp);
941 	if (rpcp != NULL)
942 		nlm_host_rele_rpc(host, rpcp);
943 
944 	nlm_host_release(g, host);
945 }
946 
947 /*
948  * NLM_GRANTED, NLM_GRANTED_MSG,
949  * NLM4_GRANTED, NLM4_GRANTED_MSG,
950  *
951  * This service routine is special.  It's the only one that's
952  * really part of our NLM _client_ support, used by _servers_
953  * to "call back" when a blocking lock from this NLM client
954  * is granted by the server.  In this case, we _know_ there is
955  * already an nlm_host allocated and held by the client code.
956  * We want to find that nlm_host here.
957  *
958  * Over in nlm_call_lock(), the client encoded the sysid for this
959  * server in the "owner handle" netbuf sent with our lock request.
960  * We can now use that to find the nlm_host object we used there.
961  * (NB: The owner handle is opaque to the server.)
962  */
963 void
964 nlm_do_granted(nlm4_testargs *argp, nlm4_res *resp,
965     struct svc_req *sr, nlm_res_cb cb)
966 {
967 	struct nlm_globals *g;
968 	struct nlm_owner_handle *oh;
969 	struct nlm_host *host;
970 	nlm_rpc_t *rpcp = NULL;
971 	int error;
972 
973 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
974 	resp->stat.stat = nlm4_denied;
975 
976 	g = zone_getspecific(nlm_zone_key, curzone);
977 	oh = (void *) argp->alock.oh.n_bytes;
978 	if (oh == NULL)
979 		return;
980 
981 	host = nlm_host_find_by_sysid(g, oh->oh_sysid);
982 	if (host == NULL)
983 		return;
984 
985 	if (cb != NULL) {
986 		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
987 		if (error != 0)
988 			goto out;
989 	}
990 
991 	if (NLM_IN_GRACE(g)) {
992 		resp->stat.stat = nlm4_denied_grace_period;
993 		goto out;
994 	}
995 
996 	error = nlm_slock_grant(g, host, &argp->alock);
997 	if (error == 0)
998 		resp->stat.stat = nlm4_granted;
999 
1000 out:
1001 	/*
1002 	 * If we have a callback function, use that to
1003 	 * deliver the response via another RPC call.
1004 	 */
1005 	if (cb != NULL && rpcp != NULL)
1006 		NLM_INVOKE_CALLBACK("do_granted", rpcp, resp, cb);
1007 
1008 	if (rpcp != NULL)
1009 		nlm_host_rele_rpc(host, rpcp);
1010 
1011 	nlm_host_release(g, host);
1012 }
1013 
1014 /*
1015  * NLM_FREE_ALL, NLM4_FREE_ALL
1016  *
1017  * Destroy all lock state for the calling client.
1018  */
1019 void
1020 nlm_do_free_all(nlm4_notify *argp, void *res, struct svc_req *sr)
1021 {
1022 	struct nlm_globals *g;
1023 	struct nlm_host_list host_list;
1024 	struct nlm_host *hostp;
1025 
1026 	TAILQ_INIT(&host_list);
1027 	g = zone_getspecific(nlm_zone_key, curzone);
1028 
1029 	/* Serialize calls to clean locks. */
1030 	mutex_enter(&g->clean_lock);
1031 
1032 	/*
1033 	 * Find all hosts that have the given node name and put them on a
1034 	 * local list.
1035 	 */
1036 	mutex_enter(&g->lock);
1037 	for (hostp = avl_first(&g->nlm_hosts_tree); hostp != NULL;
1038 	    hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp)) {
1039 		if (strcasecmp(hostp->nh_name, argp->name) == 0) {
1040 			/*
1041 			 * If needed take the host out of the idle list since
1042 			 * we are taking a reference.
1043 			 */
1044 			if (hostp->nh_flags & NLM_NH_INIDLE) {
1045 				TAILQ_REMOVE(&g->nlm_idle_hosts, hostp,
1046 				    nh_link);
1047 				hostp->nh_flags &= ~NLM_NH_INIDLE;
1048 			}
1049 			hostp->nh_refs++;
1050 
1051 			TAILQ_INSERT_TAIL(&host_list, hostp, nh_link);
1052 		}
1053 	}
1054 	mutex_exit(&g->lock);
1055 
1056 	/* Free locks for all hosts on the local list. */
1057 	while (!TAILQ_EMPTY(&host_list)) {
1058 		hostp = TAILQ_FIRST(&host_list);
1059 		TAILQ_REMOVE(&host_list, hostp, nh_link);
1060 
1061 		/*
1062 		 * Note that this does not do client-side cleanup.
1063 		 * We want to do that ONLY if statd tells us the
1064 		 * server has restarted.
1065 		 */
1066 		nlm_host_notify_server(hostp, argp->state);
1067 		nlm_host_release(g, hostp);
1068 	}
1069 
1070 	mutex_exit(&g->clean_lock);
1071 
1072 	(void) res;
1073 	(void) sr;
1074 }
1075 
1076 static void
1077 nlm_init_shrlock(struct shrlock *shr,
1078     nlm4_share *nshare, struct nlm_host *host)
1079 {
1080 
1081 	switch (nshare->access) {
1082 	default:
1083 	case fsa_NONE:
1084 		shr->s_access = 0;
1085 		break;
1086 	case fsa_R:
1087 		shr->s_access = F_RDACC;
1088 		break;
1089 	case fsa_W:
1090 		shr->s_access = F_WRACC;
1091 		break;
1092 	case fsa_RW:
1093 		shr->s_access = F_RWACC;
1094 		break;
1095 	}
1096 
1097 	switch (nshare->mode) {
1098 	default:
1099 	case fsm_DN:
1100 		shr->s_deny = F_NODNY;
1101 		break;
1102 	case fsm_DR:
1103 		shr->s_deny = F_RDDNY;
1104 		break;
1105 	case fsm_DW:
1106 		shr->s_deny = F_WRDNY;
1107 		break;
1108 	case fsm_DRW:
1109 		shr->s_deny = F_RWDNY;
1110 		break;
1111 	}
1112 
1113 	shr->s_sysid = host->nh_sysid;
1114 	shr->s_pid = 0;
1115 	shr->s_own_len = nshare->oh.n_len;
1116 	shr->s_owner   = nshare->oh.n_bytes;
1117 }
1118 
1119 /*
1120  * NLM_SHARE, NLM4_SHARE
1121  *
1122  * Request a DOS-style share reservation
1123  */
1124 void
1125 nlm_do_share(nlm4_shareargs *argp, nlm4_shareres *resp, struct svc_req *sr)
1126 {
1127 	struct nlm_globals *g;
1128 	struct nlm_host *host;
1129 	struct netbuf *addr;
1130 	struct nlm_vhold *nvp = NULL;
1131 	char *netid;
1132 	char *name;
1133 	int error;
1134 	struct shrlock shr;
1135 
1136 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
1137 
1138 	name = argp->share.caller_name;
1139 	netid = svc_getnetid(sr->rq_xprt);
1140 	addr = svc_getrpccaller(sr->rq_xprt);
1141 
1142 	g = zone_getspecific(nlm_zone_key, curzone);
1143 	host = nlm_host_findcreate(g, name, netid, addr);
1144 	if (host == NULL) {
1145 		resp->stat = nlm4_denied_nolocks;
1146 		return;
1147 	}
1148 
1149 	DTRACE_PROBE3(share__start, struct nlm_globals *, g,
1150 	    struct nlm_host *, host, nlm4_shareargs *, argp);
1151 
1152 	if (argp->reclaim == 0 && NLM_IN_GRACE(g)) {
1153 		resp->stat = nlm4_denied_grace_period;
1154 		goto out;
1155 	}
1156 
1157 	/*
1158 	 * Get holded vnode when on lock operation.
1159 	 * Only lock() and share() need vhold objects.
1160 	 */
1161 	nvp = nlm_fh_to_vhold(host, &argp->share.fh);
1162 	if (nvp == NULL) {
1163 		resp->stat = nlm4_stale_fh;
1164 		goto out;
1165 	}
1166 
1167 	/* Convert to local form. */
1168 	nlm_init_shrlock(&shr, &argp->share, host);
1169 	error = VOP_SHRLOCK(nvp->nv_vp, F_SHARE, &shr,
1170 	    FREAD | FWRITE, CRED(), NULL);
1171 
1172 	if (error == 0) {
1173 		resp->stat = nlm4_granted;
1174 		nlm_host_monitor(g, host, 0);
1175 	} else {
1176 		resp->stat = nlm4_denied;
1177 	}
1178 
1179 out:
1180 	DTRACE_PROBE3(share__end, struct nlm_globals *, g,
1181 	    struct nlm_host *, host, nlm4_shareres *, resp);
1182 
1183 	nlm_vhold_release(host, nvp);
1184 	nlm_host_release(g, host);
1185 }
1186 
1187 /*
1188  * NLM_UNSHARE, NLM4_UNSHARE
1189  *
1190  * Release a DOS-style share reservation
1191  */
1192 void
1193 nlm_do_unshare(nlm4_shareargs *argp, nlm4_shareres *resp, struct svc_req *sr)
1194 {
1195 	struct nlm_globals *g;
1196 	struct nlm_host *host;
1197 	struct netbuf *addr;
1198 	vnode_t *vp = NULL;
1199 	char *netid;
1200 	int error;
1201 	struct shrlock shr;
1202 
1203 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
1204 
1205 	netid = svc_getnetid(sr->rq_xprt);
1206 	addr = svc_getrpccaller(sr->rq_xprt);
1207 
1208 	g = zone_getspecific(nlm_zone_key, curzone);
1209 	host = nlm_host_find(g, netid, addr);
1210 	if (host == NULL) {
1211 		resp->stat = nlm4_denied_nolocks;
1212 		return;
1213 	}
1214 
1215 	DTRACE_PROBE3(unshare__start, struct nlm_globals *, g,
1216 	    struct nlm_host *, host, nlm4_shareargs *, argp);
1217 
1218 	if (NLM_IN_GRACE(g)) {
1219 		resp->stat = nlm4_denied_grace_period;
1220 		goto out;
1221 	}
1222 
1223 	vp = nlm_fh_to_vp(&argp->share.fh);
1224 	if (vp == NULL) {
1225 		resp->stat = nlm4_stale_fh;
1226 		goto out;
1227 	}
1228 
1229 	/* Convert to local form. */
1230 	nlm_init_shrlock(&shr, &argp->share, host);
1231 	error = VOP_SHRLOCK(vp, F_UNSHARE, &shr,
1232 	    FREAD | FWRITE, CRED(), NULL);
1233 
1234 	(void) error;
1235 	resp->stat = nlm4_granted;
1236 
1237 out:
1238 	DTRACE_PROBE3(unshare__end, struct nlm_globals *, g,
1239 	    struct nlm_host *, host, nlm4_shareres *, resp);
1240 
1241 	if (vp != NULL)
1242 		VN_RELE(vp);
1243 
1244 	nlm_host_release(g, host);
1245 }
1246 
1247 /*
1248  * NLM wrapper to VOP_FRLOCK that checks the validity of the lock before
1249  * invoking the vnode operation.
1250  */
1251 static int
1252 nlm_vop_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
1253 	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
1254 {
1255 	if (bfp->l_len != 0 && bfp->l_start + (bfp->l_len - 1) < bfp->l_start) {
1256 		return (EOVERFLOW);
1257 	}
1258 
1259 	return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1260 }
1261