1 /*
2  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
3  * Authors: Doug Rabson <dfr@rabson.org>
4  * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
30  * Copyright (c) 2012 by Delphix. All rights reserved.
31  */
32 
33 /*
34  * NFS Lock Manager service functions (nlm_do_...)
35  * Called from nlm_rpc_svc.c wrappers.
36  *
37  * Source code derived from FreeBSD nlm_prot_impl.c
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/thread.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
45 #include <sys/mount.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/share.h>
49 #include <sys/socket.h>
50 #include <sys/syscall.h>
51 #include <sys/syslog.h>
52 #include <sys/systm.h>
53 #include <sys/taskq.h>
54 #include <sys/unistd.h>
55 #include <sys/vnode.h>
56 #include <sys/vfs.h>
57 #include <sys/queue.h>
58 #include <sys/sdt.h>
59 #include <netinet/in.h>
60 
61 #include <rpc/rpc.h>
62 #include <rpc/xdr.h>
63 #include <rpc/pmap_prot.h>
64 #include <rpc/pmap_clnt.h>
65 #include <rpc/rpcb_prot.h>
66 
67 #include <rpcsvc/nlm_prot.h>
68 #include <rpcsvc/sm_inter.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs_clnt.h>
72 #include <nfs/export.h>
73 #include <nfs/rnode.h>
74 
75 #include "nlm_impl.h"
76 
77 #define	NLM_IN_GRACE(g) (ddi_get_lbolt() < (g)->grace_threshold)
78 
/*
 * Context passed from nlm_block() to nlm_block_callback()
 * via flk_init_callback().
 */
struct nlm_block_cb_data {
	struct nlm_host		*hostp;	/* client host owning the lock */
	struct nlm_vhold	*nvp;	/* vnode hold for the locked file */
	struct flock64		*flp;	/* the sleeping lock request */
};
84 
85 /*
86  * Invoke an asyncronous RPC callbeck
87  * (used when NLM server needs to reply to MSG NLM procedure).
88  */
#define	NLM_INVOKE_CALLBACK(descr, rpcp, resp, callb)			\
	do {								\
		enum clnt_stat _stat;					\
									\
		/* Best-effort: RPC_TIMEDOUT is tolerated silently. */	\
		_stat = (*(callb))(resp, NULL, (rpcp)->nr_handle);	\
		if (_stat != RPC_SUCCESS && _stat != RPC_TIMEDOUT) {	\
			struct rpc_err _err;				\
									\
			CLNT_GETERR((rpcp)->nr_handle, &_err);		\
			NLM_ERR("NLM: %s callback failed: "		\
			    "stat %d, err %d\n", descr, _stat,		\
			    _err.re_errno);				\
		}							\
									\
	_NOTE(CONSTCOND) } while (0)
104 
105 static void nlm_block(
106 	nlm4_lockargs *lockargs,
107 	struct nlm_host *host,
108 	struct nlm_vhold *nvp,
109 	nlm_rpc_t *rpcp,
110 	struct flock64 *fl,
111 	nlm_testargs_cb grant_cb);
112 
113 static vnode_t *nlm_fh_to_vp(struct netobj *);
114 static struct nlm_vhold *nlm_fh_to_vhold(struct nlm_host *, struct netobj *);
115 static void nlm_init_shrlock(struct shrlock *, nlm4_share *, struct nlm_host *);
116 static callb_cpr_t *nlm_block_callback(flk_cb_when_t, void *);
117 static int nlm_vop_frlock(vnode_t *, int, flock64_t *, int, offset_t,
118     struct flk_callback *, cred_t *, caller_context_t *);
119 
120 /*
121  * Convert a lock from network to local form, and
122  * check for valid range (no overflow).
123  */
124 static int
125 nlm_init_flock(struct flock64 *fl, struct nlm4_lock *nl,
126 	struct nlm_host *host, rpcvers_t vers, short type)
127 {
128 	uint64_t off, len;
129 
130 	bzero(fl, sizeof (*fl));
131 	off = nl->l_offset;
132 	len = nl->l_len;
133 
134 	if (vers < NLM4_VERS) {
135 		if (off > MAX_UOFF32 || len > MAX_UOFF32)
136 			return (EINVAL);
137 		if (off + len > MAX_UOFF32 + 1)
138 			return (EINVAL);
139 	} else {
140 		/*
141 		 * Check range for 64-bit client (no overflow).
142 		 * Again allow len == ~0 to mean lock to EOF.
143 		 */
144 		if (len == MAX_U_OFFSET_T)
145 			len = 0;
146 		if (len != 0 && off + (len - 1) < off)
147 			return (EINVAL);
148 	}
149 
150 	fl->l_type = type;
151 	fl->l_whence = SEEK_SET;
152 	fl->l_start = off;
153 	fl->l_len = len;
154 	fl->l_sysid = host->nh_sysid;
155 	fl->l_pid = nl->svid;
156 	/* l_pad */
157 
158 	return (0);
159 }
160 
161 /*
162  * Gets vnode from client's filehandle
163  * NOTE: Holds vnode, it _must_ be explicitly
164  * released by VN_RELE().
165  */
166 static vnode_t *
167 nlm_fh_to_vp(struct netobj *fh)
168 {
169 	fhandle_t *fhp;
170 
171 	/*
172 	 * Get a vnode pointer for the given NFS file handle.
173 	 * Note that it could be an NFSv2 for NFSv3 handle,
174 	 * which means the size might vary.  (don't copy)
175 	 */
176 	if (fh->n_len < sizeof (*fhp))
177 		return (NULL);
178 
179 	/* We know this is aligned (kmem_alloc) */
180 	/* LINTED E_BAD_PTR_CAST_ALIGN */
181 	fhp = (fhandle_t *)fh->n_bytes;
182 	return (lm_fhtovp(fhp));
183 }
184 
185 /*
186  * Get vhold from client's filehandle, but in contrast to
187  * The function tries to check some access rights as well.
188  *
189  * NOTE: vhold object _must_ be explicitly released by
190  * nlm_vhold_release().
191  */
192 static struct nlm_vhold *
193 nlm_fh_to_vhold(struct nlm_host *hostp, struct netobj *fh)
194 {
195 	vnode_t *vp;
196 	struct nlm_vhold *nvp;
197 
198 	vp = nlm_fh_to_vp(fh);
199 	if (vp == NULL)
200 		return (NULL);
201 
202 
203 	nvp = nlm_vhold_get(hostp, vp);
204 
205 	/*
206 	 * Both nlm_fh_to_vp() and nlm_vhold_get()
207 	 * do VN_HOLD(), so we need to drop one
208 	 * reference on vnode.
209 	 */
210 	VN_RELE(vp);
211 	return (nvp);
212 }
213 
214 /* ******************************************************************* */
215 
216 /*
217  * NLM implementation details, called from the RPC svc code.
218  */
219 
220 /*
221  * Call-back from NFS statd, used to notify that one of our
222  * hosts had a status change. The host can be either an
223  * NFS client, NFS server or both.
224  * According to NSM protocol description, the state is a
225  * number that is increases monotonically each time the
226  * state of host changes. An even number indicates that
227  * the host is down, while an odd number indicates that
228  * the host is up.
229  *
230  * Here we ignore this even/odd difference of status number
231  * reported by the NSM, we launch notification handlers
232  * every time the state is changed. The reason we why do so
233  * is that client and server can talk to each other using
234  * connectionless transport and it's easy to lose packet
235  * containing NSM notification with status number update.
236  *
237  * In nlm_host_monitor(), we put the sysid in the private data
238  * that statd carries in this callback, so we can easliy find
239  * the host this call applies to.
240  */
241 /* ARGSUSED */
242 void
243 nlm_do_notify1(nlm_sm_status *argp, void *res, struct svc_req *sr)
244 {
245 	struct nlm_globals *g;
246 	struct nlm_host *host;
247 	uint16_t sysid;
248 
249 	g = zone_getspecific(nlm_zone_key, curzone);
250 	bcopy(&argp->priv, &sysid, sizeof (sysid));
251 
252 	DTRACE_PROBE2(nsm__notify, uint16_t, sysid,
253 	    int, argp->state);
254 
255 	host = nlm_host_find_by_sysid(g, (sysid_t)sysid);
256 	if (host == NULL)
257 		return;
258 
259 	nlm_host_notify_server(host, argp->state);
260 	nlm_host_notify_client(host, argp->state);
261 	nlm_host_release(g, host);
262 }
263 
264 /*
265  * Another available call-back for NFS statd.
266  * Not currently used.
267  */
/* ARGSUSED */
void
nlm_do_notify2(nlm_sm_status *argp, void *res, struct svc_req *sr)
{
	/* This entry point is not currently used; it must never fire. */
	ASSERT(0);
}
274 
275 
276 /*
277  * NLM_TEST, NLM_TEST_MSG,
278  * NLM4_TEST, NLM4_TEST_MSG,
279  * Client inquiry about locks, non-blocking.
280  */
void
nlm_do_test(nlm4_testargs *argp, nlm4_testres *resp,
    struct svc_req *sr, nlm_testres_cb cb)
{
	struct nlm_globals *g;
	struct nlm_host *host;
	struct nlm4_holder *lh;
	struct nlm_owner_handle *oh;
	nlm_rpc_t *rpcp = NULL;
	vnode_t *vp = NULL;
	struct netbuf *addr;
	char *netid;
	char *name;
	int error;
	struct flock64 fl;

	nlm_copy_netobj(&resp->cookie, &argp->cookie);

	name = argp->alock.caller_name;
	netid = svc_getnetid(sr->rq_xprt);
	addr = svc_getrpccaller(sr->rq_xprt);

	g = zone_getspecific(nlm_zone_key, curzone);
	host = nlm_host_findcreate(g, name, netid, addr);
	if (host == NULL) {
		resp->stat.stat = nlm4_denied_nolocks;
		return;
	}
	/*
	 * For _MSG variants (cb != NULL) we need an RPC handle
	 * now, so we can deliver the result via a callback later.
	 */
	if (cb != NULL) {
		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
		if (error != 0) {
			resp->stat.stat = nlm4_denied_nolocks;
			goto out;
		}
	}

	vp = nlm_fh_to_vp(&argp->alock.fh);
	if (vp == NULL) {
		resp->stat.stat = nlm4_stale_fh;
		goto out;
	}

	if (NLM_IN_GRACE(g)) {
		resp->stat.stat = nlm4_denied_grace_period;
		goto out;
	}

	/* Convert to local form. */
	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers,
	    (argp->exclusive) ? F_WRLCK : F_RDLCK);
	if (error) {
		resp->stat.stat = nlm4_failed;
		goto out;
	}

	/* BSD: VOP_ADVLOCK(nv->nv_vp, NULL, F_GETLK, &fl, F_REMOTE); */
	error = nlm_vop_frlock(vp, F_GETLK, &fl,
	    F_REMOTELOCK | FREAD | FWRITE,
	    (u_offset_t)0, NULL, CRED(), NULL);
	if (error) {
		resp->stat.stat = nlm4_failed;
		goto out;
	}

	/* F_GETLK sets l_type to F_UNLCK when no conflict exists. */
	if (fl.l_type == F_UNLCK) {
		resp->stat.stat = nlm4_granted;
		goto out;
	}
	resp->stat.stat = nlm4_denied;

	/*
	 * This lock "test" fails due to a conflicting lock.
	 *
	 * If this is a v1 client, make sure the conflicting
	 * lock range we report can be expressed with 32-bit
	 * offsets.  The lock range requested was expressed
	 * as 32-bit offset and length, so at least part of
	 * the conflicting lock should lie below MAX_UOFF32.
	 * If the conflicting lock extends past that, we'll
	 * trim the range to end at MAX_UOFF32 so this lock
	 * can be represented in a 32-bit response.  Check
	 * the start also (paranoid, but a low cost check).
	 */
	if (sr->rq_vers < NLM4_VERS) {
		uint64 maxlen;
		if (fl.l_start > MAX_UOFF32)
			fl.l_start = MAX_UOFF32;
		maxlen = MAX_UOFF32 + 1 - fl.l_start;
		if (fl.l_len > maxlen)
			fl.l_len = maxlen;
	}

	/*
	 * Build the nlm4_holder result structure.
	 *
	 * Note that lh->oh is freed via xdr_free,
	 * xdr_nlm4_holder, xdr_netobj, xdr_bytes.
	 */
	oh = kmem_zalloc(sizeof (*oh), KM_SLEEP);
	oh->oh_sysid = (sysid_t)fl.l_sysid;
	lh = &resp->stat.nlm4_testrply_u.holder;
	lh->exclusive = (fl.l_type == F_WRLCK);
	lh->svid = fl.l_pid;
	lh->oh.n_len = sizeof (*oh);
	lh->oh.n_bytes = (void *)oh;
	lh->l_offset = fl.l_start;
	lh->l_len = fl.l_len;

out:
	/*
	 * If we have a callback function, use that to
	 * deliver the response via another RPC call.
	 */
	if (cb != NULL && rpcp != NULL)
		NLM_INVOKE_CALLBACK("test", rpcp, resp, cb);

	if (vp != NULL)
		VN_RELE(vp);
	if (rpcp != NULL)
		nlm_host_rele_rpc(host, rpcp);

	nlm_host_release(g, host);
}
404 
405 /*
406  * NLM_LOCK, NLM_LOCK_MSG, NLM_NM_LOCK
407  * NLM4_LOCK, NLM4_LOCK_MSG, NLM4_NM_LOCK
408  *
409  * Client request to set a lock, possibly blocking.
410  *
411  * If the lock needs to block, we return status blocked to
412  * this RPC call, and then later call back the client with
413  * a "granted" callback.  Tricky aspects of this include:
414  * sending a reply before this function returns, and then
415  * borrowing this thread from the RPC service pool for the
416  * wait on the lock and doing the later granted callback.
417  *
418  * We also have to keep a list of locks (pending + granted)
419  * both to handle retransmitted requests, and to keep the
420  * vnodes for those locks active.
421  */
void
nlm_do_lock(nlm4_lockargs *argp, nlm4_res *resp, struct svc_req *sr,
    nlm_reply_cb reply_cb, nlm_res_cb res_cb, nlm_testargs_cb grant_cb)
{
	struct nlm_globals *g;
	struct flock64 fl;
	struct nlm_host *host = NULL;
	struct netbuf *addr;
	struct nlm_vhold *nvp = NULL;
	nlm_rpc_t *rpcp = NULL;
	char *netid;
	char *name;
	int error, flags;
	bool_t do_blocking = FALSE;
	bool_t do_mon_req = FALSE;
	enum nlm4_stats status;

	nlm_copy_netobj(&resp->cookie, &argp->cookie);

	name = argp->alock.caller_name;
	netid = svc_getnetid(sr->rq_xprt);
	addr = svc_getrpccaller(sr->rq_xprt);

	g = zone_getspecific(nlm_zone_key, curzone);
	host = nlm_host_findcreate(g, name, netid, addr);
	if (host == NULL) {
		DTRACE_PROBE4(no__host, struct nlm_globals *, g,
		    char *, name, char *, netid, struct netbuf *, addr);
		status = nlm4_denied_nolocks;
		goto doreply;
	}

	DTRACE_PROBE3(start, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_lockargs *, argp);

	/*
	 * If we may need to do _msg_ call needing an RPC
	 * callback, get the RPC client handle now,
	 * so we know if we can bind to the NLM service on
	 * this client.
	 *
	 * Note: host object carries transport type.
	 * One client using multiple transports gets
	 * separate sysids for each of its transports.
	 */
	if (res_cb != NULL || (grant_cb != NULL && argp->block == TRUE)) {
		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
		if (error != 0) {
			status = nlm4_denied_nolocks;
			goto doreply;
		}
	}

	/*
	 * During the "grace period", only allow reclaim.
	 */
	if (argp->reclaim == 0 && NLM_IN_GRACE(g)) {
		status = nlm4_denied_grace_period;
		goto doreply;
	}

	/*
	 * Check whether we missed host shutdown event
	 */
	if (nlm_host_get_state(host) != argp->state)
		nlm_host_notify_server(host, argp->state);

	/*
	 * Get a hold on the vnode for a lock operation.
	 * Only lock() and share() need vhold objects.
	 */
	nvp = nlm_fh_to_vhold(host, &argp->alock.fh);
	if (nvp == NULL) {
		status = nlm4_stale_fh;
		goto doreply;
	}

	/* Convert to local form. */
	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers,
	    (argp->exclusive) ? F_WRLCK : F_RDLCK);
	if (error) {
		status = nlm4_failed;
		goto doreply;
	}

	/*
	 * Try to lock non-blocking first.  If we succeed
	 * getting the lock, we can reply with the granted
	 * status directly and avoid the complications of
	 * making the "granted" RPC callback later.
	 *
	 * This also lets us find out now about some
	 * possible errors like EROFS, etc.
	 */
	flags = F_REMOTELOCK | FREAD | FWRITE;
	error = nlm_vop_frlock(nvp->nv_vp, F_SETLK, &fl, flags,
	    (u_offset_t)0, NULL, CRED(), NULL);

	DTRACE_PROBE3(setlk__res, struct flock64 *, &fl,
	    int, flags, int, error);

	/* Map the frlock result onto an NLM status code. */
	switch (error) {
	case 0:
		/* Got it without waiting! */
		status = nlm4_granted;
		do_mon_req = TRUE;
		break;

	/* EINPROGRESS too? */
	case EAGAIN:
		/* We did not get the lock. Should we block? */
		if (argp->block == FALSE || grant_cb == NULL) {
			status = nlm4_denied;
			break;
		}
		/*
		 * Should block.  Try to reserve this thread
		 * so we can use it to wait for the lock and
		 * later send the granted message.  If this
		 * reservation fails, say "no resources".
		 */
		if (!svc_reserve_thread(sr->rq_xprt)) {
			status = nlm4_denied_nolocks;
			break;
		}
		/*
		 * OK, can detach this thread, so this call
		 * will block below (after we reply).
		 */
		status = nlm4_blocked;
		do_blocking = TRUE;
		do_mon_req = TRUE;
		break;

	case ENOLCK:
		/* Failed for lack of resources. */
		status = nlm4_denied_nolocks;
		break;

	case EROFS:
		/* read-only file system */
		status = nlm4_rofs;
		break;

	case EFBIG:
		/* file too big */
		status = nlm4_fbig;
		break;

	case EDEADLK:
		/* dead lock condition */
		status = nlm4_deadlck;
		break;

	default:
		status = nlm4_denied;
		break;
	}

doreply:
	resp->stat.stat = status;

	/*
	 * We get one of two function pointers; one for a
	 * normal RPC reply, and another for doing an RPC
	 * "callback" _res reply for a _msg function.
	 * Use either of those to send the reply now.
	 *
	 * If sending this reply fails, just leave the
	 * lock in the list for retransmitted requests.
	 * Cleanup is via unlock or host rele (statmon).
	 */
	if (reply_cb != NULL) {
		/* i.e. nlm_lock_1_reply */
		if (!(*reply_cb)(sr->rq_xprt, resp))
			svcerr_systemerr(sr->rq_xprt);
	}
	if (res_cb != NULL && rpcp != NULL)
		NLM_INVOKE_CALLBACK("lock", rpcp, resp, res_cb);

	/*
	 * The reply has been sent to the client.
	 * Start monitoring this client (maybe).
	 *
	 * Note that the non-monitored (NM) calls pass grant_cb=NULL
	 * indicating that the client doesn't support RPC callbacks.
	 * No monitoring for these (lame) clients.
	 */
	if (do_mon_req && grant_cb != NULL)
		nlm_host_monitor(g, host, argp->state);

	if (do_blocking) {
		/*
		 * We need to block on this lock, and when that
		 * completes, do the granted RPC call. Note that
		 * we "reserved" this thread above, so we can now
		 * "detach" it from the RPC SVC pool, allowing it
		 * to block indefinitely if needed.
		 */
		ASSERT(rpcp != NULL);
		(void) svc_detach_thread(sr->rq_xprt);
		nlm_block(argp, host, nvp, rpcp, &fl, grant_cb);
	}

	DTRACE_PROBE3(lock__end, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_res *, resp);

	if (rpcp != NULL)
		nlm_host_rele_rpc(host, rpcp);

	/* nlm_vhold_release() tolerates a NULL nvp on early-error paths. */
	nlm_vhold_release(host, nvp);
	nlm_host_release(g, host);
}
635 
636 /*
637  * Helper for nlm_do_lock(), partly for observability,
638  * (we'll see a call blocked in this function) and
639  * because nlm_do_lock() was getting quite long.
640  */
static void
nlm_block(nlm4_lockargs *lockargs,
    struct nlm_host *host,
    struct nlm_vhold *nvp,
    nlm_rpc_t *rpcp,
    struct flock64 *flp,
    nlm_testargs_cb grant_cb)
{
	nlm4_testargs args;
	int error;
	flk_callback_t flk_cb;
	struct nlm_block_cb_data cb_data;

	/*
	 * Keep a list of blocked locks on nh_pending, and use it
	 * to cancel these threads in nlm_destroy_client_pending.
	 *
	 * Check to see if this lock is already in the list
	 * and if not, add an entry for it.  Allocate first,
	 * then if we don't insert, free the new one.
	 * Caller already has vp held.
	 */

	error = nlm_slreq_register(host, nvp, flp);
	if (error != 0) {
		/*
		 * Sleeping lock request with given fl is already
		 * registered by someone else. This means that
		 * some other thread is handling the request, let
		 * him to do its work.
		 */
		ASSERT(error == EEXIST);
		return;
	}

	/* Arrange for nlm_block_callback() to run around the sleep. */
	cb_data.hostp = host;
	cb_data.nvp = nvp;
	cb_data.flp = flp;
	flk_init_callback(&flk_cb, nlm_block_callback, &cb_data);

	/* BSD: VOP_ADVLOCK(vp, NULL, F_SETLK, fl, F_REMOTE); */
	error = nlm_vop_frlock(nvp->nv_vp, F_SETLKW, flp,
	    F_REMOTELOCK | FREAD | FWRITE,
	    (u_offset_t)0, &flk_cb, CRED(), NULL);

	if (error != 0) {
		/*
		 * We failed getting the lock, but have no way to
		 * tell the client about that.  Let 'em time out.
		 */
		(void) nlm_slreq_unregister(host, nvp, flp);
		return;
	}

	/*
	 * Do the "granted" call-back to the client.
	 * NOTE: these are shallow struct copies; the netobj
	 * data still belongs to lockargs.
	 */
	args.cookie	= lockargs->cookie;
	args.exclusive	= lockargs->exclusive;
	args.alock	= lockargs->alock;

	NLM_INVOKE_CALLBACK("grant", rpcp, &args, grant_cb);
}
704 
705 /*
706  * The function that is used as flk callback when NLM server
707  * sets new sleeping lock. The function unregisters NLM
708  * sleeping lock request (nlm_slreq) associated with the
709  * sleeping lock _before_ lock becomes active. It prevents
710  * potential race condition between nlm_block() and
711  * nlm_do_cancel().
712  */
713 static callb_cpr_t *
714 nlm_block_callback(flk_cb_when_t when, void *data)
715 {
716 	struct nlm_block_cb_data *cb_data;
717 
718 	cb_data = (struct nlm_block_cb_data *)data;
719 	if (when == FLK_AFTER_SLEEP) {
720 		(void) nlm_slreq_unregister(cb_data->hostp,
721 		    cb_data->nvp, cb_data->flp);
722 	}
723 
724 	return (0);
725 }
726 
727 /*
728  * NLM_CANCEL, NLM_CANCEL_MSG,
729  * NLM4_CANCEL, NLM4_CANCEL_MSG,
730  * Client gives up waiting for a blocking lock.
731  */
732 void
733 nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *resp,
734     struct svc_req *sr, nlm_res_cb cb)
735 {
736 	struct nlm_globals *g;
737 	struct nlm_host *host;
738 	struct netbuf *addr;
739 	struct nlm_vhold *nvp = NULL;
740 	nlm_rpc_t *rpcp = NULL;
741 	char *netid;
742 	char *name;
743 	int error;
744 	struct flock64 fl;
745 
746 	nlm_copy_netobj(&resp->cookie, &argp->cookie);
747 	netid = svc_getnetid(sr->rq_xprt);
748 	addr = svc_getrpccaller(sr->rq_xprt);
749 	name = argp->alock.caller_name;
750 
751 	g = zone_getspecific(nlm_zone_key, curzone);
752 	host = nlm_host_findcreate(g, name, netid, addr);
753 	if (host == NULL) {
754 		resp->stat.stat = nlm4_denied_nolocks;
755 		return;
756 	}
757 	if (cb != NULL) {
758 		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
759 		if (error != 0) {
760 			resp->stat.stat = nlm4_denied_nolocks;
761 			return;
762 		}
763 	}
764 
765 	DTRACE_PROBE3(start, struct nlm_globals *, g,
766 	    struct nlm_host *, host, nlm4_cancargs *, argp);
767 
768 	if (NLM_IN_GRACE(g)) {
769 		resp->stat.stat = nlm4_denied_grace_period;
770 		goto out;
771 	}
772 
773 	nvp = nlm_fh_to_vhold(host, &argp->alock.fh);
774 	if (nvp == NULL) {
775 		resp->stat.stat = nlm4_stale_fh;
776 		goto out;
777 	}
778 
779 	/* Convert to local form. */
780 	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers,
781 	    (argp->exclusive) ? F_WRLCK : F_RDLCK);
782 	if (error) {
783 		resp->stat.stat = nlm4_failed;
784 		goto out;
785 	}
786 
787 	error = nlm_slreq_unregister(host, nvp, &fl);
788 	if (error != 0) {
789 		/*
790 		 * There's no sleeping lock request corresponding
791 		 * to the lock. Then requested sleeping lock
792 		 * doesn't exist.
793 		 */
794 		resp->stat.stat = nlm4_denied;
795 		goto out;
796 	}
797 
798 	fl.l_type = F_UNLCK;
799 	error = nlm_vop_frlock(nvp->nv_vp, F_SETLK, &fl,
800 	    F_REMOTELOCK | FREAD | FWRITE,
801 	    (u_offset_t)0, NULL, CRED(), NULL);
802 
803 	resp->stat.stat = (error == 0) ?
804 	    nlm4_granted : nlm4_denied;
805 
806 out:
807 	/*
808 	 * If we have a callback funtion, use that to
809 	 * deliver the response via another RPC call.
810 	 */
811 	if (cb != NULL && rpcp != NULL)
812 		NLM_INVOKE_CALLBACK("cancel", rpcp, resp, cb);
813 
814 	DTRACE_PROBE3(cancel__end, struct nlm_globals *, g,
815 	    struct nlm_host *, host, nlm4_res *, resp);
816 
817 	if (rpcp != NULL)
818 		nlm_host_rele_rpc(host, rpcp);
819 
820 	nlm_vhold_release(host, nvp);
821 	nlm_host_release(g, host);
822 }
823 
824 /*
825  * NLM_UNLOCK, NLM_UNLOCK_MSG,
826  * NLM4_UNLOCK, NLM4_UNLOCK_MSG,
827  * Client removes one of their locks.
828  */
void
nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *resp,
    struct svc_req *sr, nlm_res_cb cb)
{
	struct nlm_globals *g;
	struct nlm_host *host;
	struct netbuf *addr;
	nlm_rpc_t *rpcp = NULL;
	vnode_t *vp = NULL;
	char *netid;
	char *name;
	int error;
	struct flock64 fl;

	nlm_copy_netobj(&resp->cookie, &argp->cookie);

	netid = svc_getnetid(sr->rq_xprt);
	addr = svc_getrpccaller(sr->rq_xprt);
	name = argp->alock.caller_name;

	/*
	 * NLM_UNLOCK operation doesn't have an error code
	 * denoting that operation failed, so we always
	 * return nlm4_granted except when the server is
	 * in a grace period.
	 */
	resp->stat.stat = nlm4_granted;

	g = zone_getspecific(nlm_zone_key, curzone);
	host = nlm_host_findcreate(g, name, netid, addr);
	if (host == NULL)
		return;

	/* For _MSG variants, get the RPC handle for the reply callback. */
	if (cb != NULL) {
		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
		if (error != 0)
			goto out;
	}

	DTRACE_PROBE3(start, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_unlockargs *, argp);

	if (NLM_IN_GRACE(g)) {
		resp->stat.stat = nlm4_denied_grace_period;
		goto out;
	}

	vp = nlm_fh_to_vp(&argp->alock.fh);
	if (vp == NULL)
		goto out;

	/* Convert to local form. */
	error = nlm_init_flock(&fl, &argp->alock, host, sr->rq_vers, F_UNLCK);
	if (error)
		goto out;

	/* BSD: VOP_ADVLOCK(nv->nv_vp, NULL, F_UNLCK, &fl, F_REMOTE); */
	error = nlm_vop_frlock(vp, F_SETLK, &fl,
	    F_REMOTELOCK | FREAD | FWRITE,
	    (u_offset_t)0, NULL, CRED(), NULL);

	DTRACE_PROBE1(unlock__res, int, error);
out:
	/*
	 * If we have a callback function, use that to
	 * deliver the response via another RPC call.
	 */
	if (cb != NULL && rpcp != NULL)
		NLM_INVOKE_CALLBACK("unlock", rpcp, resp, cb);

	DTRACE_PROBE3(unlock__end, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_res *, resp);

	if (vp != NULL)
		VN_RELE(vp);
	if (rpcp != NULL)
		nlm_host_rele_rpc(host, rpcp);

	nlm_host_release(g, host);
}
909 
910 /*
911  * NLM_GRANTED, NLM_GRANTED_MSG,
912  * NLM4_GRANTED, NLM4_GRANTED_MSG,
913  *
914  * This service routine is special.  It's the only one that's
915  * really part of our NLM _client_ support, used by _servers_
916  * to "call back" when a blocking lock from this NLM client
917  * is granted by the server.  In this case, we _know_ there is
918  * already an nlm_host allocated and held by the client code.
919  * We want to find that nlm_host here.
920  *
921  * Over in nlm_call_lock(), the client encoded the sysid for this
922  * server in the "owner handle" netbuf sent with our lock request.
923  * We can now use that to find the nlm_host object we used there.
924  * (NB: The owner handle is opaque to the server.)
925  */
void
nlm_do_granted(nlm4_testargs *argp, nlm4_res *resp,
    struct svc_req *sr, nlm_res_cb cb)
{
	struct nlm_globals *g;
	struct nlm_owner_handle *oh;
	struct nlm_host *host;
	nlm_rpc_t *rpcp = NULL;
	int error;

	nlm_copy_netobj(&resp->cookie, &argp->cookie);
	resp->stat.stat = nlm4_denied;

	g = zone_getspecific(nlm_zone_key, curzone);
	/* The owner handle carries the sysid we stored in nlm_call_lock(). */
	oh = (void *) argp->alock.oh.n_bytes;
	if (oh == NULL)
		return;

	host = nlm_host_find_by_sysid(g, oh->oh_sysid);
	if (host == NULL)
		return;

	/* For _MSG variants, get the RPC handle for the reply callback. */
	if (cb != NULL) {
		error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp);
		if (error != 0)
			goto out;
	}

	if (NLM_IN_GRACE(g)) {
		resp->stat.stat = nlm4_denied_grace_period;
		goto out;
	}

	/* Wake the local waiter sleeping on this lock, if any. */
	error = nlm_slock_grant(g, host, &argp->alock);
	if (error == 0)
		resp->stat.stat = nlm4_granted;

out:
	/*
	 * If we have a callback function, use that to
	 * deliver the response via another RPC call.
	 */
	if (cb != NULL && rpcp != NULL)
		NLM_INVOKE_CALLBACK("do_granted", rpcp, resp, cb);

	if (rpcp != NULL)
		nlm_host_rele_rpc(host, rpcp);

	nlm_host_release(g, host);
}
976 
977 /*
978  * NLM_FREE_ALL, NLM4_FREE_ALL
979  *
980  * Destroy all lock state for the calling client.
981  */
void
nlm_do_free_all(nlm4_notify *argp, void *res, struct svc_req *sr)
{
	struct nlm_globals *g;
	struct nlm_host_list host_list;
	struct nlm_host *hostp;

	TAILQ_INIT(&host_list);
	g = zone_getspecific(nlm_zone_key, curzone);

	/* Serialize calls to clean locks. */
	mutex_enter(&g->clean_lock);

	/*
	 * Find all hosts that have the given node name and put them on a
	 * local list.
	 */
	mutex_enter(&g->lock);
	for (hostp = avl_first(&g->nlm_hosts_tree); hostp != NULL;
	    hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp)) {
		if (strcasecmp(hostp->nh_name, argp->name) == 0) {
			/*
			 * If needed take the host out of the idle list since
			 * we are taking a reference.
			 */
			if (hostp->nh_flags & NLM_NH_INIDLE) {
				TAILQ_REMOVE(&g->nlm_idle_hosts, hostp,
				    nh_link);
				hostp->nh_flags &= ~NLM_NH_INIDLE;
			}
			hostp->nh_refs++;

			TAILQ_INSERT_TAIL(&host_list, hostp, nh_link);
		}
	}
	mutex_exit(&g->lock);

	/* Free locks for all hosts on the local list. */
	while (!TAILQ_EMPTY(&host_list)) {
		hostp = TAILQ_FIRST(&host_list);
		TAILQ_REMOVE(&host_list, hostp, nh_link);

		/*
		 * Note that this does not do client-side cleanup.
		 * We want to do that ONLY if statd tells us the
		 * server has restarted.
		 */
		nlm_host_notify_server(hostp, argp->state);
		nlm_host_release(g, hostp);
	}

	mutex_exit(&g->clean_lock);

	/* Unused by this one-way procedure. */
	(void) res;
	(void) sr;
}
1038 
1039 static void
1040 nlm_init_shrlock(struct shrlock *shr,
1041     nlm4_share *nshare, struct nlm_host *host)
1042 {
1043 
1044 	switch (nshare->access) {
1045 	default:
1046 	case fsa_NONE:
1047 		shr->s_access = 0;
1048 		break;
1049 	case fsa_R:
1050 		shr->s_access = F_RDACC;
1051 		break;
1052 	case fsa_W:
1053 		shr->s_access = F_WRACC;
1054 		break;
1055 	case fsa_RW:
1056 		shr->s_access = F_RWACC;
1057 		break;
1058 	}
1059 
1060 	switch (nshare->mode) {
1061 	default:
1062 	case fsm_DN:
1063 		shr->s_deny = F_NODNY;
1064 		break;
1065 	case fsm_DR:
1066 		shr->s_deny = F_RDDNY;
1067 		break;
1068 	case fsm_DW:
1069 		shr->s_deny = F_WRDNY;
1070 		break;
1071 	case fsm_DRW:
1072 		shr->s_deny = F_RWDNY;
1073 		break;
1074 	}
1075 
1076 	shr->s_sysid = host->nh_sysid;
1077 	shr->s_pid = 0;
1078 	shr->s_own_len = nshare->oh.n_len;
1079 	shr->s_owner   = nshare->oh.n_bytes;
1080 }
1081 
1082 /*
1083  * NLM_SHARE, NLM4_SHARE
1084  *
1085  * Request a DOS-style share reservation
1086  */
void
nlm_do_share(nlm4_shareargs *argp, nlm4_shareres *resp, struct svc_req *sr)
{
	struct nlm_globals *g;
	struct nlm_host *host;
	struct netbuf *addr;
	struct nlm_vhold *nvp = NULL;
	char *netid;
	char *name;
	int error;
	struct shrlock shr;

	nlm_copy_netobj(&resp->cookie, &argp->cookie);

	name = argp->share.caller_name;
	netid = svc_getnetid(sr->rq_xprt);
	addr = svc_getrpccaller(sr->rq_xprt);

	g = zone_getspecific(nlm_zone_key, curzone);
	host = nlm_host_findcreate(g, name, netid, addr);
	if (host == NULL) {
		resp->stat = nlm4_denied_nolocks;
		return;
	}

	DTRACE_PROBE3(share__start, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_shareargs *, argp);

	/* During the grace period, only allow reclaim. */
	if (argp->reclaim == 0 && NLM_IN_GRACE(g)) {
		resp->stat = nlm4_denied_grace_period;
		goto out;
	}

	/*
	 * Get a hold on the vnode for this share operation.
	 * Only lock() and share() need vhold objects.
	 */
	nvp = nlm_fh_to_vhold(host, &argp->share.fh);
	if (nvp == NULL) {
		resp->stat = nlm4_stale_fh;
		goto out;
	}

	/* Convert to local form. */
	nlm_init_shrlock(&shr, &argp->share, host);
	error = VOP_SHRLOCK(nvp->nv_vp, F_SHARE, &shr,
	    FREAD | FWRITE, CRED(), NULL);

	if (error == 0) {
		resp->stat = nlm4_granted;
		nlm_host_monitor(g, host, 0);
	} else {
		resp->stat = nlm4_denied;
	}

out:
	DTRACE_PROBE3(share__end, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_shareres *, resp);

	nlm_vhold_release(host, nvp);
	nlm_host_release(g, host);
}
1149 
1150 /*
1151  * NLM_UNSHARE, NLM4_UNSHARE
1152  *
1153  * Release a DOS-style share reservation
1154  */
void
nlm_do_unshare(nlm4_shareargs *argp, nlm4_shareres *resp, struct svc_req *sr)
{
	struct nlm_globals *g;
	struct nlm_host *host;
	struct netbuf *addr;
	vnode_t *vp = NULL;
	char *netid;
	int error;
	struct shrlock shr;

	nlm_copy_netobj(&resp->cookie, &argp->cookie);

	netid = svc_getnetid(sr->rq_xprt);
	addr = svc_getrpccaller(sr->rq_xprt);

	/* Look up (but do not create) the host for this transport address. */
	g = zone_getspecific(nlm_zone_key, curzone);
	host = nlm_host_find(g, netid, addr);
	if (host == NULL) {
		resp->stat = nlm4_denied_nolocks;
		return;
	}

	DTRACE_PROBE3(unshare__start, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_shareargs *, argp);

	if (NLM_IN_GRACE(g)) {
		resp->stat = nlm4_denied_grace_period;
		goto out;
	}

	vp = nlm_fh_to_vp(&argp->share.fh);
	if (vp == NULL) {
		resp->stat = nlm4_stale_fh;
		goto out;
	}

	/* Convert to local form. */
	nlm_init_shrlock(&shr, &argp->share, host);
	error = VOP_SHRLOCK(vp, F_UNSHARE, &shr,
	    FREAD | FWRITE, CRED(), NULL);

	/* Errors are deliberately ignored; unshare always reports granted. */
	(void) error;
	resp->stat = nlm4_granted;

out:
	DTRACE_PROBE3(unshare__end, struct nlm_globals *, g,
	    struct nlm_host *, host, nlm4_shareres *, resp);

	if (vp != NULL)
		VN_RELE(vp);

	nlm_host_release(g, host);
}
1209 
1210 /*
1211  * NLM wrapper to VOP_FRLOCK that checks the validity of the lock before
1212  * invoking the vnode operation.
1213  */
static int
nlm_vop_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	/*
	 * Reject ranges whose end would wrap past the maximum
	 * file offset.  NOTE(review): this detects the overflow
	 * by checking for wraparound of the sum, which presumes
	 * l_start is non-negative here — confirm with callers
	 * (nlm_init_flock builds these from unsigned values).
	 */
	if (bfp->l_len != 0 && bfp->l_start + (bfp->l_len - 1) < bfp->l_start) {
		return (EOVERFLOW);
	}

	return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}
1224