1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 /*
27  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #include <sys/systm.h>
32 #include <sys/sdt.h>
33 #include <rpc/types.h>
34 #include <rpc/auth.h>
35 #include <rpc/auth_unix.h>
36 #include <rpc/auth_des.h>
37 #include <rpc/svc.h>
38 #include <rpc/xdr.h>
39 #include <nfs/nfs4.h>
40 #include <nfs/nfs_dispatch.h>
41 #include <nfs/nfs4_drc.h>
42 
43 #define	NFS4_MAX_MINOR_VERSION	0
44 
45 /*
46  * This is the duplicate request cache for NFSv4
47  */
48 rfs4_drc_t *nfs4_drc = NULL;
49 
50 /*
51  * The default size of the duplicate request cache
52  */
53 uint32_t nfs4_drc_max = 8 * 1024;
54 
/*
 * The number of hash buckets the cached replies are spread across.
 * Do not change this on the fly.
 */
59 uint32_t nfs4_drc_hash = 541;
60 
61 static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
62 
63 /*
64  * Initialize a duplicate request cache.
65  */
66 rfs4_drc_t *
67 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
68 {
69 	rfs4_drc_t *drc;
70 	uint32_t   bki;
71 
72 	ASSERT(drc_size);
73 	ASSERT(drc_hash_size);
74 
75 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
76 
77 	drc->max_size = drc_size;
78 	drc->in_use = 0;
79 
80 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
81 
82 	drc->dr_hash = drc_hash_size;
83 
84 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
85 
86 	for (bki = 0; bki < drc_hash_size; bki++) {
87 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
88 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
89 	}
90 
91 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
92 	    offsetof(rfs4_dupreq_t, dr_next));
93 
94 	return (drc);
95 }
96 
97 /*
98  * Destroy a duplicate request cache.
99  */
100 void
101 rfs4_fini_drc(rfs4_drc_t *drc)
102 {
103 	rfs4_dupreq_t *drp, *drp_next;
104 
105 	ASSERT(drc);
106 
107 	/* iterate over the dr_cache and free the enties */
108 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
109 
110 		if (drp->dr_state == NFS4_DUP_REPLAY)
111 			rfs4_compound_free(&(drp->dr_res));
112 
113 		if (drp->dr_addr.buf != NULL)
114 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
115 
116 		drp_next = list_next(&(drc->dr_cache), drp);
117 
118 		kmem_free(drp, sizeof (rfs4_dupreq_t));
119 	}
120 
121 	mutex_destroy(&drc->lock);
122 	kmem_free(drc->dr_buckets,
123 	    sizeof (list_t)*drc->dr_hash);
124 	kmem_free(drc, sizeof (rfs4_drc_t));
125 }
126 
127 /*
128  * rfs4_dr_chstate:
129  *
130  * Change the state of a rfs4_dupreq. If it's not in transition
131  * to the FREE state, return. If we are moving to the FREE state
132  * then we need to clean up the compound results and move the entry
133  * to the end of the list.
134  */
135 void
136 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
137 {
138 	rfs4_drc_t *drc;
139 
140 	ASSERT(drp);
141 	ASSERT(drp->drc);
142 	ASSERT(drp->dr_bkt);
143 	ASSERT(MUTEX_HELD(&drp->drc->lock));
144 
145 	drp->dr_state = new_state;
146 
147 	if (new_state != NFS4_DUP_FREE)
148 		return;
149 
150 	drc = drp->drc;
151 
152 	/*
153 	 * Remove entry from the bucket and
154 	 * dr_cache list, free compound results.
155 	 */
156 	list_remove(drp->dr_bkt, drp);
157 	list_remove(&(drc->dr_cache), drp);
158 	rfs4_compound_free(&(drp->dr_res));
159 }
160 
161 /*
162  * rfs4_alloc_dr:
163  *
164  * Malloc a new one if we have not reached our maximum cache
165  * limit, otherwise pick an entry off the tail -- Use if it
166  * is marked as NFS4_DUP_FREE, or is an entry in the
167  * NFS4_DUP_REPLAY state.
168  */
169 rfs4_dupreq_t *
170 rfs4_alloc_dr(rfs4_drc_t *drc)
171 {
172 	rfs4_dupreq_t *drp_tail, *drp = NULL;
173 
174 	ASSERT(drc);
175 	ASSERT(MUTEX_HELD(&drc->lock));
176 
177 	/*
178 	 * Have we hit the cache limit yet ?
179 	 */
180 	if (drc->in_use < drc->max_size) {
181 		/*
182 		 * nope, so let's malloc a new one
183 		 */
184 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
185 		drp->drc = drc;
186 		drc->in_use++;
187 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
188 		return (drp);
189 	}
190 
191 	/*
192 	 * Cache is all allocated now traverse the list
193 	 * backwards to find one we can reuse.
194 	 */
195 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
196 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
197 
198 		switch (drp_tail->dr_state) {
199 
200 		case NFS4_DUP_FREE:
201 			list_remove(&(drc->dr_cache), drp_tail);
202 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
203 			    rfs4_dupreq_t *, drp_tail);
204 			return (drp_tail);
205 			/* NOTREACHED */
206 
207 		case NFS4_DUP_REPLAY:
208 			/* grab it. */
209 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
210 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
211 			    rfs4_dupreq_t *, drp_tail);
212 			return (drp_tail);
213 			/* NOTREACHED */
214 		}
215 	}
216 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
217 	return (NULL);
218 }
219 
220 /*
221  * rfs4_find_dr:
222  *
223  * Search for an entry in the duplicate request cache by
224  * calculating the hash index based on the XID, and examining
225  * the entries in the hash bucket. If we find a match, return.
226  * Once we have searched the bucket we call rfs4_alloc_dr() to
227  * allocate a new entry, or reuse one that is available.
228  */
229 int
230 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
231 {
232 
233 	uint32_t	the_xid;
234 	list_t		*dr_bkt;
235 	rfs4_dupreq_t	*drp;
236 	int		bktdex;
237 
238 	/*
239 	 * Get the XID, calculate the bucket and search to
240 	 * see if we need to replay from the cache.
241 	 */
242 	the_xid = req->rq_xprt->xp_xid;
243 	bktdex = the_xid % drc->dr_hash;
244 
245 	dr_bkt = (list_t *)
246 	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
247 
248 	DTRACE_PROBE3(nfss__i__drc_bktdex,
249 	    int, bktdex,
250 	    uint32_t, the_xid,
251 	    list_t *, dr_bkt);
252 
253 	*dup = NULL;
254 
255 	mutex_enter(&drc->lock);
256 	/*
257 	 * Search the bucket for a matching xid and address.
258 	 */
259 	for (drp = list_head(dr_bkt); drp != NULL;
260 	    drp = list_next(dr_bkt, drp)) {
261 
262 		if (drp->dr_xid == the_xid &&
263 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
264 		    bcmp((caddr_t)drp->dr_addr.buf,
265 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
266 		    drp->dr_addr.len) == 0) {
267 
268 			/*
269 			 * Found a match so REPLAY the Reply
270 			 */
271 			if (drp->dr_state == NFS4_DUP_REPLAY) {
272 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
273 				mutex_exit(&drc->lock);
274 				*dup = drp;
275 				DTRACE_PROBE1(nfss__i__drc_replay,
276 				    rfs4_dupreq_t *, drp);
277 				return (NFS4_DUP_REPLAY);
278 			}
279 
280 			/*
281 			 * This entry must be in transition, so return
282 			 * the 'pending' status.
283 			 */
284 			mutex_exit(&drc->lock);
285 			return (NFS4_DUP_PENDING);
286 		}
287 	}
288 
289 	drp = rfs4_alloc_dr(drc);
290 	mutex_exit(&drc->lock);
291 
292 	/*
293 	 * The DRC is full and all entries are in use. Upper function
294 	 * should error out this request and force the client to
295 	 * retransmit -- effectively this is a resource issue. NFSD
296 	 * threads tied up with native File System, or the cache size
297 	 * is too small for the server load.
298 	 */
299 	if (drp == NULL)
300 		return (NFS4_DUP_ERROR);
301 
302 	/*
303 	 * Init the state to NEW.
304 	 */
305 	drp->dr_state = NFS4_DUP_NEW;
306 
307 	/*
308 	 * If needed, resize the address buffer
309 	 */
310 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
311 		if (drp->dr_addr.buf != NULL)
312 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
313 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
314 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
315 		if (drp->dr_addr.buf == NULL) {
316 			/*
317 			 * If the malloc fails, mark the entry
318 			 * as free and put on the tail.
319 			 */
320 			drp->dr_addr.maxlen = 0;
321 			drp->dr_state = NFS4_DUP_FREE;
322 			mutex_enter(&drc->lock);
323 			list_insert_tail(&(drc->dr_cache), drp);
324 			mutex_exit(&drc->lock);
325 			return (NFS4_DUP_ERROR);
326 		}
327 	}
328 
329 
330 	/*
331 	 * Copy the address.
332 	 */
333 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
334 
335 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
336 	    (caddr_t)drp->dr_addr.buf,
337 	    drp->dr_addr.len);
338 
339 	drp->dr_xid = the_xid;
340 	drp->dr_bkt = dr_bkt;
341 
342 	/*
343 	 * Insert at the head of the bucket and
344 	 * the drc lists..
345 	 */
346 	mutex_enter(&drc->lock);
347 	list_insert_head(&drc->dr_cache, drp);
348 	list_insert_head(dr_bkt, drp);
349 	mutex_exit(&drc->lock);
350 
351 	*dup = drp;
352 
353 	return (NFS4_DUP_NEW);
354 }
355 
356 /*
357  *
358  * This function handles the duplicate request cache,
359  * NULL_PROC and COMPOUND procedure calls for NFSv4;
360  *
361  * Passed into this function are:-
362  *
363  * 	disp	A pointer to our dispatch table entry
364  * 	req	The request to process
365  * 	xprt	The server transport handle
366  * 	ap	A pointer to the arguments
367  *	rlen	A pointer to the reply length (output)
368  *
369  *
370  * When appropriate this function is responsible for inserting
371  * the reply into the duplicate cache or replaying an existing
372  * cached reply.
373  *
374  * dr_stat 	reflects the state of the duplicate request that
375  * 		has been inserted into or retrieved from the cache
376  *
377  * drp		is the duplicate request entry
378  *
379  */
380 int
381 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
382     SVCXPRT *xprt, char *ap, size_t *rlen)
383 {
384 
385 	COMPOUND4res	 res_buf;
386 	COMPOUND4res	*rbp;
387 	COMPOUND4args	*cap;
388 	cred_t		*cr = NULL;
389 	int		 error = 0;
390 	int		 dis_flags = 0;
391 	int		 dr_stat = NFS4_NOT_DUP;
392 	rfs4_dupreq_t	*drp = NULL;
393 	int		 rv;
394 
395 	ASSERT(disp);
396 
397 	/*
398 	 * Short circuit the RPC_NULL proc.
399 	 */
400 	if (disp->dis_proc == rpc_null) {
401 		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
402 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
403 			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
404 			svcerr_systemerr(xprt);
405 			return (1);
406 		}
407 		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
408 		*rlen = xdr_sizeof(xdr_void, NULL);
409 		return (0);
410 	}
411 
412 	/* Only NFSv4 Compounds from this point onward */
413 
414 	rbp = &res_buf;
415 	cap = (COMPOUND4args *)ap;
416 
417 	/*
418 	 * Update kstats
419 	 */
420 	rfs4_compound_kstat_args(cap);
421 
422 	/*
423 	 * Figure out the disposition of the whole COMPOUND
424 	 * and record it's IDEMPOTENTCY.
425 	 */
426 	rfs4_compound_flagproc(cap, &dis_flags);
427 
428 	/*
429 	 * If NON-IDEMPOTENT then we need to figure out if this
430 	 * request can be replied from the duplicate cache.
431 	 *
432 	 * If this is a new request then we need to insert the
433 	 * reply into the duplicate cache.
434 	 */
435 	if (!(dis_flags & RPC_IDEMPOTENT)) {
436 		/* look for a replay from the cache or allocate */
437 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
438 
439 		switch (dr_stat) {
440 
441 		case NFS4_DUP_ERROR:
442 			rfs4_resource_err(req, cap);
443 			return (1);
444 			/* NOTREACHED */
445 
446 		case NFS4_DUP_PENDING:
447 			/*
448 			 * reply has previously been inserted into the
449 			 * duplicate cache, however the reply has
450 			 * not yet been sent via svc_sendreply()
451 			 */
452 			return (1);
453 			/* NOTREACHED */
454 
455 		case NFS4_DUP_NEW:
456 			curthread->t_flag |= T_DONTPEND;
457 			/* NON-IDEMPOTENT proc call */
458 			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
459 			curthread->t_flag &= ~T_DONTPEND;
460 
461 			if (rv)		/* short ckt sendreply on error */
462 				return (rv);
463 
464 			/*
465 			 * dr_res must be initialized before calling
466 			 * rfs4_dr_chstate (it frees the reply).
467 			 */
468 			drp->dr_res = res_buf;
469 			if (curthread->t_flag & T_WOULDBLOCK) {
470 				curthread->t_flag &= ~T_WOULDBLOCK;
471 				/*
472 				 * mark this entry as FREE and plop
473 				 * on the end of the cache list
474 				 */
475 				mutex_enter(&drp->drc->lock);
476 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
477 				list_insert_tail(&(drp->drc->dr_cache), drp);
478 				mutex_exit(&drp->drc->lock);
479 				return (1);
480 			}
481 			break;
482 
483 		case NFS4_DUP_REPLAY:
484 			/* replay from the cache */
485 			rbp = &(drp->dr_res);
486 			break;
487 		}
488 	} else {
489 		curthread->t_flag |= T_DONTPEND;
490 		/* IDEMPOTENT proc call */
491 		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
492 		curthread->t_flag &= ~T_DONTPEND;
493 
494 		if (rv)		/* short ckt sendreply on error */
495 			return (rv);
496 
497 		if (curthread->t_flag & T_WOULDBLOCK) {
498 			curthread->t_flag &= ~T_WOULDBLOCK;
499 			return (1);
500 		}
501 	}
502 
503 	/*
504 	 * Send out the replayed reply or the 'real' one.
505 	 */
506 	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
507 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
508 		    struct svc_req *, xprt,
509 		    char *, rbp);
510 		svcerr_systemerr(xprt);
511 		error++;
512 	} else {
513 		/*
514 		 * Update kstats
515 		 */
516 		rfs4_compound_kstat_res(rbp);
517 		*rlen = xdr_sizeof(xdr_COMPOUND4res_srv, rbp);
518 	}
519 
520 	/*
521 	 * If this reply was just inserted into the duplicate cache
522 	 * or it was replayed from the dup cache; (re)mark it as
523 	 * available for replay
524 	 *
525 	 * At first glance, this 'if' statement seems a little strange;
526 	 * testing for NFS4_DUP_REPLAY, and then calling...
527 	 *
528 	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
529 	 *
530 	 * ... but notice that we are checking dr_stat, and not the
531 	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
532 	 * we do that so that we know not to prematurely reap it whilst
533 	 * we resent it to the client.
534 	 *
535 	 */
536 	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
537 		mutex_enter(&drp->drc->lock);
538 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
539 		mutex_exit(&drp->drc->lock);
540 	} else if (dr_stat == NFS4_NOT_DUP) {
541 		rfs4_compound_free(rbp);
542 	}
543 
544 	return (error);
545 }
546 
547 bool_t
548 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
549 {
550 	COMPOUND4args *argsp;
551 	COMPOUND4res res_buf, *resp;
552 
553 	if (req->rq_vers != 4)
554 		return (FALSE);
555 
556 	argsp = (COMPOUND4args *)args;
557 
558 	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
559 		return (FALSE);
560 
561 	resp = &res_buf;
562 
563 	/*
564 	 * Form a reply tag by copying over the reqeuest tag.
565 	 */
566 	resp->tag.utf8string_val =
567 	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
568 	resp->tag.utf8string_len = argsp->tag.utf8string_len;
569 	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
570 	    resp->tag.utf8string_len);
571 	resp->array_len = 0;
572 	resp->array = NULL;
573 	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
574 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
575 		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
576 		    SVCXPRT *, xprt, char *, resp);
577 		svcerr_systemerr(xprt);
578 	}
579 	rfs4_compound_free(resp);
580 	return (TRUE);
581 }
582 
583 void
584 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
585 {
586 	COMPOUND4res res_buf, *rbp;
587 	nfs_resop4 *resop;
588 	PUTFH4res *resp;
589 
590 	rbp = &res_buf;
591 
592 	/*
593 	 * Form a reply tag by copying over the request tag.
594 	 */
595 	rbp->tag.utf8string_val =
596 	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
597 	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
598 	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
599 	    rbp->tag.utf8string_len);
600 
601 	rbp->array_len = 1;
602 	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
603 	    KM_SLEEP);
604 	resop = &rbp->array[0];
605 	resop->resop = argsp->array[0].argop;	/* copy first op over */
606 
607 	/* Any op will do, just need to access status field */
608 	resp = &resop->nfs_resop4_u.opputfh;
609 
610 	/*
611 	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
612 	 * Note that all op numbers in the compound array were already
613 	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
614 	 */
615 	resp->status = (resop->resop == OP_ILLEGAL ?
616 	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
617 
618 	/* compound status is same as first op status */
619 	rbp->status = resp->status;
620 
621 	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
622 		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
623 		    struct svc_req *, req->rq_xprt, char *, rbp);
624 		svcerr_systemerr(req->rq_xprt);
625 	}
626 
627 	UTF8STRING_FREE(rbp->tag);
628 	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
629 }
630