nfs4_dispatch.c revision 22146ea93e24c7deb02c49c33b2ab98605ce78b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
24 */
25
26/*
27 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28 * Use is subject to license terms.
29 */
30
31#include <sys/systm.h>
32#include <sys/sdt.h>
33#include <rpc/types.h>
34#include <rpc/auth.h>
35#include <rpc/auth_unix.h>
36#include <rpc/auth_des.h>
37#include <rpc/svc.h>
38#include <rpc/xdr.h>
39#include <nfs/nfs4.h>
40#include <nfs/nfs_dispatch.h>
41#include <nfs/nfs4_drc.h>
42
/*
 * Highest NFSv4 minor version accepted by this server. A COMPOUND whose
 * minorversion is larger is answered by rfs4_minorvers_mismatch() with
 * NFS4ERR_MINOR_VERS_MISMATCH.
 */
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * This is the duplicate request cache for NFSv4. It is the cache that
 * rfs4_dispatch() hands to rfs4_find_dr() for non-idempotent compounds.
 * It is not set up in this part of the file.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size of the duplicate request cache.
 * NOTE(review): presumably the drc_size handed to rfs4_init_drc(), i.e.
 * a count of cache entries rather than bytes -- confirm against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into. Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

/* Sends an error reply when no DRC entry can be obtained; defined below. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
62
63/*
64 * Initialize a duplicate request cache.
65 */
66rfs4_drc_t *
67rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
68{
69	rfs4_drc_t *drc;
70	uint32_t   bki;
71
72	ASSERT(drc_size);
73	ASSERT(drc_hash_size);
74
75	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
76
77	drc->max_size = drc_size;
78	drc->in_use = 0;
79
80	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
81
82	drc->dr_hash = drc_hash_size;
83
84	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
85
86	for (bki = 0; bki < drc_hash_size; bki++) {
87		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
88		    offsetof(rfs4_dupreq_t, dr_bkt_next));
89	}
90
91	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
92	    offsetof(rfs4_dupreq_t, dr_next));
93
94	return (drc);
95}
96
97/*
98 * Destroy a duplicate request cache.
99 */
100void
101rfs4_fini_drc(rfs4_drc_t *drc)
102{
103	rfs4_dupreq_t *drp, *drp_next;
104
105	ASSERT(drc);
106
107	/* iterate over the dr_cache and free the enties */
108	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
109
110		if (drp->dr_state == NFS4_DUP_REPLAY)
111			rfs4_compound_free(&(drp->dr_res));
112
113		if (drp->dr_addr.buf != NULL)
114			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
115
116		drp_next = list_next(&(drc->dr_cache), drp);
117
118		kmem_free(drp, sizeof (rfs4_dupreq_t));
119	}
120
121	mutex_destroy(&drc->lock);
122	kmem_free(drc->dr_buckets,
123	    sizeof (list_t)*drc->dr_hash);
124	kmem_free(drc, sizeof (rfs4_drc_t));
125}
126
127/*
128 * rfs4_dr_chstate:
129 *
130 * Change the state of a rfs4_dupreq. If it's not in transition
131 * to the FREE state, return. If we are moving to the FREE state
132 * then we need to clean up the compound results and move the entry
133 * to the end of the list.
134 */
135void
136rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
137{
138	rfs4_drc_t *drc;
139
140	ASSERT(drp);
141	ASSERT(drp->drc);
142	ASSERT(drp->dr_bkt);
143	ASSERT(MUTEX_HELD(&drp->drc->lock));
144
145	drp->dr_state = new_state;
146
147	if (new_state != NFS4_DUP_FREE)
148		return;
149
150	drc = drp->drc;
151
152	/*
153	 * Remove entry from the bucket and
154	 * dr_cache list, free compound results.
155	 */
156	list_remove(drp->dr_bkt, drp);
157	list_remove(&(drc->dr_cache), drp);
158	rfs4_compound_free(&(drp->dr_res));
159}
160
161/*
162 * rfs4_alloc_dr:
163 *
164 * Malloc a new one if we have not reached our maximum cache
165 * limit, otherwise pick an entry off the tail -- Use if it
166 * is marked as NFS4_DUP_FREE, or is an entry in the
167 * NFS4_DUP_REPLAY state.
168 */
169rfs4_dupreq_t *
170rfs4_alloc_dr(rfs4_drc_t *drc)
171{
172	rfs4_dupreq_t *drp_tail, *drp = NULL;
173
174	ASSERT(drc);
175	ASSERT(MUTEX_HELD(&drc->lock));
176
177	/*
178	 * Have we hit the cache limit yet ?
179	 */
180	if (drc->in_use < drc->max_size) {
181		/*
182		 * nope, so let's malloc a new one
183		 */
184		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
185		drp->drc = drc;
186		drc->in_use++;
187		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
188		return (drp);
189	}
190
191	/*
192	 * Cache is all allocated now traverse the list
193	 * backwards to find one we can reuse.
194	 */
195	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
196	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
197
198		switch (drp_tail->dr_state) {
199
200		case NFS4_DUP_FREE:
201			list_remove(&(drc->dr_cache), drp_tail);
202			DTRACE_PROBE1(nfss__i__drc_freeclaim,
203			    rfs4_dupreq_t *, drp_tail);
204			return (drp_tail);
205			/* NOTREACHED */
206
207		case NFS4_DUP_REPLAY:
208			/* grab it. */
209			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
210			DTRACE_PROBE1(nfss__i__drc_replayclaim,
211			    rfs4_dupreq_t *, drp_tail);
212			return (drp_tail);
213			/* NOTREACHED */
214		}
215	}
216	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
217	return (NULL);
218}
219
220/*
221 * rfs4_find_dr:
222 *
223 * Search for an entry in the duplicate request cache by
224 * calculating the hash index based on the XID, and examining
225 * the entries in the hash bucket. If we find a match, return.
226 * Once we have searched the bucket we call rfs4_alloc_dr() to
227 * allocate a new entry, or reuse one that is available.
228 */
229int
230rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
231{
232
233	uint32_t	the_xid;
234	list_t		*dr_bkt;
235	rfs4_dupreq_t	*drp;
236	int		bktdex;
237
238	/*
239	 * Get the XID, calculate the bucket and search to
240	 * see if we need to replay from the cache.
241	 */
242	the_xid = req->rq_xprt->xp_xid;
243	bktdex = the_xid % drc->dr_hash;
244
245	dr_bkt = (list_t *)
246	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
247
248	DTRACE_PROBE3(nfss__i__drc_bktdex,
249	    int, bktdex,
250	    uint32_t, the_xid,
251	    list_t *, dr_bkt);
252
253	*dup = NULL;
254
255	mutex_enter(&drc->lock);
256	/*
257	 * Search the bucket for a matching xid and address.
258	 */
259	for (drp = list_head(dr_bkt); drp != NULL;
260	    drp = list_next(dr_bkt, drp)) {
261
262		if (drp->dr_xid == the_xid &&
263		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
264		    bcmp((caddr_t)drp->dr_addr.buf,
265		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
266		    drp->dr_addr.len) == 0) {
267
268			/*
269			 * Found a match so REPLAY the Reply
270			 */
271			if (drp->dr_state == NFS4_DUP_REPLAY) {
272				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
273				mutex_exit(&drc->lock);
274				*dup = drp;
275				DTRACE_PROBE1(nfss__i__drc_replay,
276				    rfs4_dupreq_t *, drp);
277				return (NFS4_DUP_REPLAY);
278			}
279
280			/*
281			 * This entry must be in transition, so return
282			 * the 'pending' status.
283			 */
284			mutex_exit(&drc->lock);
285			return (NFS4_DUP_PENDING);
286		}
287	}
288
289	drp = rfs4_alloc_dr(drc);
290	mutex_exit(&drc->lock);
291
292	/*
293	 * The DRC is full and all entries are in use. Upper function
294	 * should error out this request and force the client to
295	 * retransmit -- effectively this is a resource issue. NFSD
296	 * threads tied up with native File System, or the cache size
297	 * is too small for the server load.
298	 */
299	if (drp == NULL)
300		return (NFS4_DUP_ERROR);
301
302	/*
303	 * Init the state to NEW.
304	 */
305	drp->dr_state = NFS4_DUP_NEW;
306
307	/*
308	 * If needed, resize the address buffer
309	 */
310	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
311		if (drp->dr_addr.buf != NULL)
312			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
313		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
314		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
315		if (drp->dr_addr.buf == NULL) {
316			/*
317			 * If the malloc fails, mark the entry
318			 * as free and put on the tail.
319			 */
320			drp->dr_addr.maxlen = 0;
321			drp->dr_state = NFS4_DUP_FREE;
322			mutex_enter(&drc->lock);
323			list_insert_tail(&(drc->dr_cache), drp);
324			mutex_exit(&drc->lock);
325			return (NFS4_DUP_ERROR);
326		}
327	}
328
329
330	/*
331	 * Copy the address.
332	 */
333	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
334
335	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
336	    (caddr_t)drp->dr_addr.buf,
337	    drp->dr_addr.len);
338
339	drp->dr_xid = the_xid;
340	drp->dr_bkt = dr_bkt;
341
342	/*
343	 * Insert at the head of the bucket and
344	 * the drc lists..
345	 */
346	mutex_enter(&drc->lock);
347	list_insert_head(&drc->dr_cache, drp);
348	list_insert_head(dr_bkt, drp);
349	mutex_exit(&drc->lock);
350
351	*dup = drp;
352
353	return (NFS4_DUP_NEW);
354}
355
356/*
357 *
358 * This function handles the duplicate request cache,
359 * NULL_PROC and COMPOUND procedure calls for NFSv4;
360 *
361 * Passed into this function are:-
362 *
363 * 	disp	A pointer to our dispatch table entry
364 * 	req	The request to process
365 * 	xprt	The server transport handle
366 * 	ap	A pointer to the arguments
367 *	rlen	A pointer to the reply length (output)
368 *
369 *
370 * When appropriate this function is responsible for inserting
371 * the reply into the duplicate cache or replaying an existing
372 * cached reply.
373 *
374 * dr_stat 	reflects the state of the duplicate request that
375 * 		has been inserted into or retrieved from the cache
376 *
377 * drp		is the duplicate request entry
378 *
379 */
380int
381rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
382    SVCXPRT *xprt, char *ap, size_t *rlen)
383{
384
385	COMPOUND4res	 res_buf;
386	COMPOUND4res	*rbp;
387	COMPOUND4args	*cap;
388	cred_t		*cr = NULL;
389	int		 error = 0;
390	int		 dis_flags = 0;
391	int		 dr_stat = NFS4_NOT_DUP;
392	rfs4_dupreq_t	*drp = NULL;
393	int		 rv;
394
395	ASSERT(disp);
396
397	/*
398	 * Short circuit the RPC_NULL proc.
399	 */
400	if (disp->dis_proc == rpc_null) {
401		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
402		if (!svc_sendreply(xprt, xdr_void, NULL)) {
403			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
404			svcerr_systemerr(xprt);
405			return (1);
406		}
407		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
408		*rlen = xdr_sizeof(xdr_void, NULL);
409		return (0);
410	}
411
412	/* Only NFSv4 Compounds from this point onward */
413
414	rbp = &res_buf;
415	cap = (COMPOUND4args *)ap;
416
417	/*
418	 * Update kstats
419	 */
420	rfs4_compound_kstat_args(cap);
421
422	/*
423	 * Figure out the disposition of the whole COMPOUND
424	 * and record it's IDEMPOTENTCY.
425	 */
426	rfs4_compound_flagproc(cap, &dis_flags);
427
428	/*
429	 * If NON-IDEMPOTENT then we need to figure out if this
430	 * request can be replied from the duplicate cache.
431	 *
432	 * If this is a new request then we need to insert the
433	 * reply into the duplicate cache.
434	 */
435	if (!(dis_flags & RPC_IDEMPOTENT)) {
436		/* look for a replay from the cache or allocate */
437		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
438
439		switch (dr_stat) {
440
441		case NFS4_DUP_ERROR:
442			rfs4_resource_err(req, cap);
443			return (1);
444			/* NOTREACHED */
445
446		case NFS4_DUP_PENDING:
447			/*
448			 * reply has previously been inserted into the
449			 * duplicate cache, however the reply has
450			 * not yet been sent via svc_sendreply()
451			 */
452			return (1);
453			/* NOTREACHED */
454
455		case NFS4_DUP_NEW:
456			curthread->t_flag |= T_DONTPEND;
457			/* NON-IDEMPOTENT proc call */
458			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
459			curthread->t_flag &= ~T_DONTPEND;
460
461			if (rv)		/* short ckt sendreply on error */
462				return (rv);
463
464			/*
465			 * dr_res must be initialized before calling
466			 * rfs4_dr_chstate (it frees the reply).
467			 */
468			drp->dr_res = res_buf;
469			if (curthread->t_flag & T_WOULDBLOCK) {
470				curthread->t_flag &= ~T_WOULDBLOCK;
471				/*
472				 * mark this entry as FREE and plop
473				 * on the end of the cache list
474				 */
475				mutex_enter(&drp->drc->lock);
476				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
477				list_insert_tail(&(drp->drc->dr_cache), drp);
478				mutex_exit(&drp->drc->lock);
479				return (1);
480			}
481			break;
482
483		case NFS4_DUP_REPLAY:
484			/* replay from the cache */
485			rbp = &(drp->dr_res);
486			break;
487		}
488	} else {
489		curthread->t_flag |= T_DONTPEND;
490		/* IDEMPOTENT proc call */
491		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
492		curthread->t_flag &= ~T_DONTPEND;
493
494		if (rv)		/* short ckt sendreply on error */
495			return (rv);
496
497		if (curthread->t_flag & T_WOULDBLOCK) {
498			curthread->t_flag &= ~T_WOULDBLOCK;
499			return (1);
500		}
501	}
502
503	/*
504	 * Send out the replayed reply or the 'real' one.
505	 */
506	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
507		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
508		    struct svc_req *, xprt,
509		    char *, rbp);
510		svcerr_systemerr(xprt);
511		error++;
512	} else {
513		/*
514		 * Update kstats
515		 */
516		rfs4_compound_kstat_res(rbp);
517		*rlen = xdr_sizeof(xdr_COMPOUND4res_srv, rbp);
518	}
519
520	/*
521	 * If this reply was just inserted into the duplicate cache
522	 * or it was replayed from the dup cache; (re)mark it as
523	 * available for replay
524	 *
525	 * At first glance, this 'if' statement seems a little strange;
526	 * testing for NFS4_DUP_REPLAY, and then calling...
527	 *
528	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
529	 *
530	 * ... but notice that we are checking dr_stat, and not the
531	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
532	 * we do that so that we know not to prematurely reap it whilst
533	 * we resent it to the client.
534	 *
535	 */
536	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
537		mutex_enter(&drp->drc->lock);
538		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
539		mutex_exit(&drp->drc->lock);
540	} else if (dr_stat == NFS4_NOT_DUP) {
541		rfs4_compound_free(rbp);
542	}
543
544	return (error);
545}
546
547bool_t
548rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
549{
550	COMPOUND4args *argsp;
551	COMPOUND4res res_buf, *resp;
552
553	if (req->rq_vers != 4)
554		return (FALSE);
555
556	argsp = (COMPOUND4args *)args;
557
558	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
559		return (FALSE);
560
561	resp = &res_buf;
562
563	/*
564	 * Form a reply tag by copying over the reqeuest tag.
565	 */
566	resp->tag.utf8string_val =
567	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
568	resp->tag.utf8string_len = argsp->tag.utf8string_len;
569	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
570	    resp->tag.utf8string_len);
571	resp->array_len = 0;
572	resp->array = NULL;
573	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
574	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
575		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
576		    SVCXPRT *, xprt, char *, resp);
577		svcerr_systemerr(xprt);
578	}
579	rfs4_compound_free(resp);
580	return (TRUE);
581}
582
583void
584rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
585{
586	COMPOUND4res res_buf, *rbp;
587	nfs_resop4 *resop;
588	PUTFH4res *resp;
589
590	rbp = &res_buf;
591
592	/*
593	 * Form a reply tag by copying over the request tag.
594	 */
595	rbp->tag.utf8string_val =
596	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
597	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
598	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
599	    rbp->tag.utf8string_len);
600
601	rbp->array_len = 1;
602	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
603	    KM_SLEEP);
604	resop = &rbp->array[0];
605	resop->resop = argsp->array[0].argop;	/* copy first op over */
606
607	/* Any op will do, just need to access status field */
608	resp = &resop->nfs_resop4_u.opputfh;
609
610	/*
611	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
612	 * Note that all op numbers in the compound array were already
613	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
614	 */
615	resp->status = (resop->resop == OP_ILLEGAL ?
616	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
617
618	/* compound status is same as first op status */
619	rbp->status = resp->status;
620
621	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
622		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
623		    struct svc_req *, req->rq_xprt, char *, rbp);
624		svcerr_systemerr(req->rq_xprt);
625	}
626
627	UTF8STRING_FREE(rbp->tag);
628	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
629}
630