nfs4_dispatch.c revision 7d12f3bc086bd094c3bf327e7bd04f94701e1c69
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/systm.h>
30#include <sys/sdt.h>
31#include <rpc/types.h>
32#include <rpc/auth.h>
33#include <rpc/auth_unix.h>
34#include <rpc/auth_des.h>
35#include <rpc/svc.h>
36#include <rpc/xdr.h>
37#include <nfs/nfs4.h>
38#include <nfs/nfs_dispatch.h>
39#include <nfs/nfs4_drc.h>
40
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * This is the duplicate request cache (DRC) for NFSv4.  It caches the
 * replies of non-idempotent compounds so that client retransmissions
 * can be replayed rather than re-executed.  Allocated at server
 * start-up; NULL until then.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default maximum number of entries in the duplicate request cache.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.  Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;
58
59/*
60 * Initialize a duplicate request cache.
61 */
62rfs4_drc_t *
63rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
64{
65	rfs4_drc_t *drc;
66	uint32_t   bki;
67
68	ASSERT(drc_size);
69	ASSERT(drc_hash_size);
70
71	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
72
73	drc->max_size = drc_size;
74	drc->in_use = 0;
75
76	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
77
78	drc->dr_hash = drc_hash_size;
79
80	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
81
82	for (bki = 0; bki < drc_hash_size; bki++) {
83		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
84		    offsetof(rfs4_dupreq_t, dr_bkt_next));
85	}
86
87	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
88		    offsetof(rfs4_dupreq_t, dr_next));
89
90	return (drc);
91}
92
93/*
94 * Destroy a duplicate request cache.
95 */
96void
97rfs4_fini_drc(rfs4_drc_t *drc)
98{
99	rfs4_dupreq_t *drp, *drp_next;
100
101	ASSERT(drc);
102
103	/* iterate over the dr_cache and free the enties */
104	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
105
106		if (drp->dr_state == NFS4_DUP_REPLAY)
107			rfs4_compound_free(&(drp->dr_res));
108
109		if (drp->dr_addr.buf != NULL)
110			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
111
112		drp_next = list_next(&(drc->dr_cache), drp);
113
114		kmem_free(drp, sizeof (rfs4_dupreq_t));
115	}
116
117	mutex_destroy(&drc->lock);
118	kmem_free(drc->dr_buckets,
119		sizeof (list_t)*drc->dr_hash);
120	kmem_free(drc, sizeof (rfs4_drc_t));
121}
122
123/*
124 * rfs4_dr_chstate:
125 *
126 * Change the state of a rfs4_dupreq. If it's not in transition
127 * to the FREE state, update the time used and return. If we
128 * are moving to the FREE state then we need to clean up the
129 * compound results and move the entry to the end of the list.
130 */
131void
132rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
133{
134	rfs4_drc_t *drc;
135
136	ASSERT(drp);
137	ASSERT(drp->drc);
138	ASSERT(drp->dr_bkt);
139	ASSERT(MUTEX_HELD(&drp->drc->lock));
140
141	drp->dr_state = new_state;
142
143	if (new_state != NFS4_DUP_FREE) {
144		gethrestime(&drp->dr_time_used);
145		return;
146	}
147
148	drc = drp->drc;
149
150	/*
151	 * Remove entry from the bucket and
152	 * dr_cache list, free compound results.
153	 */
154	list_remove(drp->dr_bkt, drp);
155	list_remove(&(drc->dr_cache), drp);
156	rfs4_compound_free(&(drp->dr_res));
157}
158
159/*
160 * rfs4_alloc_dr:
161 *
162 * Malloc a new one if we have not reached our maximum cache
163 * limit, otherwise pick an entry off the tail -- Use if it
164 * is marked as NFS4_DUP_FREE, or is an entry in the
165 * NFS4_DUP_REPLAY state.
166 */
167rfs4_dupreq_t *
168rfs4_alloc_dr(rfs4_drc_t *drc)
169{
170	rfs4_dupreq_t *drp_tail, *drp = NULL;
171
172	ASSERT(drc);
173	ASSERT(MUTEX_HELD(&drc->lock));
174
175	/*
176	 * Have we hit the cache limit yet ?
177	 */
178	if (drc->in_use < drc->max_size) {
179		/*
180		 * nope, so let's malloc a new one
181		 */
182		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
183		drp->drc = drc;
184		drc->in_use++;
185		gethrestime(&drp->dr_time_created);
186		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
187		return (drp);
188	}
189
190	/*
191	 * Cache is all allocated now traverse the list
192	 * backwards to find one we can reuse.
193	 */
194	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
195	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
196
197		switch (drp_tail->dr_state) {
198
199		case NFS4_DUP_FREE:
200			list_remove(&(drc->dr_cache), drp_tail);
201			DTRACE_PROBE1(nfss__i__drc_freeclaim,
202					rfs4_dupreq_t *, drp_tail);
203			return (drp_tail);
204			/* NOTREACHED */
205
206		case NFS4_DUP_REPLAY:
207			/* grab it. */
208			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
209			DTRACE_PROBE1(nfss__i__drc_replayclaim,
210					rfs4_dupreq_t *, drp_tail);
211			return (drp_tail);
212			/* NOTREACHED */
213		}
214	}
215	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
216	return (NULL);
217}
218
219/*
220 * rfs4_find_dr:
221 *
222 * Search for an entry in the duplicate request cache by
223 * calculating the hash index based on the XID, and examining
224 * the entries in the hash bucket. If we find a match stamp the
225 * time_used and return. If the entry does not match it could be
226 * ready to be freed. Once we have searched the bucket we call
227 * rfs4_alloc_dr() to allocate a new entry, or reuse one that is
228 * available.
229 */
230int
231rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
232{
233
234	uint32_t	the_xid;
235	list_t		*dr_bkt;
236	rfs4_dupreq_t	*drp;
237	int		bktdex;
238
239	/*
240	 * Get the XID, calculate the bucket and search to
241	 * see if we need to replay from the cache.
242	 */
243	the_xid = req->rq_xprt->xp_xid;
244	bktdex = the_xid % drc->dr_hash;
245
246	dr_bkt = (list_t *)
247		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
248
249	DTRACE_PROBE3(nfss__i__drc_bktdex,
250			int, bktdex,
251			uint32_t, the_xid,
252			list_t *, dr_bkt);
253
254	*dup = NULL;
255
256	mutex_enter(&drc->lock);
257	/*
258	 * Search the bucket for a matching xid and address.
259	 */
260	for (drp = list_head(dr_bkt); drp != NULL;
261		drp = list_next(dr_bkt, drp)) {
262
263		if (drp->dr_xid == the_xid &&
264		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
265		    bcmp((caddr_t)drp->dr_addr.buf,
266		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
267		    drp->dr_addr.len) == 0) {
268
269			/*
270			 * Found a match so REPLAY the Reply
271			 */
272			if (drp->dr_state == NFS4_DUP_REPLAY) {
273				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
274				mutex_exit(&drc->lock);
275				*dup = drp;
276				DTRACE_PROBE1(nfss__i__drc_replay,
277					rfs4_dupreq_t *, drp);
278				return (NFS4_DUP_REPLAY);
279			}
280
281			/*
282			 * This entry must be in transition, so return
283			 * the 'pending' status.
284			 */
285			mutex_exit(&drc->lock);
286			return (NFS4_DUP_PENDING);
287		}
288
289		/*
290		 * Not a match, but maybe this entry is okay
291		 * to be reused.
292		 */
293		if (drp->dr_state == NFS4_DUP_REPLAY) {
294			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
295			list_insert_tail(&(drp->drc->dr_cache), drp);
296		}
297	}
298
299	drp = rfs4_alloc_dr(drc);
300	mutex_exit(&drc->lock);
301
302	/*
303	 * The DRC is full and all entries are in use. Upper function
304	 * should error out this request and force the client to
305	 * retransmit -- effectively this is a resource issue. NFSD
306	 * threads tied up with native File System, or the cache size
307	 * is too small for the server load.
308	 */
309	if (drp == NULL)
310		return (NFS4_DUP_ERROR);
311
312	/*
313	 * Init the state to NEW and clear the time used field.
314	 */
315	drp->dr_state = NFS4_DUP_NEW;
316	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
317
318	/*
319	 * If needed, resize the address buffer
320	 */
321	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
322		if (drp->dr_addr.buf != NULL)
323			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
324		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
325		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
326		if (drp->dr_addr.buf == NULL) {
327			/*
328			 * If the malloc fails, mark the entry
329			 * as free and put on the tail.
330			 */
331			drp->dr_addr.maxlen = 0;
332			drp->dr_state = NFS4_DUP_FREE;
333			mutex_enter(&drc->lock);
334			list_insert_tail(&(drc->dr_cache), drp);
335			mutex_exit(&drc->lock);
336			return (NFS4_DUP_ERROR);
337		}
338	}
339
340
341	/*
342	 * Copy the address.
343	 */
344	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
345
346	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
347		(caddr_t)drp->dr_addr.buf,
348		drp->dr_addr.len);
349
350	drp->dr_xid = the_xid;
351	drp->dr_bkt = dr_bkt;
352
353	/*
354	 * Insert at the head of the bucket and
355	 * the drc lists..
356	 */
357	mutex_enter(&drc->lock);
358	list_insert_head(&drc->dr_cache, drp);
359	list_insert_head(dr_bkt, drp);
360	mutex_exit(&drc->lock);
361
362	*dup = drp;
363
364	return (NFS4_DUP_NEW);
365}
366
367/*
368 *
369 * This function handles the duplicate request cache,
370 * NULL_PROC and COMPOUND procedure calls for NFSv4;
371 *
372 * Passed into this function are:-
373 *
374 * 	disp	A pointer to our dispatch table entry
375 * 	req	The request to process
376 * 	xprt	The server transport handle
377 * 	ap	A pointer to the arguments
378 *
379 *
380 * When appropriate this function is responsible for inserting
381 * the reply into the duplicate cache or replaying an existing
382 * cached reply.
383 *
384 * dr_stat 	reflects the state of the duplicate request that
385 * 		has been inserted into or retrieved from the cache
386 *
387 * drp		is the duplicate request entry
388 *
389 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
		SVCXPRT *xprt, char *ap)
{

	COMPOUND4res res_buf, *rbp;
	COMPOUND4args *cap;

	cred_t 	*cr = NULL;
	int	error = 0;
	int 	dis_flags = 0;
	int 	dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			return (1);
		}
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record it's IDEMPOTENTCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			/*
			 * Could not get a cache slot; tell the client
			 * to back off and retransmit.
			 */
			svcerr_systemerr(xprt);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			/*
			 * T_DONTPEND is set so blocking allocations
			 * fail (setting T_WOULDBLOCK) rather than
			 * stall this nfsd thread -- see the check
			 * below.  NOTE(review): semantics inferred
			 * from the T_WOULDBLOCK handling here; confirm
			 * against thread flag docs.
			 */
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr);

			curthread->t_flag &= ~T_DONTPEND;

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr);

		curthread->t_flag &= ~T_DONTPEND;
		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
			struct svc_req *, xprt,
			char *, rbp);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache; (re)mark it as
	 * available for replay
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
	 * we do that so that we know not to prematurely reap it whilst
	 * we resent it to the client.
	 *
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		/*
		 * Reply was never cached (idempotent compound), so
		 * free the results here instead.
		 */
		rfs4_compound_free(rbp);
	}

	return (error);
}
533
534bool_t
535rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
536{
537	COMPOUND4args *argsp;
538	COMPOUND4res res_buf, *resp;
539
540	if (req->rq_vers != 4)
541		return (FALSE);
542
543	argsp = (COMPOUND4args *)args;
544
545	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
546		return (FALSE);
547
548	resp = &res_buf;
549
550	/*
551	 * Form a reply tag by copying over the reqeuest tag.
552	 */
553	resp->tag.utf8string_val =
554	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
555	resp->tag.utf8string_len = argsp->tag.utf8string_len;
556	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
557	    resp->tag.utf8string_len);
558	resp->array_len = 0;
559	resp->array = NULL;
560	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
561	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
562		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
563		    SVCXPRT *, xprt, char *, resp);
564	}
565	rfs4_compound_free(resp);
566	return (TRUE);
567}
568