nfs4_dispatch.c revision 48a344074403d73a2e38d76ad47299c16c89e0dc
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/systm.h>
30#include <sys/sdt.h>
31#include <rpc/types.h>
32#include <rpc/auth.h>
33#include <rpc/auth_unix.h>
34#include <rpc/auth_des.h>
35#include <rpc/svc.h>
36#include <rpc/xdr.h>
37#include <nfs/nfs4.h>
38#include <nfs/nfs_dispatch.h>
39#include <nfs/nfs4_drc.h>
40
/*
 * This is the duplicate request cache for NFSv4.
 * Allocated once (see rfs4_init_drc()) and shared by all
 * NFSv4 dispatch threads; NULL until the server is started.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size of the duplicate request cache
 * (maximum number of cached request entries, not bytes).
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 * (Prime bucket count gives a better spread of XIDs.)
 */
uint32_t nfs4_drc_hash = 541;
56
57/*
58 * Initialize a duplicate request cache.
59 */
60rfs4_drc_t *
61rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
62{
63	rfs4_drc_t *drc;
64	uint32_t   bki;
65
66	ASSERT(drc_size);
67	ASSERT(drc_hash_size);
68
69	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
70
71	drc->max_size = drc_size;
72	drc->in_use = 0;
73
74	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
75
76	drc->dr_hash = drc_hash_size;
77
78	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
79
80	for (bki = 0; bki < drc_hash_size; bki++) {
81		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
82		    offsetof(rfs4_dupreq_t, dr_bkt_next));
83	}
84
85	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
86		    offsetof(rfs4_dupreq_t, dr_next));
87
88	return (drc);
89}
90
/*
 * rfs4_fini_drc:
 *
 * Destroy a duplicate request cache.  Frees every cached entry
 * (releasing any saved compound reply and address buffer first),
 * then the lock, the bucket array and the cache structure itself.
 *
 * NOTE(review): entries are freed while still linked on the
 * dr_cache/bucket lists and list_destroy() is never called; this
 * is only safe because the whole cache dies here and no other
 * thread may touch it — callers must guarantee exclusive access.
 */
void
rfs4_fini_drc(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp, *drp_next;

	ASSERT(drc);

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		/* only REPLAY entries still hold decoded compound results */
		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		/* fetch the successor before freeing the current entry */
		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets,
		sizeof (list_t)*drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}
120
121/*
122 * rfs4_dr_chstate:
123 *
124 * Change the state of a rfs4_dupreq. If it's not in transition
125 * to the FREE state, update the time used and return. If we
126 * are moving to the FREE state then we need to clean up the
127 * compound results and move the entry to the end of the list.
128 */
129void
130rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
131{
132	rfs4_drc_t *drc;
133
134	ASSERT(drp);
135	ASSERT(drp->drc);
136	ASSERT(drp->dr_bkt);
137	ASSERT(MUTEX_HELD(&drp->drc->lock));
138
139	drp->dr_state = new_state;
140
141	if (new_state != NFS4_DUP_FREE) {
142		gethrestime(&drp->dr_time_used);
143		return;
144	}
145
146	drc = drp->drc;
147
148	/*
149	 * Remove entry from the bucket and
150	 * dr_cache list, free compound results.
151	 */
152	list_remove(drp->dr_bkt, drp);
153	list_remove(&(drc->dr_cache), drp);
154	rfs4_compound_free(&(drp->dr_res));
155}
156
157/*
158 * rfs4_alloc_dr:
159 *
160 * Malloc a new one if we have not reached our maximum cache
161 * limit, otherwise pick an entry off the tail -- Use if it
162 * is marked as NFS4_DUP_FREE, or is an entry in the
163 * NFS4_DUP_REPLAY state.
164 */
165rfs4_dupreq_t *
166rfs4_alloc_dr(rfs4_drc_t *drc)
167{
168	rfs4_dupreq_t *drp_tail, *drp = NULL;
169
170	ASSERT(drc);
171	ASSERT(MUTEX_HELD(&drc->lock));
172
173	/*
174	 * Have we hit the cache limit yet ?
175	 */
176	if (drc->in_use < drc->max_size) {
177		/*
178		 * nope, so let's malloc a new one
179		 */
180		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
181		drp->drc = drc;
182		drc->in_use++;
183		gethrestime(&drp->dr_time_created);
184		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
185		return (drp);
186	}
187
188	/*
189	 * Cache is all allocated now traverse the list
190	 * backwards to find one we can reuse.
191	 */
192	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
193	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
194
195		switch (drp_tail->dr_state) {
196
197		case NFS4_DUP_FREE:
198			list_remove(&(drc->dr_cache), drp_tail);
199			DTRACE_PROBE1(nfss__i__drc_freeclaim,
200					rfs4_dupreq_t *, drp_tail);
201			return (drp_tail);
202			/* NOTREACHED */
203
204		case NFS4_DUP_REPLAY:
205			/* grab it. */
206			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
207			DTRACE_PROBE1(nfss__i__drc_replayclaim,
208					rfs4_dupreq_t *, drp_tail);
209			return (drp_tail);
210			/* NOTREACHED */
211		}
212	}
213	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
214	return (NULL);
215}
216
/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match stamp the
 * time_used and return. If the entry does not match it could be
 * ready to be freed. Once we have searched the bucket we call
 * rfs4_alloc_dr() to allocate a new entry, or reuse one that is
 * available.
 *
 * Returns one of:
 *	NFS4_DUP_ERROR	 - cache exhausted or address alloc failed
 *			   (*dup left NULL)
 *	NFS4_DUP_PENDING - a matching request is still in flight
 *	NFS4_DUP_REPLAY	 - cached reply found; *dup points at it
 *	NFS4_DUP_NEW	 - fresh entry inserted; *dup points at it
 *
 * Acquires and releases drc->lock internally; never returns with
 * the lock held.
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{

	uint32_t	the_xid;
	list_t		*dr_bkt;
	rfs4_dupreq_t	*drp;
	int		bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = (list_t *)
		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
			int, bktdex,
			uint32_t, the_xid,
			list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
		drp = list_next(dr_bkt, drp)) {

		/* match on XID plus the client's transport address */
		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match so REPLAY the Reply
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				/* INUSE guards against reaping while we send */
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
					rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}

		/*
		 * Not a match, but maybe this entry is okay
		 * to be reused.
		 */
		if (drp->dr_state == NFS4_DUP_REPLAY) {
			/* FREE entries live only on dr_cache, at the tail */
			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
			list_insert_tail(&(drp->drc->dr_cache), drp);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. Upper function
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue. NFSD
	 * threads tied up with native File System, or the cache size
	 * is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW and clear the time used field.
	 * (drp is on no list here, so it is safe to touch unlocked.)
	 */
	drp->dr_state = NFS4_DUP_NEW;
	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;

	/*
	 * If needed, resize the address buffer
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the malloc fails, mark the entry
			 * as free and put on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}


	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
		(caddr_t)drp->dr_addr.buf,
		drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the drc lists..
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}
364
/*
 *
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4;
 *
 * Passed into this function are:-
 *
 * 	disp	A pointer to our dispatch table entry
 * 	req	The request to process
 * 	xprt	The server transport handle
 * 	ap	A pointer to the arguments
 *
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat 	reflects the state of the duplicate request that
 * 		has been inserted into or retrieved from the cache
 *
 * drp		is the duplicate request entry
 *
 * Returns 0 when a reply was successfully sent, non-zero when
 * the caller must treat the request as failed/dropped.
 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
		SVCXPRT *xprt, char *ap)
{

	COMPOUND4res res_buf, *rbp;
	COMPOUND4args *cap;

	cred_t 	*cr = NULL;
	int	error = 0;
	int 	dis_flags = 0;
	int 	dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			return (1);
		}
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its IDEMPOTENCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			/* DRC exhausted; force the client to retransmit */
			svcerr_systemerr(xprt);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			/* T_DONTPEND: fail rather than block on memory */
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr);

			curthread->t_flag &= ~T_DONTPEND;

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr);

		curthread->t_flag &= ~T_DONTPEND;
		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
			struct svc_req *, xprt,
			char *, rbp);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache; (re)mark it as
	 * available for replay
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
	 * we do that so that we know not to prematurely reap it whilst
	 * we resend it to the client.
	 *
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		/* idempotent reply never entered the cache; free it now */
		rfs4_compound_free(rbp);
	}

	return (error);
}
531