/* nfs4_dispatch.c revision 3f720b33ddd72eec6da368eaaa751ed3acbca723 */
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23/*
24 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25 * Use is subject to license terms.
26 */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30#include <sys/systm.h>
31#include <sys/sdt.h>
32#include <rpc/types.h>
33#include <rpc/auth.h>
34#include <rpc/auth_unix.h>
35#include <rpc/auth_des.h>
36#include <rpc/svc.h>
37#include <rpc/xdr.h>
38#include <nfs/nfs4.h>
39#include <nfs/nfs_dispatch.h>
40#include <nfs/nfs4_drc.h>
41
/*
 * The duplicate request cache (DRC) for NFSv4.  Created lazily
 * elsewhere (starts out NULL) via rfs4_init_drc().
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * How long (in seconds) an entry may remain in the cache once its
 * reply has been sent to the client and it has not been used to
 * service a replay.  Entries older than this are eligible for reuse.
 */
unsigned nfs4_drc_lifetime = 1;

/*
 * The default maximum number of entries in the duplicate request
 * cache (see drc->max_size / drc->in_use accounting).
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;
65/*
66 * Initialize a duplicate request cache.
67 */
68rfs4_drc_t *
69rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size, unsigned ttl)
70{
71	rfs4_drc_t *drc;
72	uint32_t   bki;
73
74	ASSERT(drc_size);
75	ASSERT(drc_hash_size);
76
77	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
78
79	drc->max_size = drc_size;
80	drc->in_use = 0;
81	drc->drc_ttl = ttl;
82
83	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
84
85	drc->dr_hash = drc_hash_size;
86
87	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
88
89	for (bki = 0; bki < drc_hash_size; bki++) {
90		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
91		    offsetof(rfs4_dupreq_t, dr_bkt_next));
92	}
93
94	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
95		    offsetof(rfs4_dupreq_t, dr_next));
96
97	return (drc);
98}
99
100/*
101 * Destroy a duplicate request cache.
102 */
103void
104rfs4_fini_drc(rfs4_drc_t *drc)
105{
106	rfs4_dupreq_t *drp, *drp_next;
107
108	ASSERT(drc);
109
110	/* iterate over the dr_cache and free the enties */
111	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
112
113		if (drp->dr_state == NFS4_DUP_REPLAY)
114			rfs4_compound_free(&(drp->dr_res));
115
116		if (drp->dr_addr.buf != NULL)
117			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
118
119		drp_next = list_next(&(drc->dr_cache), drp);
120
121		kmem_free(drp, sizeof (rfs4_dupreq_t));
122	}
123
124	mutex_destroy(&drc->lock);
125	kmem_free(drc->dr_buckets,
126		sizeof (list_t)*drc->dr_hash);
127	kmem_free(drc, sizeof (rfs4_drc_t));
128}
129
130/*
131 * rfs4_dr_chstate:
132 *
133 * Change the state of a rfs4_dupreq. If it's not in transition
134 * to the FREE state, update the time used and return. If we
135 * are moving to the FREE state then we need to clean up the
136 * compound results and move the entry to the end of the list.
137 */
138void
139rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
140{
141	rfs4_drc_t *drc;
142
143	ASSERT(drp);
144	ASSERT(drp->drc);
145	ASSERT(drp->dr_bkt);
146	ASSERT(MUTEX_HELD(&drp->drc->lock));
147
148	drp->dr_state = new_state;
149
150	if (new_state != NFS4_DUP_FREE) {
151		gethrestime(&drp->dr_time_used);
152		return;
153	}
154
155	drc = drp->drc;
156
157	/*
158	 * Remove entry from the bucket and
159	 * dr_cache list, free compound results.
160	 */
161	list_remove(drp->dr_bkt, drp);
162	list_remove(&(drc->dr_cache), drp);
163	rfs4_compound_free(&(drp->dr_res));
164}
165
166/*
167 * rfs4_alloc_dr:
168 *
169 * Pick an entry off the tail -- Use if it is
170 * marked NFS4_DUP_FREE, or is an entry in the
171 * NFS4_DUP_REPLAY state that has timed-out...
172 * Otherwise malloc a new one if we have not reached
173 * our maximum cache limit.
174 *
175 * The list should be in time order, so no need
176 * to traverse backwards looking for a timed out
177 * entry, NFS4_DUP_FREE's are place on the tail.
178 */
179rfs4_dupreq_t *
180rfs4_alloc_dr(rfs4_drc_t *drc)
181{
182	rfs4_dupreq_t *drp_tail, *drp = NULL;
183
184	ASSERT(drc);
185	ASSERT(MUTEX_HELD(&drc->lock));
186
187	if ((drp_tail = list_tail(&drc->dr_cache)) != NULL) {
188
189		switch (drp_tail->dr_state) {
190
191		case NFS4_DUP_FREE:
192			list_remove(&(drc->dr_cache), drp_tail);
193			DTRACE_PROBE1(nfss__i__drc_freeclaim,
194					rfs4_dupreq_t *, drp_tail);
195			return (drp_tail);
196			/* NOTREACHED */
197
198		case NFS4_DUP_REPLAY:
199			if (gethrestime_sec() >
200			    drp_tail->dr_time_used.tv_sec+drc->drc_ttl) {
201				/* this entry has timedout so grab it. */
202				rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
203				DTRACE_PROBE1(nfss__i__drc_ttlclaim,
204					rfs4_dupreq_t *, drp_tail);
205				return (drp_tail);
206			}
207			break;
208		}
209	}
210
211	/*
212	 * Didn't find something to recycle have
213	 * we hit the cache limit ?
214	 */
215	if (drc->in_use >= drc->max_size) {
216		DTRACE_PROBE1(nfss__i__drc_full,
217			rfs4_drc_t *, drc);
218		return (NULL);
219	}
220
221
222	/* nope, so let's malloc a new one */
223	drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
224	drp->drc = drc;
225	drc->in_use++;
226	gethrestime(&drp->dr_time_created);
227	DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
228
229	return (drp);
230}
231
232/*
233 * rfs4_find_dr:
234 *
235 * Search for an entry in the duplicate request cache by
236 * calculating the hash index based on the XID, and examining
237 * the entries in the hash bucket. If we find a match stamp the
238 * time_used and return. If the entry does not match it could be
239 * ready to be freed. Once we have searched the bucket and we
240 * have not exhausted the maximum limit for the cache we will
241 * allocate a new entry.
242 */
243int
244rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
245{
246
247	uint32_t	the_xid;
248	list_t		*dr_bkt;
249	rfs4_dupreq_t	*drp;
250	int		bktdex;
251
252	/*
253	 * Get the XID, calculate the bucket and search to
254	 * see if we need to replay from the cache.
255	 */
256	the_xid = req->rq_xprt->xp_xid;
257	bktdex = the_xid % drc->dr_hash;
258
259	dr_bkt = (list_t *)
260		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
261
262	DTRACE_PROBE3(nfss__i__drc_bktdex,
263			int, bktdex,
264			uint32_t, the_xid,
265			list_t *, dr_bkt);
266
267	*dup = NULL;
268
269	mutex_enter(&drc->lock);
270	/*
271	 * Search the bucket for a matching xid and address.
272	 */
273	for (drp = list_head(dr_bkt); drp != NULL;
274		drp = list_next(dr_bkt, drp)) {
275
276		if (drp->dr_xid == the_xid &&
277		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
278		    bcmp((caddr_t)drp->dr_addr.buf,
279		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
280		    drp->dr_addr.len) == 0) {
281
282			/*
283			 * Found a match so REPLAY the Reply
284			 */
285			if (drp->dr_state == NFS4_DUP_REPLAY) {
286				gethrestime(&drp->dr_time_used);
287				mutex_exit(&drc->lock);
288				*dup = drp;
289				DTRACE_PROBE1(nfss__i__drc_replay,
290					rfs4_dupreq_t *, drp);
291				return (NFS4_DUP_REPLAY);
292			}
293
294			/*
295			 * This entry must be in transition, so return
296			 * the 'pending' status.
297			 */
298			mutex_exit(&drc->lock);
299			return (NFS4_DUP_PENDING);
300		}
301
302		/*
303		 * Not a match, but maybe this entry is ready
304		 * to be reused.
305		 */
306		if (drp->dr_state == NFS4_DUP_REPLAY &&
307			(gethrestime_sec() >
308			drp->dr_time_used.tv_sec+drc->drc_ttl)) {
309			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
310			list_insert_tail(&(drp->drc->dr_cache), drp);
311		}
312	}
313
314	drp = rfs4_alloc_dr(drc);
315	mutex_exit(&drc->lock);
316
317	if (drp == NULL) {
318		return (NFS4_DUP_ERROR);
319	}
320
321	/*
322	 * Place at the head of the list, init the state
323	 * to NEW and clear the time used field.
324	 */
325
326	drp->dr_state = NFS4_DUP_NEW;
327	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
328
329	/*
330	 * If needed, resize the address buffer
331	 */
332	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
333		if (drp->dr_addr.buf != NULL)
334			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
335		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
336		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
337		if (drp->dr_addr.buf == NULL) {
338			/*
339			 * If the malloc fails, mark the entry
340			 * as free and put on the tail.
341			 */
342			drp->dr_addr.maxlen = 0;
343			drp->dr_state = NFS4_DUP_FREE;
344			mutex_enter(&drc->lock);
345			list_insert_tail(&(drc->dr_cache), drp);
346			mutex_exit(&drc->lock);
347			return (NFS4_DUP_ERROR);
348		}
349	}
350
351
352	/*
353	 * Copy the address.
354	 */
355	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
356
357	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
358		(caddr_t)drp->dr_addr.buf,
359		drp->dr_addr.len);
360
361	drp->dr_xid = the_xid;
362	drp->dr_bkt = dr_bkt;
363
364	/*
365	 * Insert at the head of the bucket and
366	 * the drc lists..
367	 */
368	mutex_enter(&drc->lock);
369	list_insert_head(&drc->dr_cache, drp);
370	list_insert_head(dr_bkt, drp);
371	mutex_exit(&drc->lock);
372
373	*dup = drp;
374
375	return (NFS4_DUP_NEW);
376}
377
/*
 *
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4;
 *
 * Passed into this function are:-
 *
 * 	disp	A pointer to our dispatch table entry
 * 	req	The request to process
 * 	xprt	The server transport handle
 * 	ap	A pointer to the arguments (a COMPOUND4args for
 *		everything except the NULL proc)
 *
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat 	reflects the state of the duplicate request that
 * 		has been inserted into or retrieved from the cache
 *
 * drp		is the duplicate request entry
 *
 * Returns 0 on success, non-zero when the request could not be
 * serviced (send failure, DRC error, or a pending duplicate).
 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
		SVCXPRT *xprt, char *ap)
{

	COMPOUND4res res_buf, *rbp;
	COMPOUND4args *cap;

	/* NOTE(review): cr stays NULL here; rfs4_compound() is handed a
	 * NULL cred — confirm it derives credentials from req itself. */
	cred_t 	*cr = NULL;
	int	error = 0;
	int 	dis_flags = 0;
	int 	dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			return (1);
		}
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record it's IDEMPOTENTCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			/* No cache slot available: fail the call. */
			svcerr_systemerr(xprt);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			/*
			 * T_DONTPEND/T_WOULDBLOCK let us detect a
			 * would-have-blocked allocation during the
			 * compound instead of sleeping; if that
			 * happens the cache entry is abandoned below.
			 */
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr);

			curthread->t_flag &= ~T_DONTPEND;

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr);

		curthread->t_flag &= ~T_DONTPEND;
		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	/* NOTE(review): probe tags xprt as struct svc_req * — it is an
	 * SVCXPRT *; harmless to tracing but the type label is wrong. */
	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
			struct svc_req *, xprt,
			char *, rbp);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * mark it as available for replay
	 */
	if (dr_stat == NFS4_DUP_NEW) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		/* Reply was never cached, so free the results here. */
		rfs4_compound_free(rbp);
	}

	return (error);
}
532