nfs4_dispatch.c revision d9ad96c1d1e6612641c338d86699f5700fca7217
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23/*
24 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25 * Use is subject to license terms.
26 */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30
31#include <rpc/types.h>
32#include <rpc/auth.h>
33#include <rpc/auth_unix.h>
34#include <rpc/auth_des.h>
35#include <rpc/svc.h>
36#include <rpc/xdr.h>
37#include <nfs/nfs4.h>
38#include <nfs/nfs_dispatch.h>
39#include <nfs/nfs4_drc.h>
40
/*
 * This is the duplicate request cache for NFSv4.
 * It starts out NULL and is not set up anywhere in this file;
 * rfs4_dispatch() hands it to rfs4_find_dr() for every COMPOUND
 * that is not flagged RPC_IDEMPOTENT.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * How long the entry can remain in the cache
 * once it has been sent to the client and not
 * used in a reply (in seconds).
 * NOTE(review): not referenced in this file; presumably supplied as
 * the ttl argument of rfs4_init_drc() -- confirm against the caller.
 */
unsigned nfs4_drc_lifetime = 1;

/*
 * The default size of the duplicate request cache.
 * NOTE(review): presumably the drc_size (maximum entry count) given
 * to rfs4_init_drc() -- confirm against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 * NOTE(review): presumably the drc_hash_size given to
 * rfs4_init_drc() -- confirm against the caller.
 */
uint32_t nfs4_drc_hash = 541;
63
64/*
65 * Initialize a duplicate request cache.
66 */
67rfs4_drc_t *
68rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size, unsigned ttl)
69{
70	rfs4_drc_t *drc;
71	uint32_t   bki;
72
73	ASSERT(drc_size);
74	ASSERT(drc_hash_size);
75
76	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
77
78	drc->max_size = drc_size;
79	drc->in_use = 0;
80	drc->drc_ttl = ttl;
81
82	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
83
84	drc->dr_hash = drc_hash_size;
85
86	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
87
88	for (bki = 0; bki < drc_hash_size; bki++) {
89		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
90		    offsetof(rfs4_dupreq_t, dr_bkt_next));
91	}
92
93	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
94		    offsetof(rfs4_dupreq_t, dr_next));
95
96	return (drc);
97}
98
99/*
100 * Destroy a duplicate request cache.
101 */
102void
103rfs4_fini_drc(rfs4_drc_t *drc)
104{
105	rfs4_dupreq_t *drp, *drp_next;
106
107	ASSERT(drc);
108
109	/* iterate over the dr_cache and free the enties */
110	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
111
112		if (drp->dr_state == NFS4_DUP_REPLAY)
113			rfs4_compound_free(&(drp->dr_res));
114
115		if (drp->dr_addr.buf != NULL)
116			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
117
118		drp_next = list_next(&(drc->dr_cache), drp);
119
120		kmem_free(drp, sizeof (rfs4_dupreq_t));
121	}
122
123	mutex_destroy(&drc->lock);
124	kmem_free(drc->dr_buckets,
125		sizeof (list_t)*drc->dr_hash);
126	kmem_free(drc, sizeof (rfs4_drc_t));
127}
128
129/*
130 * rfs4_dr_chstate:
131 *
132 * Change the state of a rfs4_dupreq. If it's not in transition
133 * to the FREE state, update the time used and return. If we
134 * are moving to the FREE state then we need to clean up the
135 * compound results and move the entry to the end of the list.
136 */
137void
138rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
139{
140	rfs4_drc_t *drc;
141
142	ASSERT(drp);
143	ASSERT(drp->drc);
144	ASSERT(drp->dr_bkt);
145	ASSERT(MUTEX_HELD(&drp->drc->lock));
146
147	drp->dr_state = new_state;
148
149	if (new_state != NFS4_DUP_FREE) {
150		gethrestime(&drp->dr_time_used);
151		return;
152	}
153
154	drc = drp->drc;
155
156	/*
157	 * Remove entry from the bucket and
158	 * dr_cache list, free compound results.
159	 */
160	list_remove(drp->dr_bkt, drp);
161	list_remove(&(drc->dr_cache), drp);
162	rfs4_compound_free(&(drp->dr_res));
163}
164
165/*
166 * rfs4_alloc_dr:
167 *
168 * Pick an entry off the tail -- Use if it is
169 * marked NFS4_DUP_FREE, or is an entry in the
170 * NFS4_DUP_REPLAY state that has timed-out...
171 * Otherwise malloc a new one if we have not reached
172 * our maximum cache limit.
173 *
174 * The list should be in time order, so no need
175 * to traverse backwards looking for a timed out
176 * entry, NFS4_DUP_FREE's are place on the tail.
177 */
178rfs4_dupreq_t *
179rfs4_alloc_dr(rfs4_drc_t *drc)
180{
181	rfs4_dupreq_t *drp_tail, *drp = NULL;
182
183	ASSERT(drc);
184	ASSERT(MUTEX_HELD(&drc->lock));
185
186	if ((drp_tail = list_tail(&drc->dr_cache)) != NULL) {
187
188		switch (drp_tail->dr_state) {
189
190		case NFS4_DUP_FREE:
191			list_remove(&(drc->dr_cache), drp_tail);
192			DTRACE_PROBE1(nfss__i__drc_freeclaim,
193					rfs4_dupreq_t *, drp_tail);
194			return (drp_tail);
195			/* NOTREACHED */
196
197		case NFS4_DUP_REPLAY:
198			if (gethrestime_sec() >
199			    drp_tail->dr_time_used.tv_sec+drc->drc_ttl) {
200				/* this entry has timedout so grab it. */
201				rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
202				DTRACE_PROBE1(nfss__i__drc_ttlclaim,
203					rfs4_dupreq_t *, drp_tail);
204				return (drp_tail);
205			}
206			break;
207		}
208	}
209
210	/*
211	 * Didn't find something to recycle have
212	 * we hit the cache limit ?
213	 */
214	if (drc->in_use >= drc->max_size) {
215		DTRACE_PROBE1(nfss__i__drc_full,
216			rfs4_drc_t *, drc);
217		return (NULL);
218	}
219
220
221	/* nope, so let's malloc a new one */
222	drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
223	drp->drc = drc;
224	drc->in_use++;
225	gethrestime(&drp->dr_time_created);
226	DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
227
228	return (drp);
229}
230
231/*
232 * rfs4_find_dr:
233 *
234 * Search for an entry in the duplicate request cache by
235 * calculating the hash index based on the XID, and examining
236 * the entries in the hash bucket. If we find a match stamp the
237 * time_used and return. If the entry does not match it could be
238 * ready to be freed. Once we have searched the bucket and we
239 * have not exhausted the maximum limit for the cache we will
240 * allocate a new entry.
241 */
242int
243rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
244{
245
246	uint32_t	the_xid;
247	list_t		*dr_bkt;
248	rfs4_dupreq_t	*drp;
249	int		bktdex;
250
251	/*
252	 * Get the XID, calculate the bucket and search to
253	 * see if we need to replay from the cache.
254	 */
255	the_xid = req->rq_xprt->xp_xid;
256	bktdex = the_xid % drc->dr_hash;
257
258	dr_bkt = (list_t *)
259		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
260
261	DTRACE_PROBE3(nfss__i__drc_bktdex,
262			int, bktdex,
263			uint32_t, the_xid,
264			list_t *, dr_bkt);
265
266	*dup = NULL;
267
268	mutex_enter(&drc->lock);
269	/*
270	 * Search the bucket for a matching xid and address.
271	 */
272	for (drp = list_head(dr_bkt); drp != NULL;
273		drp = list_next(dr_bkt, drp)) {
274
275		if (drp->dr_xid == the_xid &&
276		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
277		    bcmp((caddr_t)drp->dr_addr.buf,
278		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
279		    drp->dr_addr.len) == 0) {
280
281			/*
282			 * Found a match so REPLAY the Reply
283			 */
284			if (drp->dr_state == NFS4_DUP_REPLAY) {
285				gethrestime(&drp->dr_time_used);
286				mutex_exit(&drc->lock);
287				*dup = drp;
288				DTRACE_PROBE1(nfss__i__drc_replay,
289					rfs4_dupreq_t *, drp);
290				return (NFS4_DUP_REPLAY);
291			}
292
293			/*
294			 * This entry must be in transition, so return
295			 * the 'pending' status.
296			 */
297			mutex_exit(&drc->lock);
298			return (NFS4_DUP_PENDING);
299		}
300
301		/*
302		 * Not a match, but maybe this entry is ready
303		 * to be reused.
304		 */
305		if (drp->dr_state == NFS4_DUP_REPLAY &&
306			(gethrestime_sec() >
307			drp->dr_time_used.tv_sec+drc->drc_ttl)) {
308			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
309			list_insert_tail(&(drp->drc->dr_cache), drp);
310		}
311	}
312
313	drp = rfs4_alloc_dr(drc);
314	mutex_exit(&drc->lock);
315
316	if (drp == NULL) {
317		return (NFS4_DUP_ERROR);
318	}
319
320	/*
321	 * Place at the head of the list, init the state
322	 * to NEW and clear the time used field.
323	 */
324
325	drp->dr_state = NFS4_DUP_NEW;
326	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
327
328	/*
329	 * If needed, resize the address buffer
330	 */
331	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
332		if (drp->dr_addr.buf != NULL)
333			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
334		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
335		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
336		if (drp->dr_addr.buf == NULL) {
337			/*
338			 * If the malloc fails, mark the entry
339			 * as free and put on the tail.
340			 */
341			drp->dr_addr.maxlen = 0;
342			drp->dr_state = NFS4_DUP_FREE;
343			mutex_enter(&drc->lock);
344			list_insert_tail(&(drc->dr_cache), drp);
345			mutex_exit(&drc->lock);
346			return (NFS4_DUP_ERROR);
347		}
348	}
349
350
351	/*
352	 * Copy the address.
353	 */
354	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
355
356	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
357		(caddr_t)drp->dr_addr.buf,
358		drp->dr_addr.len);
359
360	drp->dr_xid = the_xid;
361	drp->dr_bkt = dr_bkt;
362
363	/*
364	 * Insert at the head of the bucket and
365	 * the drc lists..
366	 */
367	mutex_enter(&drc->lock);
368	list_insert_head(&drc->dr_cache, drp);
369	list_insert_head(dr_bkt, drp);
370	mutex_exit(&drc->lock);
371
372	*dup = drp;
373
374	return (NFS4_DUP_NEW);
375}
376
377/*
378 *
379 * This function handles the duplicate request cache,
380 * NULL_PROC and COMPOUND procedure calls for NFSv4;
381 *
382 * Passed into this function are:-
383 *
384 * 	disp	A pointer to our dispatch table entry
385 * 	req	The request to process
386 * 	xprt	The server transport handle
387 * 	ap	A pointer to the arguments
388 *
389 *
390 * When appropriate this function is responsible for inserting
391 * the reply into the duplicate cache or replaying an existing
392 * cached reply.
393 *
394 * dr_stat 	reflects the state of the duplicate request that
395 * 		has been inserted into or retrieved from the cache
396 *
397 * drp		is the duplicate request entry
398 *
399 */
400int
401rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
402		SVCXPRT *xprt, char *ap)
403{
404
405	COMPOUND4res res_buf, *rbp;
406	COMPOUND4args *cap;
407
408	cred_t 	*cr = NULL;
409	int	error = 0;
410	int 	dis_flags = 0;
411	int 	dr_stat = NFS4_NOT_DUP;
412	rfs4_dupreq_t *drp = NULL;
413
414	ASSERT(disp);
415
416	/*
417	 * Short circuit the RPC_NULL proc.
418	 */
419	if (disp->dis_proc == rpc_null) {
420		if (!svc_sendreply(xprt, xdr_void, NULL)) {
421			return (1);
422		}
423		return (0);
424	}
425
426	/* Only NFSv4 Compounds from this point onward */
427
428	rbp = &res_buf;
429	cap = (COMPOUND4args *)ap;
430
431	/*
432	 * Figure out the disposition of the whole COMPOUND
433	 * and record it's IDEMPOTENTCY.
434	 */
435	rfs4_compound_flagproc(cap, &dis_flags);
436
437	/*
438	 * If NON-IDEMPOTENT then we need to figure out if this
439	 * request can be replied from the duplicate cache.
440	 *
441	 * If this is a new request then we need to insert the
442	 * reply into the duplicate cache.
443	 */
444	if (!(dis_flags & RPC_IDEMPOTENT)) {
445		/* look for a replay from the cache or allocate */
446		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
447
448		switch (dr_stat) {
449
450		case NFS4_DUP_ERROR:
451			svcerr_systemerr(xprt);
452			return (1);
453			/* NOTREACHED */
454
455		case NFS4_DUP_PENDING:
456			/*
457			 * reply has previously been inserted into the
458			 * duplicate cache, however the reply has
459			 * not yet been sent via svc_sendreply()
460			 */
461			return (1);
462			/* NOTREACHED */
463
464		case NFS4_DUP_NEW:
465			curthread->t_flag |= T_DONTPEND;
466			/* NON-IDEMPOTENT proc call */
467			rfs4_compound(cap, rbp, NULL, req, cr);
468
469			curthread->t_flag &= ~T_DONTPEND;
470			if (curthread->t_flag & T_WOULDBLOCK) {
471				curthread->t_flag &= ~T_WOULDBLOCK;
472				/*
473				 * mark this entry as FREE and plop
474				 * on the end of the cache list
475				 */
476				mutex_enter(&drp->drc->lock);
477				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
478				list_insert_tail(&(drp->drc->dr_cache), drp);
479				mutex_exit(&drp->drc->lock);
480				return (1);
481			}
482			drp->dr_res = res_buf;
483			break;
484
485		case NFS4_DUP_REPLAY:
486			/* replay from the cache */
487			rbp = &(drp->dr_res);
488			break;
489		}
490	} else {
491		curthread->t_flag |= T_DONTPEND;
492		/* IDEMPOTENT proc call */
493		rfs4_compound(cap, rbp, NULL, req, cr);
494
495		curthread->t_flag &= ~T_DONTPEND;
496		if (curthread->t_flag & T_WOULDBLOCK) {
497			curthread->t_flag &= ~T_WOULDBLOCK;
498			return (1);
499		}
500	}
501
502	/*
503	 * Send out the replayed reply or the 'real' one.
504	 */
505	if (!svc_sendreply(xprt,  xdr_COMPOUND4res, (char *)rbp)) {
506		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
507			struct svc_req *, xprt,
508			char *, rbp);
509		error++;
510	}
511
512	/*
513	 * If this reply was just inserted into the duplicate cache
514	 * mark it as available for replay
515	 */
516	if (dr_stat == NFS4_DUP_NEW) {
517		mutex_enter(&drp->drc->lock);
518		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
519		mutex_exit(&drp->drc->lock);
520	} else if (dr_stat == NFS4_NOT_DUP) {
521		rfs4_compound_free(rbp);
522	}
523
524	return (error);
525}
526