xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_dispatch.c (revision d9ad96c1d1e6612641c338d86699f5700fca7217)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 
31 #include <rpc/types.h>
32 #include <rpc/auth.h>
33 #include <rpc/auth_unix.h>
34 #include <rpc/auth_des.h>
35 #include <rpc/svc.h>
36 #include <rpc/xdr.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfs_dispatch.h>
39 #include <nfs/nfs4_drc.h>
40 
41 /*
42  * This is the duplicate request cache for NFSv4
43  */
44 rfs4_drc_t *nfs4_drc = NULL;
45 
46 /*
47  * How long the entry can remain in the cache
48  * once it has been sent to the client and not
49  * used in a reply (in seconds)
50  */
51 unsigned nfs4_drc_lifetime = 1;
52 
53 /*
54  * The default size of the duplicate request cache
55  */
56 uint32_t nfs4_drc_max = 8 * 1024;
57 
58 /*
59  * The number of buckets we'd like to hash the
60  * replies into.. do not change this on the fly.
61  */
62 uint32_t nfs4_drc_hash = 541;
63 
64 /*
65  * Initialize a duplicate request cache.
66  */
67 rfs4_drc_t *
68 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size, unsigned ttl)
69 {
70 	rfs4_drc_t *drc;
71 	uint32_t   bki;
72 
73 	ASSERT(drc_size);
74 	ASSERT(drc_hash_size);
75 
76 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
77 
78 	drc->max_size = drc_size;
79 	drc->in_use = 0;
80 	drc->drc_ttl = ttl;
81 
82 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
83 
84 	drc->dr_hash = drc_hash_size;
85 
86 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
87 
88 	for (bki = 0; bki < drc_hash_size; bki++) {
89 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
90 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
91 	}
92 
93 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
94 		    offsetof(rfs4_dupreq_t, dr_next));
95 
96 	return (drc);
97 }
98 
99 /*
100  * Destroy a duplicate request cache.
101  */
102 void
103 rfs4_fini_drc(rfs4_drc_t *drc)
104 {
105 	rfs4_dupreq_t *drp, *drp_next;
106 
107 	ASSERT(drc);
108 
109 	/* iterate over the dr_cache and free the enties */
110 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
111 
112 		if (drp->dr_state == NFS4_DUP_REPLAY)
113 			rfs4_compound_free(&(drp->dr_res));
114 
115 		if (drp->dr_addr.buf != NULL)
116 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
117 
118 		drp_next = list_next(&(drc->dr_cache), drp);
119 
120 		kmem_free(drp, sizeof (rfs4_dupreq_t));
121 	}
122 
123 	mutex_destroy(&drc->lock);
124 	kmem_free(drc->dr_buckets,
125 		sizeof (list_t)*drc->dr_hash);
126 	kmem_free(drc, sizeof (rfs4_drc_t));
127 }
128 
129 /*
130  * rfs4_dr_chstate:
131  *
132  * Change the state of a rfs4_dupreq. If it's not in transition
133  * to the FREE state, update the time used and return. If we
134  * are moving to the FREE state then we need to clean up the
135  * compound results and move the entry to the end of the list.
136  */
137 void
138 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
139 {
140 	rfs4_drc_t *drc;
141 
142 	ASSERT(drp);
143 	ASSERT(drp->drc);
144 	ASSERT(drp->dr_bkt);
145 	ASSERT(MUTEX_HELD(&drp->drc->lock));
146 
147 	drp->dr_state = new_state;
148 
149 	if (new_state != NFS4_DUP_FREE) {
150 		gethrestime(&drp->dr_time_used);
151 		return;
152 	}
153 
154 	drc = drp->drc;
155 
156 	/*
157 	 * Remove entry from the bucket and
158 	 * dr_cache list, free compound results.
159 	 */
160 	list_remove(drp->dr_bkt, drp);
161 	list_remove(&(drc->dr_cache), drp);
162 	rfs4_compound_free(&(drp->dr_res));
163 }
164 
165 /*
166  * rfs4_alloc_dr:
167  *
168  * Pick an entry off the tail -- Use if it is
169  * marked NFS4_DUP_FREE, or is an entry in the
170  * NFS4_DUP_REPLAY state that has timed-out...
171  * Otherwise malloc a new one if we have not reached
172  * our maximum cache limit.
173  *
174  * The list should be in time order, so no need
175  * to traverse backwards looking for a timed out
176  * entry, NFS4_DUP_FREE's are place on the tail.
177  */
178 rfs4_dupreq_t *
179 rfs4_alloc_dr(rfs4_drc_t *drc)
180 {
181 	rfs4_dupreq_t *drp_tail, *drp = NULL;
182 
183 	ASSERT(drc);
184 	ASSERT(MUTEX_HELD(&drc->lock));
185 
186 	if ((drp_tail = list_tail(&drc->dr_cache)) != NULL) {
187 
188 		switch (drp_tail->dr_state) {
189 
190 		case NFS4_DUP_FREE:
191 			list_remove(&(drc->dr_cache), drp_tail);
192 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
193 					rfs4_dupreq_t *, drp_tail);
194 			return (drp_tail);
195 			/* NOTREACHED */
196 
197 		case NFS4_DUP_REPLAY:
198 			if (gethrestime_sec() >
199 			    drp_tail->dr_time_used.tv_sec+drc->drc_ttl) {
200 				/* this entry has timedout so grab it. */
201 				rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
202 				DTRACE_PROBE1(nfss__i__drc_ttlclaim,
203 					rfs4_dupreq_t *, drp_tail);
204 				return (drp_tail);
205 			}
206 			break;
207 		}
208 	}
209 
210 	/*
211 	 * Didn't find something to recycle have
212 	 * we hit the cache limit ?
213 	 */
214 	if (drc->in_use >= drc->max_size) {
215 		DTRACE_PROBE1(nfss__i__drc_full,
216 			rfs4_drc_t *, drc);
217 		return (NULL);
218 	}
219 
220 
221 	/* nope, so let's malloc a new one */
222 	drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
223 	drp->drc = drc;
224 	drc->in_use++;
225 	gethrestime(&drp->dr_time_created);
226 	DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
227 
228 	return (drp);
229 }
230 
231 /*
232  * rfs4_find_dr:
233  *
234  * Search for an entry in the duplicate request cache by
235  * calculating the hash index based on the XID, and examining
236  * the entries in the hash bucket. If we find a match stamp the
237  * time_used and return. If the entry does not match it could be
238  * ready to be freed. Once we have searched the bucket and we
239  * have not exhausted the maximum limit for the cache we will
240  * allocate a new entry.
241  */
242 int
243 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
244 {
245 
246 	uint32_t	the_xid;
247 	list_t		*dr_bkt;
248 	rfs4_dupreq_t	*drp;
249 	int		bktdex;
250 
251 	/*
252 	 * Get the XID, calculate the bucket and search to
253 	 * see if we need to replay from the cache.
254 	 */
255 	the_xid = req->rq_xprt->xp_xid;
256 	bktdex = the_xid % drc->dr_hash;
257 
258 	dr_bkt = (list_t *)
259 		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
260 
261 	DTRACE_PROBE3(nfss__i__drc_bktdex,
262 			int, bktdex,
263 			uint32_t, the_xid,
264 			list_t *, dr_bkt);
265 
266 	*dup = NULL;
267 
268 	mutex_enter(&drc->lock);
269 	/*
270 	 * Search the bucket for a matching xid and address.
271 	 */
272 	for (drp = list_head(dr_bkt); drp != NULL;
273 		drp = list_next(dr_bkt, drp)) {
274 
275 		if (drp->dr_xid == the_xid &&
276 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
277 		    bcmp((caddr_t)drp->dr_addr.buf,
278 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
279 		    drp->dr_addr.len) == 0) {
280 
281 			/*
282 			 * Found a match so REPLAY the Reply
283 			 */
284 			if (drp->dr_state == NFS4_DUP_REPLAY) {
285 				gethrestime(&drp->dr_time_used);
286 				mutex_exit(&drc->lock);
287 				*dup = drp;
288 				DTRACE_PROBE1(nfss__i__drc_replay,
289 					rfs4_dupreq_t *, drp);
290 				return (NFS4_DUP_REPLAY);
291 			}
292 
293 			/*
294 			 * This entry must be in transition, so return
295 			 * the 'pending' status.
296 			 */
297 			mutex_exit(&drc->lock);
298 			return (NFS4_DUP_PENDING);
299 		}
300 
301 		/*
302 		 * Not a match, but maybe this entry is ready
303 		 * to be reused.
304 		 */
305 		if (drp->dr_state == NFS4_DUP_REPLAY &&
306 			(gethrestime_sec() >
307 			drp->dr_time_used.tv_sec+drc->drc_ttl)) {
308 			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
309 			list_insert_tail(&(drp->drc->dr_cache), drp);
310 		}
311 	}
312 
313 	drp = rfs4_alloc_dr(drc);
314 	mutex_exit(&drc->lock);
315 
316 	if (drp == NULL) {
317 		return (NFS4_DUP_ERROR);
318 	}
319 
320 	/*
321 	 * Place at the head of the list, init the state
322 	 * to NEW and clear the time used field.
323 	 */
324 
325 	drp->dr_state = NFS4_DUP_NEW;
326 	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
327 
328 	/*
329 	 * If needed, resize the address buffer
330 	 */
331 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
332 		if (drp->dr_addr.buf != NULL)
333 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
334 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
335 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
336 		if (drp->dr_addr.buf == NULL) {
337 			/*
338 			 * If the malloc fails, mark the entry
339 			 * as free and put on the tail.
340 			 */
341 			drp->dr_addr.maxlen = 0;
342 			drp->dr_state = NFS4_DUP_FREE;
343 			mutex_enter(&drc->lock);
344 			list_insert_tail(&(drc->dr_cache), drp);
345 			mutex_exit(&drc->lock);
346 			return (NFS4_DUP_ERROR);
347 		}
348 	}
349 
350 
351 	/*
352 	 * Copy the address.
353 	 */
354 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
355 
356 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
357 		(caddr_t)drp->dr_addr.buf,
358 		drp->dr_addr.len);
359 
360 	drp->dr_xid = the_xid;
361 	drp->dr_bkt = dr_bkt;
362 
363 	/*
364 	 * Insert at the head of the bucket and
365 	 * the drc lists..
366 	 */
367 	mutex_enter(&drc->lock);
368 	list_insert_head(&drc->dr_cache, drp);
369 	list_insert_head(dr_bkt, drp);
370 	mutex_exit(&drc->lock);
371 
372 	*dup = drp;
373 
374 	return (NFS4_DUP_NEW);
375 }
376 
377 /*
378  *
379  * This function handles the duplicate request cache,
380  * NULL_PROC and COMPOUND procedure calls for NFSv4;
381  *
382  * Passed into this function are:-
383  *
384  * 	disp	A pointer to our dispatch table entry
385  * 	req	The request to process
386  * 	xprt	The server transport handle
387  * 	ap	A pointer to the arguments
388  *
389  *
390  * When appropriate this function is responsible for inserting
391  * the reply into the duplicate cache or replaying an existing
392  * cached reply.
393  *
394  * dr_stat 	reflects the state of the duplicate request that
395  * 		has been inserted into or retrieved from the cache
396  *
397  * drp		is the duplicate request entry
398  *
399  */
400 int
401 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
402 		SVCXPRT *xprt, char *ap)
403 {
404 
405 	COMPOUND4res res_buf, *rbp;
406 	COMPOUND4args *cap;
407 
408 	cred_t 	*cr = NULL;
409 	int	error = 0;
410 	int 	dis_flags = 0;
411 	int 	dr_stat = NFS4_NOT_DUP;
412 	rfs4_dupreq_t *drp = NULL;
413 
414 	ASSERT(disp);
415 
416 	/*
417 	 * Short circuit the RPC_NULL proc.
418 	 */
419 	if (disp->dis_proc == rpc_null) {
420 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
421 			return (1);
422 		}
423 		return (0);
424 	}
425 
426 	/* Only NFSv4 Compounds from this point onward */
427 
428 	rbp = &res_buf;
429 	cap = (COMPOUND4args *)ap;
430 
431 	/*
432 	 * Figure out the disposition of the whole COMPOUND
433 	 * and record it's IDEMPOTENTCY.
434 	 */
435 	rfs4_compound_flagproc(cap, &dis_flags);
436 
437 	/*
438 	 * If NON-IDEMPOTENT then we need to figure out if this
439 	 * request can be replied from the duplicate cache.
440 	 *
441 	 * If this is a new request then we need to insert the
442 	 * reply into the duplicate cache.
443 	 */
444 	if (!(dis_flags & RPC_IDEMPOTENT)) {
445 		/* look for a replay from the cache or allocate */
446 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
447 
448 		switch (dr_stat) {
449 
450 		case NFS4_DUP_ERROR:
451 			svcerr_systemerr(xprt);
452 			return (1);
453 			/* NOTREACHED */
454 
455 		case NFS4_DUP_PENDING:
456 			/*
457 			 * reply has previously been inserted into the
458 			 * duplicate cache, however the reply has
459 			 * not yet been sent via svc_sendreply()
460 			 */
461 			return (1);
462 			/* NOTREACHED */
463 
464 		case NFS4_DUP_NEW:
465 			curthread->t_flag |= T_DONTPEND;
466 			/* NON-IDEMPOTENT proc call */
467 			rfs4_compound(cap, rbp, NULL, req, cr);
468 
469 			curthread->t_flag &= ~T_DONTPEND;
470 			if (curthread->t_flag & T_WOULDBLOCK) {
471 				curthread->t_flag &= ~T_WOULDBLOCK;
472 				/*
473 				 * mark this entry as FREE and plop
474 				 * on the end of the cache list
475 				 */
476 				mutex_enter(&drp->drc->lock);
477 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
478 				list_insert_tail(&(drp->drc->dr_cache), drp);
479 				mutex_exit(&drp->drc->lock);
480 				return (1);
481 			}
482 			drp->dr_res = res_buf;
483 			break;
484 
485 		case NFS4_DUP_REPLAY:
486 			/* replay from the cache */
487 			rbp = &(drp->dr_res);
488 			break;
489 		}
490 	} else {
491 		curthread->t_flag |= T_DONTPEND;
492 		/* IDEMPOTENT proc call */
493 		rfs4_compound(cap, rbp, NULL, req, cr);
494 
495 		curthread->t_flag &= ~T_DONTPEND;
496 		if (curthread->t_flag & T_WOULDBLOCK) {
497 			curthread->t_flag &= ~T_WOULDBLOCK;
498 			return (1);
499 		}
500 	}
501 
502 	/*
503 	 * Send out the replayed reply or the 'real' one.
504 	 */
505 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res, (char *)rbp)) {
506 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
507 			struct svc_req *, xprt,
508 			char *, rbp);
509 		error++;
510 	}
511 
512 	/*
513 	 * If this reply was just inserted into the duplicate cache
514 	 * mark it as available for replay
515 	 */
516 	if (dr_stat == NFS4_DUP_NEW) {
517 		mutex_enter(&drp->drc->lock);
518 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
519 		mutex_exit(&drp->drc->lock);
520 	} else if (dr_stat == NFS4_NOT_DUP) {
521 		rfs4_compound_free(rbp);
522 	}
523 
524 	return (error);
525 }
526