1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/systm.h>
31 #include <sys/sdt.h>
32 #include <rpc/types.h>
33 #include <rpc/auth.h>
34 #include <rpc/auth_unix.h>
35 #include <rpc/auth_des.h>
36 #include <rpc/svc.h>
37 #include <rpc/xdr.h>
38 #include <nfs/nfs4.h>
39 #include <nfs/nfs_dispatch.h>
40 #include <nfs/nfs4_drc.h>
41 
/*
 * This is the duplicate request cache for NFSv4.  rfs4_dispatch()
 * passes this pointer to rfs4_find_dr() for every COMPOUND that is
 * not flagged RPC_IDEMPOTENT.
 * NOTE(review): it is NULL here and is not assigned anywhere in this
 * file; presumably it is set from rfs4_init_drc() at server start-up.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * How long (in seconds) an entry can remain in the cache once its
 * reply has been sent to the client and it has not been used for a
 * replay since.
 * NOTE(review): presumably handed to rfs4_init_drc() as its ttl
 * argument; confirm against the caller.
 */
unsigned nfs4_drc_lifetime = 1;

/*
 * The default size of the duplicate request cache.
 * NOTE(review): presumably the drc_size argument of rfs4_init_drc(),
 * i.e. the limit that rfs4_alloc_dr() compares in_use against;
 * confirm against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into.
 * Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;
64 
65 /*
66  * Initialize a duplicate request cache.
67  */
68 rfs4_drc_t *
69 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size, unsigned ttl)
70 {
71 	rfs4_drc_t *drc;
72 	uint32_t   bki;
73 
74 	ASSERT(drc_size);
75 	ASSERT(drc_hash_size);
76 
77 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
78 
79 	drc->max_size = drc_size;
80 	drc->in_use = 0;
81 	drc->drc_ttl = ttl;
82 
83 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
84 
85 	drc->dr_hash = drc_hash_size;
86 
87 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
88 
89 	for (bki = 0; bki < drc_hash_size; bki++) {
90 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
91 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
92 	}
93 
94 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
95 		    offsetof(rfs4_dupreq_t, dr_next));
96 
97 	return (drc);
98 }
99 
100 /*
101  * Destroy a duplicate request cache.
102  */
103 void
104 rfs4_fini_drc(rfs4_drc_t *drc)
105 {
106 	rfs4_dupreq_t *drp, *drp_next;
107 
108 	ASSERT(drc);
109 
110 	/* iterate over the dr_cache and free the enties */
111 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
112 
113 		if (drp->dr_state == NFS4_DUP_REPLAY)
114 			rfs4_compound_free(&(drp->dr_res));
115 
116 		if (drp->dr_addr.buf != NULL)
117 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
118 
119 		drp_next = list_next(&(drc->dr_cache), drp);
120 
121 		kmem_free(drp, sizeof (rfs4_dupreq_t));
122 	}
123 
124 	mutex_destroy(&drc->lock);
125 	kmem_free(drc->dr_buckets,
126 		sizeof (list_t)*drc->dr_hash);
127 	kmem_free(drc, sizeof (rfs4_drc_t));
128 }
129 
130 /*
131  * rfs4_dr_chstate:
132  *
133  * Change the state of a rfs4_dupreq. If it's not in transition
134  * to the FREE state, update the time used and return. If we
135  * are moving to the FREE state then we need to clean up the
136  * compound results and move the entry to the end of the list.
137  */
138 void
139 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
140 {
141 	rfs4_drc_t *drc;
142 
143 	ASSERT(drp);
144 	ASSERT(drp->drc);
145 	ASSERT(drp->dr_bkt);
146 	ASSERT(MUTEX_HELD(&drp->drc->lock));
147 
148 	drp->dr_state = new_state;
149 
150 	if (new_state != NFS4_DUP_FREE) {
151 		gethrestime(&drp->dr_time_used);
152 		return;
153 	}
154 
155 	drc = drp->drc;
156 
157 	/*
158 	 * Remove entry from the bucket and
159 	 * dr_cache list, free compound results.
160 	 */
161 	list_remove(drp->dr_bkt, drp);
162 	list_remove(&(drc->dr_cache), drp);
163 	rfs4_compound_free(&(drp->dr_res));
164 }
165 
166 /*
167  * rfs4_alloc_dr:
168  *
169  * Pick an entry off the tail -- Use if it is
170  * marked NFS4_DUP_FREE, or is an entry in the
171  * NFS4_DUP_REPLAY state that has timed-out...
172  * Otherwise malloc a new one if we have not reached
173  * our maximum cache limit.
174  *
175  * The list should be in time order, so no need
176  * to traverse backwards looking for a timed out
177  * entry, NFS4_DUP_FREE's are place on the tail.
178  */
179 rfs4_dupreq_t *
180 rfs4_alloc_dr(rfs4_drc_t *drc)
181 {
182 	rfs4_dupreq_t *drp_tail, *drp = NULL;
183 
184 	ASSERT(drc);
185 	ASSERT(MUTEX_HELD(&drc->lock));
186 
187 	if ((drp_tail = list_tail(&drc->dr_cache)) != NULL) {
188 
189 		switch (drp_tail->dr_state) {
190 
191 		case NFS4_DUP_FREE:
192 			list_remove(&(drc->dr_cache), drp_tail);
193 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
194 					rfs4_dupreq_t *, drp_tail);
195 			return (drp_tail);
196 			/* NOTREACHED */
197 
198 		case NFS4_DUP_REPLAY:
199 			if (gethrestime_sec() >
200 			    drp_tail->dr_time_used.tv_sec+drc->drc_ttl) {
201 				/* this entry has timedout so grab it. */
202 				rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
203 				DTRACE_PROBE1(nfss__i__drc_ttlclaim,
204 					rfs4_dupreq_t *, drp_tail);
205 				return (drp_tail);
206 			}
207 			break;
208 		}
209 	}
210 
211 	/*
212 	 * Didn't find something to recycle have
213 	 * we hit the cache limit ?
214 	 */
215 	if (drc->in_use >= drc->max_size) {
216 		DTRACE_PROBE1(nfss__i__drc_full,
217 			rfs4_drc_t *, drc);
218 		return (NULL);
219 	}
220 
221 
222 	/* nope, so let's malloc a new one */
223 	drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
224 	drp->drc = drc;
225 	drc->in_use++;
226 	gethrestime(&drp->dr_time_created);
227 	DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
228 
229 	return (drp);
230 }
231 
232 /*
233  * rfs4_find_dr:
234  *
235  * Search for an entry in the duplicate request cache by
236  * calculating the hash index based on the XID, and examining
237  * the entries in the hash bucket. If we find a match stamp the
238  * time_used and return. If the entry does not match it could be
239  * ready to be freed. Once we have searched the bucket and we
240  * have not exhausted the maximum limit for the cache we will
241  * allocate a new entry.
242  */
243 int
244 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
245 {
246 
247 	uint32_t	the_xid;
248 	list_t		*dr_bkt;
249 	rfs4_dupreq_t	*drp;
250 	int		bktdex;
251 
252 	/*
253 	 * Get the XID, calculate the bucket and search to
254 	 * see if we need to replay from the cache.
255 	 */
256 	the_xid = req->rq_xprt->xp_xid;
257 	bktdex = the_xid % drc->dr_hash;
258 
259 	dr_bkt = (list_t *)
260 		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
261 
262 	DTRACE_PROBE3(nfss__i__drc_bktdex,
263 			int, bktdex,
264 			uint32_t, the_xid,
265 			list_t *, dr_bkt);
266 
267 	*dup = NULL;
268 
269 	mutex_enter(&drc->lock);
270 	/*
271 	 * Search the bucket for a matching xid and address.
272 	 */
273 	for (drp = list_head(dr_bkt); drp != NULL;
274 		drp = list_next(dr_bkt, drp)) {
275 
276 		if (drp->dr_xid == the_xid &&
277 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
278 		    bcmp((caddr_t)drp->dr_addr.buf,
279 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
280 		    drp->dr_addr.len) == 0) {
281 
282 			/*
283 			 * Found a match so REPLAY the Reply
284 			 */
285 			if (drp->dr_state == NFS4_DUP_REPLAY) {
286 				gethrestime(&drp->dr_time_used);
287 				mutex_exit(&drc->lock);
288 				*dup = drp;
289 				DTRACE_PROBE1(nfss__i__drc_replay,
290 					rfs4_dupreq_t *, drp);
291 				return (NFS4_DUP_REPLAY);
292 			}
293 
294 			/*
295 			 * This entry must be in transition, so return
296 			 * the 'pending' status.
297 			 */
298 			mutex_exit(&drc->lock);
299 			return (NFS4_DUP_PENDING);
300 		}
301 
302 		/*
303 		 * Not a match, but maybe this entry is ready
304 		 * to be reused.
305 		 */
306 		if (drp->dr_state == NFS4_DUP_REPLAY &&
307 			(gethrestime_sec() >
308 			drp->dr_time_used.tv_sec+drc->drc_ttl)) {
309 			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
310 			list_insert_tail(&(drp->drc->dr_cache), drp);
311 		}
312 	}
313 
314 	drp = rfs4_alloc_dr(drc);
315 	mutex_exit(&drc->lock);
316 
317 	if (drp == NULL) {
318 		return (NFS4_DUP_ERROR);
319 	}
320 
321 	/*
322 	 * Place at the head of the list, init the state
323 	 * to NEW and clear the time used field.
324 	 */
325 
326 	drp->dr_state = NFS4_DUP_NEW;
327 	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
328 
329 	/*
330 	 * If needed, resize the address buffer
331 	 */
332 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
333 		if (drp->dr_addr.buf != NULL)
334 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
335 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
336 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
337 		if (drp->dr_addr.buf == NULL) {
338 			/*
339 			 * If the malloc fails, mark the entry
340 			 * as free and put on the tail.
341 			 */
342 			drp->dr_addr.maxlen = 0;
343 			drp->dr_state = NFS4_DUP_FREE;
344 			mutex_enter(&drc->lock);
345 			list_insert_tail(&(drc->dr_cache), drp);
346 			mutex_exit(&drc->lock);
347 			return (NFS4_DUP_ERROR);
348 		}
349 	}
350 
351 
352 	/*
353 	 * Copy the address.
354 	 */
355 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
356 
357 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
358 		(caddr_t)drp->dr_addr.buf,
359 		drp->dr_addr.len);
360 
361 	drp->dr_xid = the_xid;
362 	drp->dr_bkt = dr_bkt;
363 
364 	/*
365 	 * Insert at the head of the bucket and
366 	 * the drc lists..
367 	 */
368 	mutex_enter(&drc->lock);
369 	list_insert_head(&drc->dr_cache, drp);
370 	list_insert_head(dr_bkt, drp);
371 	mutex_exit(&drc->lock);
372 
373 	*dup = drp;
374 
375 	return (NFS4_DUP_NEW);
376 }
377 
378 /*
379  *
380  * This function handles the duplicate request cache,
381  * NULL_PROC and COMPOUND procedure calls for NFSv4;
382  *
383  * Passed into this function are:-
384  *
385  * 	disp	A pointer to our dispatch table entry
386  * 	req	The request to process
387  * 	xprt	The server transport handle
388  * 	ap	A pointer to the arguments
389  *
390  *
391  * When appropriate this function is responsible for inserting
392  * the reply into the duplicate cache or replaying an existing
393  * cached reply.
394  *
395  * dr_stat 	reflects the state of the duplicate request that
396  * 		has been inserted into or retrieved from the cache
397  *
398  * drp		is the duplicate request entry
399  *
400  */
401 int
402 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
403 		SVCXPRT *xprt, char *ap)
404 {
405 
406 	COMPOUND4res res_buf, *rbp;
407 	COMPOUND4args *cap;
408 
409 	cred_t 	*cr = NULL;
410 	int	error = 0;
411 	int 	dis_flags = 0;
412 	int 	dr_stat = NFS4_NOT_DUP;
413 	rfs4_dupreq_t *drp = NULL;
414 
415 	ASSERT(disp);
416 
417 	/*
418 	 * Short circuit the RPC_NULL proc.
419 	 */
420 	if (disp->dis_proc == rpc_null) {
421 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
422 			return (1);
423 		}
424 		return (0);
425 	}
426 
427 	/* Only NFSv4 Compounds from this point onward */
428 
429 	rbp = &res_buf;
430 	cap = (COMPOUND4args *)ap;
431 
432 	/*
433 	 * Figure out the disposition of the whole COMPOUND
434 	 * and record it's IDEMPOTENTCY.
435 	 */
436 	rfs4_compound_flagproc(cap, &dis_flags);
437 
438 	/*
439 	 * If NON-IDEMPOTENT then we need to figure out if this
440 	 * request can be replied from the duplicate cache.
441 	 *
442 	 * If this is a new request then we need to insert the
443 	 * reply into the duplicate cache.
444 	 */
445 	if (!(dis_flags & RPC_IDEMPOTENT)) {
446 		/* look for a replay from the cache or allocate */
447 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
448 
449 		switch (dr_stat) {
450 
451 		case NFS4_DUP_ERROR:
452 			svcerr_systemerr(xprt);
453 			return (1);
454 			/* NOTREACHED */
455 
456 		case NFS4_DUP_PENDING:
457 			/*
458 			 * reply has previously been inserted into the
459 			 * duplicate cache, however the reply has
460 			 * not yet been sent via svc_sendreply()
461 			 */
462 			return (1);
463 			/* NOTREACHED */
464 
465 		case NFS4_DUP_NEW:
466 			curthread->t_flag |= T_DONTPEND;
467 			/* NON-IDEMPOTENT proc call */
468 			rfs4_compound(cap, rbp, NULL, req, cr);
469 
470 			curthread->t_flag &= ~T_DONTPEND;
471 			if (curthread->t_flag & T_WOULDBLOCK) {
472 				curthread->t_flag &= ~T_WOULDBLOCK;
473 				/*
474 				 * mark this entry as FREE and plop
475 				 * on the end of the cache list
476 				 */
477 				mutex_enter(&drp->drc->lock);
478 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
479 				list_insert_tail(&(drp->drc->dr_cache), drp);
480 				mutex_exit(&drp->drc->lock);
481 				return (1);
482 			}
483 			drp->dr_res = res_buf;
484 			break;
485 
486 		case NFS4_DUP_REPLAY:
487 			/* replay from the cache */
488 			rbp = &(drp->dr_res);
489 			break;
490 		}
491 	} else {
492 		curthread->t_flag |= T_DONTPEND;
493 		/* IDEMPOTENT proc call */
494 		rfs4_compound(cap, rbp, NULL, req, cr);
495 
496 		curthread->t_flag &= ~T_DONTPEND;
497 		if (curthread->t_flag & T_WOULDBLOCK) {
498 			curthread->t_flag &= ~T_WOULDBLOCK;
499 			return (1);
500 		}
501 	}
502 
503 	/*
504 	 * Send out the replayed reply or the 'real' one.
505 	 */
506 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
507 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
508 			struct svc_req *, xprt,
509 			char *, rbp);
510 		error++;
511 	}
512 
513 	/*
514 	 * If this reply was just inserted into the duplicate cache
515 	 * mark it as available for replay
516 	 */
517 	if (dr_stat == NFS4_DUP_NEW) {
518 		mutex_enter(&drp->drc->lock);
519 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
520 		mutex_exit(&drp->drc->lock);
521 	} else if (dr_stat == NFS4_NOT_DUP) {
522 		rfs4_compound_free(rbp);
523 	}
524 
525 	return (error);
526 }
527