1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/systm.h>
28#include <sys/sdt.h>
29#include <rpc/types.h>
30#include <rpc/auth.h>
31#include <rpc/auth_unix.h>
32#include <rpc/auth_des.h>
33#include <rpc/svc.h>
34#include <rpc/xdr.h>
35#include <nfs/nfs4.h>
36#include <nfs/nfs_dispatch.h>
37#include <nfs/nfs4_drc.h>
38
/*
 * Highest NFSv4 minor version accepted here. rfs4_minorvers_mismatch()
 * answers any compound carrying a larger minorversion with
 * NFS4ERR_MINOR_VERS_MISMATCH.
 */
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * This is the duplicate request cache for NFSv4. rfs4_dispatch() hands
 * it to rfs4_find_dr() for every non-idempotent compound. It is NULL
 * here; it is not assigned anywhere in this file (presumably set from
 * rfs4_init_drc() by server start-up code -- confirm against caller).
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size (maximum number of entries) of the duplicate
 * request cache. Not referenced in this file; presumably passed as
 * drc_size to rfs4_init_drc() -- confirm against caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into. Do not
 * change this on the fly. Not referenced in this file; presumably
 * passed as drc_hash_size to rfs4_init_drc() -- confirm against caller.
 */
uint32_t nfs4_drc_hash = 541;

/* Forward declaration; the definition is at the end of this file. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
58
59/*
60 * Initialize a duplicate request cache.
61 */
62rfs4_drc_t *
63rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
64{
65	rfs4_drc_t *drc;
66	uint32_t   bki;
67
68	ASSERT(drc_size);
69	ASSERT(drc_hash_size);
70
71	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
72
73	drc->max_size = drc_size;
74	drc->in_use = 0;
75
76	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
77
78	drc->dr_hash = drc_hash_size;
79
80	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
81
82	for (bki = 0; bki < drc_hash_size; bki++) {
83		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
84		    offsetof(rfs4_dupreq_t, dr_bkt_next));
85	}
86
87	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
88	    offsetof(rfs4_dupreq_t, dr_next));
89
90	return (drc);
91}
92
93/*
94 * Destroy a duplicate request cache.
95 */
96void
97rfs4_fini_drc(rfs4_drc_t *drc)
98{
99	rfs4_dupreq_t *drp, *drp_next;
100
101	ASSERT(drc);
102
103	/* iterate over the dr_cache and free the enties */
104	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
105
106		if (drp->dr_state == NFS4_DUP_REPLAY)
107			rfs4_compound_free(&(drp->dr_res));
108
109		if (drp->dr_addr.buf != NULL)
110			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
111
112		drp_next = list_next(&(drc->dr_cache), drp);
113
114		kmem_free(drp, sizeof (rfs4_dupreq_t));
115	}
116
117	mutex_destroy(&drc->lock);
118	kmem_free(drc->dr_buckets,
119	    sizeof (list_t)*drc->dr_hash);
120	kmem_free(drc, sizeof (rfs4_drc_t));
121}
122
123/*
124 * rfs4_dr_chstate:
125 *
126 * Change the state of a rfs4_dupreq. If it's not in transition
127 * to the FREE state, return. If we are moving to the FREE state
128 * then we need to clean up the compound results and move the entry
129 * to the end of the list.
130 */
131void
132rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
133{
134	rfs4_drc_t *drc;
135
136	ASSERT(drp);
137	ASSERT(drp->drc);
138	ASSERT(drp->dr_bkt);
139	ASSERT(MUTEX_HELD(&drp->drc->lock));
140
141	drp->dr_state = new_state;
142
143	if (new_state != NFS4_DUP_FREE)
144		return;
145
146	drc = drp->drc;
147
148	/*
149	 * Remove entry from the bucket and
150	 * dr_cache list, free compound results.
151	 */
152	list_remove(drp->dr_bkt, drp);
153	list_remove(&(drc->dr_cache), drp);
154	rfs4_compound_free(&(drp->dr_res));
155}
156
157/*
158 * rfs4_alloc_dr:
159 *
160 * Malloc a new one if we have not reached our maximum cache
161 * limit, otherwise pick an entry off the tail -- Use if it
162 * is marked as NFS4_DUP_FREE, or is an entry in the
163 * NFS4_DUP_REPLAY state.
164 */
165rfs4_dupreq_t *
166rfs4_alloc_dr(rfs4_drc_t *drc)
167{
168	rfs4_dupreq_t *drp_tail, *drp = NULL;
169
170	ASSERT(drc);
171	ASSERT(MUTEX_HELD(&drc->lock));
172
173	/*
174	 * Have we hit the cache limit yet ?
175	 */
176	if (drc->in_use < drc->max_size) {
177		/*
178		 * nope, so let's malloc a new one
179		 */
180		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
181		drp->drc = drc;
182		drc->in_use++;
183		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
184		return (drp);
185	}
186
187	/*
188	 * Cache is all allocated now traverse the list
189	 * backwards to find one we can reuse.
190	 */
191	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
192	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
193
194		switch (drp_tail->dr_state) {
195
196		case NFS4_DUP_FREE:
197			list_remove(&(drc->dr_cache), drp_tail);
198			DTRACE_PROBE1(nfss__i__drc_freeclaim,
199			    rfs4_dupreq_t *, drp_tail);
200			return (drp_tail);
201			/* NOTREACHED */
202
203		case NFS4_DUP_REPLAY:
204			/* grab it. */
205			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
206			DTRACE_PROBE1(nfss__i__drc_replayclaim,
207			    rfs4_dupreq_t *, drp_tail);
208			return (drp_tail);
209			/* NOTREACHED */
210		}
211	}
212	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
213	return (NULL);
214}
215
216/*
217 * rfs4_find_dr:
218 *
219 * Search for an entry in the duplicate request cache by
220 * calculating the hash index based on the XID, and examining
221 * the entries in the hash bucket. If we find a match, return.
222 * Once we have searched the bucket we call rfs4_alloc_dr() to
223 * allocate a new entry, or reuse one that is available.
224 */
225int
226rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
227{
228
229	uint32_t	the_xid;
230	list_t		*dr_bkt;
231	rfs4_dupreq_t	*drp;
232	int		bktdex;
233
234	/*
235	 * Get the XID, calculate the bucket and search to
236	 * see if we need to replay from the cache.
237	 */
238	the_xid = req->rq_xprt->xp_xid;
239	bktdex = the_xid % drc->dr_hash;
240
241	dr_bkt = (list_t *)
242	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
243
244	DTRACE_PROBE3(nfss__i__drc_bktdex,
245	    int, bktdex,
246	    uint32_t, the_xid,
247	    list_t *, dr_bkt);
248
249	*dup = NULL;
250
251	mutex_enter(&drc->lock);
252	/*
253	 * Search the bucket for a matching xid and address.
254	 */
255	for (drp = list_head(dr_bkt); drp != NULL;
256	    drp = list_next(dr_bkt, drp)) {
257
258		if (drp->dr_xid == the_xid &&
259		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
260		    bcmp((caddr_t)drp->dr_addr.buf,
261		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
262		    drp->dr_addr.len) == 0) {
263
264			/*
265			 * Found a match so REPLAY the Reply
266			 */
267			if (drp->dr_state == NFS4_DUP_REPLAY) {
268				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
269				mutex_exit(&drc->lock);
270				*dup = drp;
271				DTRACE_PROBE1(nfss__i__drc_replay,
272				    rfs4_dupreq_t *, drp);
273				return (NFS4_DUP_REPLAY);
274			}
275
276			/*
277			 * This entry must be in transition, so return
278			 * the 'pending' status.
279			 */
280			mutex_exit(&drc->lock);
281			return (NFS4_DUP_PENDING);
282		}
283	}
284
285	drp = rfs4_alloc_dr(drc);
286	mutex_exit(&drc->lock);
287
288	/*
289	 * The DRC is full and all entries are in use. Upper function
290	 * should error out this request and force the client to
291	 * retransmit -- effectively this is a resource issue. NFSD
292	 * threads tied up with native File System, or the cache size
293	 * is too small for the server load.
294	 */
295	if (drp == NULL)
296		return (NFS4_DUP_ERROR);
297
298	/*
299	 * Init the state to NEW.
300	 */
301	drp->dr_state = NFS4_DUP_NEW;
302
303	/*
304	 * If needed, resize the address buffer
305	 */
306	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
307		if (drp->dr_addr.buf != NULL)
308			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
309		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
310		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
311		if (drp->dr_addr.buf == NULL) {
312			/*
313			 * If the malloc fails, mark the entry
314			 * as free and put on the tail.
315			 */
316			drp->dr_addr.maxlen = 0;
317			drp->dr_state = NFS4_DUP_FREE;
318			mutex_enter(&drc->lock);
319			list_insert_tail(&(drc->dr_cache), drp);
320			mutex_exit(&drc->lock);
321			return (NFS4_DUP_ERROR);
322		}
323	}
324
325
326	/*
327	 * Copy the address.
328	 */
329	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
330
331	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
332	    (caddr_t)drp->dr_addr.buf,
333	    drp->dr_addr.len);
334
335	drp->dr_xid = the_xid;
336	drp->dr_bkt = dr_bkt;
337
338	/*
339	 * Insert at the head of the bucket and
340	 * the drc lists..
341	 */
342	mutex_enter(&drc->lock);
343	list_insert_head(&drc->dr_cache, drp);
344	list_insert_head(dr_bkt, drp);
345	mutex_exit(&drc->lock);
346
347	*dup = drp;
348
349	return (NFS4_DUP_NEW);
350}
351
352/*
353 *
354 * This function handles the duplicate request cache,
355 * NULL_PROC and COMPOUND procedure calls for NFSv4;
356 *
357 * Passed into this function are:-
358 *
359 * 	disp	A pointer to our dispatch table entry
360 * 	req	The request to process
361 * 	xprt	The server transport handle
362 * 	ap	A pointer to the arguments
363 *
364 *
365 * When appropriate this function is responsible for inserting
366 * the reply into the duplicate cache or replaying an existing
367 * cached reply.
368 *
369 * dr_stat 	reflects the state of the duplicate request that
370 * 		has been inserted into or retrieved from the cache
371 *
372 * drp		is the duplicate request entry
373 *
374 */
375int
376rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
377		SVCXPRT *xprt, char *ap)
378{
379
380	COMPOUND4res	 res_buf;
381	COMPOUND4res	*rbp;
382	COMPOUND4args	*cap;
383	cred_t		*cr = NULL;
384	int		 error = 0;
385	int		 dis_flags = 0;
386	int		 dr_stat = NFS4_NOT_DUP;
387	rfs4_dupreq_t	*drp = NULL;
388	int		 rv;
389
390	ASSERT(disp);
391
392	/*
393	 * Short circuit the RPC_NULL proc.
394	 */
395	if (disp->dis_proc == rpc_null) {
396		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
397		if (!svc_sendreply(xprt, xdr_void, NULL)) {
398			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
399			svcerr_systemerr(xprt);
400			return (1);
401		}
402		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
403		return (0);
404	}
405
406	/* Only NFSv4 Compounds from this point onward */
407
408	rbp = &res_buf;
409	cap = (COMPOUND4args *)ap;
410
411	/*
412	 * Figure out the disposition of the whole COMPOUND
413	 * and record it's IDEMPOTENTCY.
414	 */
415	rfs4_compound_flagproc(cap, &dis_flags);
416
417	/*
418	 * If NON-IDEMPOTENT then we need to figure out if this
419	 * request can be replied from the duplicate cache.
420	 *
421	 * If this is a new request then we need to insert the
422	 * reply into the duplicate cache.
423	 */
424	if (!(dis_flags & RPC_IDEMPOTENT)) {
425		/* look for a replay from the cache or allocate */
426		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
427
428		switch (dr_stat) {
429
430		case NFS4_DUP_ERROR:
431			rfs4_resource_err(req, cap);
432			return (1);
433			/* NOTREACHED */
434
435		case NFS4_DUP_PENDING:
436			/*
437			 * reply has previously been inserted into the
438			 * duplicate cache, however the reply has
439			 * not yet been sent via svc_sendreply()
440			 */
441			return (1);
442			/* NOTREACHED */
443
444		case NFS4_DUP_NEW:
445			curthread->t_flag |= T_DONTPEND;
446			/* NON-IDEMPOTENT proc call */
447			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
448			curthread->t_flag &= ~T_DONTPEND;
449
450			if (rv)		/* short ckt sendreply on error */
451				return (rv);
452
453			/*
454			 * dr_res must be initialized before calling
455			 * rfs4_dr_chstate (it frees the reply).
456			 */
457			drp->dr_res = res_buf;
458			if (curthread->t_flag & T_WOULDBLOCK) {
459				curthread->t_flag &= ~T_WOULDBLOCK;
460				/*
461				 * mark this entry as FREE and plop
462				 * on the end of the cache list
463				 */
464				mutex_enter(&drp->drc->lock);
465				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
466				list_insert_tail(&(drp->drc->dr_cache), drp);
467				mutex_exit(&drp->drc->lock);
468				return (1);
469			}
470			break;
471
472		case NFS4_DUP_REPLAY:
473			/* replay from the cache */
474			rbp = &(drp->dr_res);
475			break;
476		}
477	} else {
478		curthread->t_flag |= T_DONTPEND;
479		/* IDEMPOTENT proc call */
480		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
481		curthread->t_flag &= ~T_DONTPEND;
482
483		if (rv)		/* short ckt sendreply on error */
484			return (rv);
485
486		if (curthread->t_flag & T_WOULDBLOCK) {
487			curthread->t_flag &= ~T_WOULDBLOCK;
488			return (1);
489		}
490	}
491
492	/*
493	 * Send out the replayed reply or the 'real' one.
494	 */
495	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
496		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
497		    struct svc_req *, xprt,
498		    char *, rbp);
499		svcerr_systemerr(xprt);
500		error++;
501	}
502
503	/*
504	 * If this reply was just inserted into the duplicate cache
505	 * or it was replayed from the dup cache; (re)mark it as
506	 * available for replay
507	 *
508	 * At first glance, this 'if' statement seems a little strange;
509	 * testing for NFS4_DUP_REPLAY, and then calling...
510	 *
511	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
512	 *
513	 * ... but notice that we are checking dr_stat, and not the
514	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
515	 * we do that so that we know not to prematurely reap it whilst
516	 * we resent it to the client.
517	 *
518	 */
519	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
520		mutex_enter(&drp->drc->lock);
521		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
522		mutex_exit(&drp->drc->lock);
523	} else if (dr_stat == NFS4_NOT_DUP) {
524		rfs4_compound_free(rbp);
525	}
526
527	return (error);
528}
529
530bool_t
531rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
532{
533	COMPOUND4args *argsp;
534	COMPOUND4res res_buf, *resp;
535
536	if (req->rq_vers != 4)
537		return (FALSE);
538
539	argsp = (COMPOUND4args *)args;
540
541	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
542		return (FALSE);
543
544	resp = &res_buf;
545
546	/*
547	 * Form a reply tag by copying over the reqeuest tag.
548	 */
549	resp->tag.utf8string_val =
550	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
551	resp->tag.utf8string_len = argsp->tag.utf8string_len;
552	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
553	    resp->tag.utf8string_len);
554	resp->array_len = 0;
555	resp->array = NULL;
556	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
557	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
558		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
559		    SVCXPRT *, xprt, char *, resp);
560		svcerr_systemerr(xprt);
561	}
562	rfs4_compound_free(resp);
563	return (TRUE);
564}
565
566void
567rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
568{
569	COMPOUND4res res_buf, *rbp;
570	nfs_resop4 *resop;
571	PUTFH4res *resp;
572
573	rbp = &res_buf;
574
575	/*
576	 * Form a reply tag by copying over the request tag.
577	 */
578	rbp->tag.utf8string_val =
579	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
580	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
581	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
582	    rbp->tag.utf8string_len);
583
584	rbp->array_len = 1;
585	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
586	    KM_SLEEP);
587	resop = &rbp->array[0];
588	resop->resop = argsp->array[0].argop;	/* copy first op over */
589
590	/* Any op will do, just need to access status field */
591	resp = &resop->nfs_resop4_u.opputfh;
592
593	/*
594	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
595	 * Note that all op numbers in the compound array were already
596	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
597	 */
598	resp->status = (resop->resop == OP_ILLEGAL ?
599	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
600
601	/* compound status is same as first op status */
602	rbp->status = resp->status;
603
604	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
605		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
606		    struct svc_req *, req->rq_xprt, char *, rbp);
607		svcerr_systemerr(req->rq_xprt);
608	}
609
610	UTF8STRING_FREE(rbp->tag);
611	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
612}
613