/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

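/*
 * Only NFSv4 minor version 0 is implemented here; COMPOUND requests
 * with a larger minorversion are rejected by rfs4_minorvers_mismatch()
 * below.
 */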
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * The default size of the duplicate request cache
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into; do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t) * drc_hash_size, KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(void)
{
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *drc = nsrv4->nfs4_drc;
	rfs4_dupreq_t *drp, *drp_next;

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets,
	    sizeof (list_t) * drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}

/*
 * rfs4_dr_chstate:
 *
 * Change the state of a rfs4_dupreq. If the new state is not
 * NFS4_DUP_FREE, simply record it and return. If we are moving
 * to the FREE state, remove the entry from the bucket and
 * dr_cache lists and free the compound results.
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not reached the maximum cache
 * limit; otherwise walk the cache list from the tail and reuse the
 * first entry found in the NFS4_DUP_FREE or NFS4_DUP_REPLAY state.
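 *
 * Returns NULL if the cache is full and no entry can be reclaimed.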
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * Nope, so let's allocate a new one.
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * The cache is fully allocated; traverse the list
	 * backwards to find an entry we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it. */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}

/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match, return.
 * If no match is found, call rfs4_alloc_dr() to allocate a
 * new entry, or reuse one that is available.
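 *
 * Returns NFS4_DUP_NEW, NFS4_DUP_REPLAY, NFS4_DUP_PENDING, or
 * NFS4_DUP_ERROR.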
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{

	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = &(drc->dr_buckets[bktdex]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match so REPLAY the Reply
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. The caller
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue: NFSD
	 * threads are tied up in the native file system, or the
	 * cache size is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer.
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the allocation fails, mark the entry
			 * as free and put it on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the drc cache lists.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}

/*
 * rfs4_dispatch:
 *
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.
 *
 * Passed into this function are:
 *
 *	disp	A pointer to our dispatch table entry
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat reflects the state of the duplicate request that
 * has been inserted into or retrieved from the cache.
 *
 * drp is the duplicate request entry.
 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req, SVCXPRT *xprt,
    char *ap)
{

	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	cred_t *cr = NULL;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
			svcerr_systemerr(xprt);
			return (1);
		}
		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its idempotency.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replayed from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			if (rv)	/* short-circuit sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
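			/*
			 * If the compound processing set T_WOULDBLOCK,
			 * no reply is sent; the entry is freed and the
			 * client is expected to retransmit.
			 */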
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		if (rv)	/* short-circuit sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    struct svc_req *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache, (re)mark it as
	 * available for replay.
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself. The entry will be NFS4_DUP_INUSE;
	 * we do that so that we know not to prematurely reap it while
	 * we resend it to the client.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}

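/*
 * rfs4_minorvers_mismatch:
 *
 * If the COMPOUND request carries an unsupported minor version,
 * send an NFS4ERR_MINOR_VERS_MISMATCH reply (echoing the request
 * tag) and return TRUE so the caller skips normal dispatch.
 */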
bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;
	COMPOUND4res res_buf, *resp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
		return (FALSE);

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		resp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
		    resp->tag.utf8string_len);
	} else {
		resp->tag.utf8string_val = NULL;
	}
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
	}
	rfs4_compound_free(resp);
	return (TRUE);
}

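/*
 * rfs4_resource_err:
 *
 * Called when the duplicate request cache cannot supply an entry
 * (NFS4_DUP_ERROR). Sends a COMPOUND reply whose single op carries
 * NFS4ERR_RESOURCE (or NFS4ERR_OP_ILLEGAL if the first op is
 * OP_ILLEGAL), forcing the client to retransmit later.
 */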
void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		rbp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
		    rbp->tag.utf8string_len);
	} else {
		rbp->tag.utf8string_val = NULL;
	}

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    struct svc_req *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}