xref: /illumos-gate/usr/src/uts/common/rpc/clnt_rdma.c (revision 8ca9c6bb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
27 /*
28  * Portions of this source code were derived from Berkeley
29  * 4.3 BSD under license from the Regents of the University of
30  * California.
31  */
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/user.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/errno.h>
39 #include <sys/kmem.h>
40 #include <sys/debug.h>
41 #include <sys/systm.h>
42 #include <sys/kstat.h>
43 #include <sys/t_lock.h>
44 #include <sys/ddi.h>
45 #include <sys/cmn_err.h>
46 #include <sys/time.h>
47 #include <sys/isa_defs.h>
48 #include <sys/zone.h>
49 #include <sys/sdt.h>
50 
51 #include <rpc/types.h>
52 #include <rpc/xdr.h>
53 #include <rpc/auth.h>
54 #include <rpc/clnt.h>
55 #include <rpc/rpc_msg.h>
56 #include <rpc/rpc_rdma.h>
57 #include <nfs/nfs.h>
58 #include <nfs/nfs4_kprot.h>
59 
60 static uint32_t rdma_bufs_rqst = RDMA_BUFS_RQST;
61 
62 static int clnt_compose_rpcmsg(CLIENT *, rpcproc_t, rdma_buf_t *,
63 			    XDR *, xdrproc_t, caddr_t);
64 static int  clnt_compose_rdma_header(CONN *, CLIENT *, rdma_buf_t *,
65 		    XDR **, uint_t *);
66 static int clnt_setup_rlist(CONN *, XDR *, XDR *);
67 static int clnt_setup_wlist(CONN *, XDR *, XDR *, rdma_buf_t *);
68 static int clnt_setup_long_reply(CONN *, struct clist **, uint_t);
69 static void clnt_check_credit(CONN *);
70 static void clnt_return_credit(CONN *);
71 static void clnt_decode_long_reply(CONN *, struct clist *,
72 		struct clist *, XDR *, XDR **, struct clist *,
73 		struct clist *, uint_t, uint_t);
74 
75 static void clnt_update_credit(CONN *, uint32_t);
76 
77 static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
78     caddr_t, xdrproc_t, caddr_t, struct timeval);
79 static void	clnt_rdma_kabort(CLIENT *);
80 static void	clnt_rdma_kerror(CLIENT *, struct rpc_err *);
81 static bool_t	clnt_rdma_kfreeres(CLIENT *, xdrproc_t, caddr_t);
82 static void	clnt_rdma_kdestroy(CLIENT *);
83 static bool_t	clnt_rdma_kcontrol(CLIENT *, int, char *);
84 static int	clnt_rdma_ksettimers(CLIENT *, struct rpc_timers *,
85     struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
86 
87 /*
88  * Operations vector for RDMA based RPC
89  */
90 static struct clnt_ops rdma_clnt_ops = {
91 	clnt_rdma_kcallit,	/* do rpc call */
92 	clnt_rdma_kabort,	/* abort call */
93 	clnt_rdma_kerror,	/* return error status */
94 	clnt_rdma_kfreeres,	/* free results */
95 	clnt_rdma_kdestroy,	/* destroy rpc handle */
96 	clnt_rdma_kcontrol,	/* the ioctl() of rpc */
97 	clnt_rdma_ksettimers,	/* set retry timers */
98 };
99 
/*
 * Size in bytes of the preserialized RPC call header kept in
 * cku_rpchdr.  The buffer itself is CKU_HDRSIZE + 4 bytes long; the
 * extra word holds the procedure number on the RPCSEC_GSS path.
 */
#define	CKU_HDRSIZE	20

/* Status codes returned by the static helpers in this file. */
#define	CLNT_RDMA_SUCCESS 0
#define	CLNT_RDMA_FAIL (-1)

/* Initial value of the refresh_attempt counter in clnt_rdma_kcallit(). */
#define	AUTH_REFRESH_COUNT 2

/*
 * Evaluates to non-zero when the auth flavor of client handle `authh'
 * is RPCSEC_GSS.  The argument is parenthesized so that any pointer
 * expression (not just a plain identifier) expands correctly.
 */
#define	IS_RPCSEC_GSS(authh)			\
	((authh)->cl_auth->ah_cred.oa_flavor == RPCSEC_GSS)
111 
112 /*
113  * Per RPC RDMA endpoint details
114  */
115 typedef struct cku_private {
116 	CLIENT			cku_client;	/* client handle */
117 	rdma_mod_t		*cku_rd_mod;	/* underlying RDMA mod */
118 	void			*cku_rd_handle;	/* underlying RDMA device */
119 	struct netbuf		cku_srcaddr;	/* source address for retries */
120 	struct netbuf		cku_addr;	/* remote netbuf address */
121 	int			cku_addrfmly;	/* for finding addr_type */
122 	struct rpc_err		cku_err;	/* error status */
123 	struct cred		*cku_cred;	/* credentials */
124 	XDR			cku_outxdr;	/* xdr stream for output */
125 	uint32_t		cku_outsz;
126 	XDR			cku_inxdr;	/* xdr stream for input */
127 	char			cku_rpchdr[CKU_HDRSIZE+4]; /* rpc header */
128 	uint32_t		cku_xid;	/* current XID */
129 } cku_private_t;
130 
/*
 * Number of seconds clnt_rdma_kcallit() delays before returning when the
 * RDMA module is inactive or a connection attempt fails for a reason other
 * than a timeout or a signal.
 */
#define	CLNT_RDMA_DELAY	10
static int clnt_rdma_min_delay = CLNT_RDMA_DELAY;
133 
134 struct {
135 	kstat_named_t	rccalls;
136 	kstat_named_t	rcbadcalls;
137 	kstat_named_t	rcbadxids;
138 	kstat_named_t	rctimeouts;
139 	kstat_named_t	rcnewcreds;
140 	kstat_named_t	rcbadverfs;
141 	kstat_named_t	rctimers;
142 	kstat_named_t	rccantconn;
143 	kstat_named_t	rcnomem;
144 	kstat_named_t	rcintrs;
145 	kstat_named_t	rclongrpcs;
146 } rdmarcstat = {
147 	{ "calls",	KSTAT_DATA_UINT64 },
148 	{ "badcalls",	KSTAT_DATA_UINT64 },
149 	{ "badxids",	KSTAT_DATA_UINT64 },
150 	{ "timeouts",	KSTAT_DATA_UINT64 },
151 	{ "newcreds",	KSTAT_DATA_UINT64 },
152 	{ "badverfs",	KSTAT_DATA_UINT64 },
153 	{ "timers",	KSTAT_DATA_UINT64 },
154 	{ "cantconn",	KSTAT_DATA_UINT64 },
155 	{ "nomem",	KSTAT_DATA_UINT64 },
156 	{ "interrupts", KSTAT_DATA_UINT64 },
157 	{ "longrpc", 	KSTAT_DATA_UINT64 }
158 };
159 
160 kstat_named_t *rdmarcstat_ptr = (kstat_named_t *)&rdmarcstat;
161 uint_t rdmarcstat_ndata = sizeof (rdmarcstat) / sizeof (kstat_named_t);
162 
#ifdef DEBUG
/* Debug switch for the RDMA client; only present in DEBUG builds. */
int rdma_clnt_debug = 0;
#endif
166 
/*
 * RCSTAT_INCR(x) increments the 64-bit counter rdmarcstat.x.
 *
 * When accurate_stats is defined the update is performed while holding
 * rdmarcstat_lock; otherwise the counter is bumped without any locking.
 *
 * Both variants are wrapped in do { } while (0) so that the macro expands
 * to exactly one statement and is safe inside an unbraced if/else.  The
 * invocation must be terminated with a semicolon by the caller.
 */
#ifdef accurate_stats
extern kmutex_t rdmarcstat_lock;    /* mutex for rcstat updates */

#define	RCSTAT_INCR(x)					\
	do {						\
		mutex_enter(&rdmarcstat_lock);		\
		rdmarcstat.x.value.ui64++;		\
		mutex_exit(&rdmarcstat_lock);		\
	} while (0)
#else
#define	RCSTAT_INCR(x)					\
	do {						\
		rdmarcstat.x.value.ui64++;		\
	} while (0)
#endif
178 
/* ptoh: private data pointer -> address of the embedded client handle. */
#define	ptoh(p)		(&((p)->cku_client))
/* htop: client handle -> its private data (stored in cl_private). */
#define	htop(h)		((cku_private_t *)((h)->cl_private))
181 
182 uint_t
calc_length(uint_t len)183 calc_length(uint_t len)
184 {
185 	len = RNDUP(len);
186 
187 	if (len <= 64 * 1024) {
188 		if (len > 32 * 1024) {
189 			len = 64 * 1024;
190 		} else {
191 			if (len > 16 * 1024) {
192 				len = 32 * 1024;
193 			} else {
194 				if (len > 8 * 1024) {
195 					len = 16 * 1024;
196 				} else {
197 					len = 8 * 1024;
198 				}
199 			}
200 		}
201 	}
202 	return (len);
203 }
204 int
clnt_rdma_kcreate(char * proto,void * handle,struct netbuf * raddr,int family,rpcprog_t pgm,rpcvers_t vers,struct cred * cred,CLIENT ** cl)205 clnt_rdma_kcreate(char *proto, void *handle, struct netbuf *raddr, int family,
206     rpcprog_t pgm, rpcvers_t vers, struct cred *cred, CLIENT **cl)
207 {
208 	CLIENT *h;
209 	struct cku_private *p;
210 	struct rpc_msg call_msg;
211 	rdma_registry_t *rp;
212 
213 	ASSERT(INGLOBALZONE(curproc));
214 
215 	if (cl == NULL)
216 		return (EINVAL);
217 	*cl = NULL;
218 
219 	p = kmem_zalloc(sizeof (*p), KM_SLEEP);
220 
221 	/*
222 	 * Find underlying RDMATF plugin
223 	 */
224 	rw_enter(&rdma_lock, RW_READER);
225 	rp = rdma_mod_head;
226 	while (rp != NULL) {
227 		if (strcmp(rp->r_mod->rdma_api, proto))
228 			rp = rp->r_next;
229 		else {
230 			p->cku_rd_mod = rp->r_mod;
231 			p->cku_rd_handle = handle;
232 			break;
233 		}
234 	}
235 	rw_exit(&rdma_lock);
236 
237 	if (p->cku_rd_mod == NULL) {
238 		/*
239 		 * Should not happen.
240 		 * No matching RDMATF plugin.
241 		 */
242 		kmem_free(p, sizeof (struct cku_private));
243 		return (EINVAL);
244 	}
245 
246 	h = ptoh(p);
247 	h->cl_ops = &rdma_clnt_ops;
248 	h->cl_private = (caddr_t)p;
249 	h->cl_auth = authkern_create();
250 
251 	/* call message, just used to pre-serialize below */
252 	call_msg.rm_xid = 0;
253 	call_msg.rm_direction = CALL;
254 	call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
255 	call_msg.rm_call.cb_prog = pgm;
256 	call_msg.rm_call.cb_vers = vers;
257 
258 	xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE);
259 	/* pre-serialize call message header */
260 	if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
261 		XDR_DESTROY(&p->cku_outxdr);
262 		auth_destroy(h->cl_auth);
263 		kmem_free(p, sizeof (struct cku_private));
264 		return (EINVAL);
265 	}
266 
267 	/*
268 	 * Set up the rpc information
269 	 */
270 	p->cku_cred = cred;
271 	p->cku_srcaddr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP);
272 	p->cku_srcaddr.maxlen = raddr->maxlen;
273 	p->cku_srcaddr.len = 0;
274 	p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP);
275 	p->cku_addr.maxlen = raddr->maxlen;
276 	p->cku_addr.len = raddr->len;
277 	bcopy(raddr->buf, p->cku_addr.buf, raddr->len);
278 	p->cku_addrfmly = family;
279 
280 	*cl = h;
281 	return (0);
282 }
283 
284 static void
clnt_rdma_kdestroy(CLIENT * h)285 clnt_rdma_kdestroy(CLIENT *h)
286 {
287 	struct cku_private *p = htop(h);
288 
289 	kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
290 	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
291 	kmem_free(p, sizeof (*p));
292 }
293 
294 void
clnt_rdma_kinit(CLIENT * h,char * proto,void * handle,struct netbuf * raddr,struct cred * cred)295 clnt_rdma_kinit(CLIENT *h, char *proto, void *handle, struct netbuf *raddr,
296     struct cred *cred)
297 {
298 	struct cku_private *p = htop(h);
299 	rdma_registry_t *rp;
300 
301 	ASSERT(INGLOBALZONE(curproc));
302 	/*
303 	 * Find underlying RDMATF plugin
304 	 */
305 	p->cku_rd_mod = NULL;
306 	rw_enter(&rdma_lock, RW_READER);
307 	rp = rdma_mod_head;
308 	while (rp != NULL) {
309 		if (strcmp(rp->r_mod->rdma_api, proto))
310 			rp = rp->r_next;
311 		else {
312 			p->cku_rd_mod = rp->r_mod;
313 			p->cku_rd_handle = handle;
314 			break;
315 		}
316 
317 	}
318 	rw_exit(&rdma_lock);
319 
320 	/*
321 	 * Set up the rpc information
322 	 */
323 	p->cku_cred = cred;
324 	p->cku_xid = 0;
325 
326 	if (p->cku_addr.maxlen < raddr->len) {
327 		if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
328 			kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
329 		p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP);
330 		p->cku_addr.maxlen = raddr->maxlen;
331 	}
332 
333 	p->cku_srcaddr.len = 0;
334 
335 	p->cku_addr.len = raddr->len;
336 	bcopy(raddr->buf, p->cku_addr.buf, raddr->len);
337 	h->cl_ops = &rdma_clnt_ops;
338 }
339 
340 static int
clnt_compose_rpcmsg(CLIENT * h,rpcproc_t procnum,rdma_buf_t * rpcmsg,XDR * xdrs,xdrproc_t xdr_args,caddr_t argsp)341 clnt_compose_rpcmsg(CLIENT *h, rpcproc_t procnum,
342     rdma_buf_t *rpcmsg, XDR *xdrs,
343     xdrproc_t xdr_args, caddr_t argsp)
344 {
345 	cku_private_t *p = htop(h);
346 
347 	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
348 		/*
349 		 * Copy in the preserialized RPC header
350 		 * information.
351 		 */
352 		bcopy(p->cku_rpchdr, rpcmsg->addr, CKU_HDRSIZE);
353 
354 		/*
355 		 * transaction id is the 1st thing in the output
356 		 * buffer.
357 		 */
358 		/* LINTED pointer alignment */
359 		(*(uint32_t *)(rpcmsg->addr)) = p->cku_xid;
360 
361 		/* Skip the preserialized stuff. */
362 		XDR_SETPOS(xdrs, CKU_HDRSIZE);
363 
364 		/* Serialize dynamic stuff into the output buffer. */
365 		if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
366 		    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
367 		    (!(*xdr_args)(xdrs, argsp))) {
368 			DTRACE_PROBE(krpc__e__clntrdma__rpcmsg__dynargs);
369 			return (CLNT_RDMA_FAIL);
370 		}
371 		p->cku_outsz = XDR_GETPOS(xdrs);
372 	} else {
373 		uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
374 		IXDR_PUT_U_INT32(uproc, procnum);
375 		(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
376 		XDR_SETPOS(xdrs, 0);
377 
378 		/* Serialize the procedure number and the arguments. */
379 		if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
380 		    CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
381 			if (rpcmsg->addr != xdrs->x_base) {
382 				rpcmsg->addr = xdrs->x_base;
383 				rpcmsg->len = xdr_getbufsize(xdrs);
384 			}
385 			DTRACE_PROBE(krpc__e__clntrdma__rpcmsg__procnum);
386 			return (CLNT_RDMA_FAIL);
387 		}
388 		/*
389 		 * If we had to allocate a new buffer while encoding
390 		 * then update the addr and len.
391 		 */
392 		if (rpcmsg->addr != xdrs->x_base) {
393 			rpcmsg->addr = xdrs->x_base;
394 			rpcmsg->len = xdr_getbufsize(xdrs);
395 		}
396 
397 		p->cku_outsz = XDR_GETPOS(xdrs);
398 		DTRACE_PROBE1(krpc__i__compose__size__sec, int, p->cku_outsz)
399 	}
400 
401 	return (CLNT_RDMA_SUCCESS);
402 }
403 
404 static int
clnt_compose_rdma_header(CONN * conn,CLIENT * h,rdma_buf_t * clmsg,XDR ** xdrs,uint_t * op)405 clnt_compose_rdma_header(CONN *conn, CLIENT *h, rdma_buf_t *clmsg,
406     XDR **xdrs, uint_t *op)
407 {
408 	cku_private_t *p = htop(h);
409 	uint_t vers;
410 	uint32_t rdma_credit = rdma_bufs_rqst;
411 
412 	vers = RPCRDMA_VERS;
413 	clmsg->type = SEND_BUFFER;
414 
415 	if (rdma_buf_alloc(conn, clmsg)) {
416 		return (CLNT_RDMA_FAIL);
417 	}
418 
419 	*xdrs = &p->cku_outxdr;
420 	xdrmem_create(*xdrs, clmsg->addr, clmsg->len, XDR_ENCODE);
421 
422 	(*(uint32_t *)clmsg->addr) = p->cku_xid;
423 	XDR_SETPOS(*xdrs, sizeof (uint32_t));
424 	(void) xdr_u_int(*xdrs, &vers);
425 	(void) xdr_u_int(*xdrs, &rdma_credit);
426 	(void) xdr_u_int(*xdrs, op);
427 
428 	return (CLNT_RDMA_SUCCESS);
429 }
430 
431 /*
432  * If xp_cl is NULL value, then the RPC payload will NOT carry
433  * an RDMA READ chunk list, in this case we insert FALSE into
434  * the XDR stream. Otherwise we use the clist and RDMA register
435  * the memory and encode the clist into the outbound XDR stream.
436  */
437 static int
clnt_setup_rlist(CONN * conn,XDR * xdrs,XDR * call_xdrp)438 clnt_setup_rlist(CONN *conn, XDR *xdrs, XDR *call_xdrp)
439 {
440 	int status;
441 	struct clist *rclp;
442 	int32_t xdr_flag = XDR_RDMA_RLIST_REG;
443 
444 	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &rclp);
445 
446 	if (rclp != NULL) {
447 		status = clist_register(conn, rclp, CLIST_REG_SOURCE);
448 		if (status != RDMA_SUCCESS) {
449 			return (CLNT_RDMA_FAIL);
450 		}
451 		XDR_CONTROL(call_xdrp, XDR_RDMA_SET_FLAGS, &xdr_flag);
452 	}
453 	(void) xdr_do_clist(xdrs, &rclp);
454 
455 	return (CLNT_RDMA_SUCCESS);
456 }
457 
458 /*
459  * If xp_wcl is NULL value, then the RPC payload will NOT carry
460  * an RDMA WRITE chunk list, in this case we insert FALSE into
461  * the XDR stream. Otherwise we use the clist and  RDMA register
462  * the memory and encode the clist into the outbound XDR stream.
463  */
464 static int
clnt_setup_wlist(CONN * conn,XDR * xdrs,XDR * call_xdrp,rdma_buf_t * rndbuf)465 clnt_setup_wlist(CONN *conn, XDR *xdrs, XDR *call_xdrp, rdma_buf_t *rndbuf)
466 {
467 	int status;
468 	struct clist *wlist, *rndcl;
469 	int wlen, rndlen;
470 	int32_t xdr_flag = XDR_RDMA_WLIST_REG;
471 
472 	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_WLIST, &wlist);
473 
474 	if (wlist != NULL) {
475 		/*
476 		 * If we are sending a non 4-byte alligned length
477 		 * the server will roundup the length to 4-byte
478 		 * boundary. In such a case, a trailing chunk is
479 		 * added to take any spill over roundup bytes.
480 		 */
481 		wlen = clist_len(wlist);
482 		rndlen = (roundup(wlen, BYTES_PER_XDR_UNIT) - wlen);
483 		if (rndlen) {
484 			rndcl = clist_alloc();
485 			/*
486 			 * calc_length() will allocate a PAGESIZE
487 			 * buffer below.
488 			 */
489 			rndcl->c_len = calc_length(rndlen);
490 			rndcl->rb_longbuf.type = RDMA_LONG_BUFFER;
491 			rndcl->rb_longbuf.len = rndcl->c_len;
492 			if (rdma_buf_alloc(conn, &rndcl->rb_longbuf)) {
493 				clist_free(rndcl);
494 				return (CLNT_RDMA_FAIL);
495 			}
496 
497 			/* Roundup buffer freed back in caller */
498 			*rndbuf = rndcl->rb_longbuf;
499 
500 			rndcl->u.c_daddr3 = rndcl->rb_longbuf.addr;
501 			rndcl->c_next = NULL;
502 			rndcl->c_dmemhandle = rndcl->rb_longbuf.handle;
503 			wlist->c_next = rndcl;
504 		}
505 
506 		status = clist_register(conn, wlist, CLIST_REG_DST);
507 		if (status != RDMA_SUCCESS) {
508 			rdma_buf_free(conn, rndbuf);
509 			bzero(rndbuf, sizeof (rdma_buf_t));
510 			return (CLNT_RDMA_FAIL);
511 		}
512 		XDR_CONTROL(call_xdrp, XDR_RDMA_SET_FLAGS, &xdr_flag);
513 	}
514 
515 	if (!xdr_encode_wlist(xdrs, wlist)) {
516 		if (rndlen) {
517 			rdma_buf_free(conn, rndbuf);
518 			bzero(rndbuf, sizeof (rdma_buf_t));
519 		}
520 		return (CLNT_RDMA_FAIL);
521 	}
522 
523 	return (CLNT_RDMA_SUCCESS);
524 }
525 
526 static int
clnt_setup_long_reply(CONN * conn,struct clist ** clpp,uint_t length)527 clnt_setup_long_reply(CONN *conn, struct clist **clpp, uint_t length)
528 {
529 	if (length == 0) {
530 		*clpp = NULL;
531 		return (CLNT_RDMA_SUCCESS);
532 	}
533 
534 	*clpp = clist_alloc();
535 
536 	(*clpp)->rb_longbuf.len = calc_length(length);
537 	(*clpp)->rb_longbuf.type = RDMA_LONG_BUFFER;
538 
539 	if (rdma_buf_alloc(conn, &((*clpp)->rb_longbuf))) {
540 		clist_free(*clpp);
541 		*clpp = NULL;
542 		return (CLNT_RDMA_FAIL);
543 	}
544 
545 	(*clpp)->u.c_daddr3 = (*clpp)->rb_longbuf.addr;
546 	(*clpp)->c_len = (*clpp)->rb_longbuf.len;
547 	(*clpp)->c_next = NULL;
548 	(*clpp)->c_dmemhandle = (*clpp)->rb_longbuf.handle;
549 
550 	if (clist_register(conn, *clpp, CLIST_REG_DST)) {
551 		DTRACE_PROBE(krpc__e__clntrdma__longrep_regbuf);
552 		rdma_buf_free(conn, &((*clpp)->rb_longbuf));
553 		clist_free(*clpp);
554 		*clpp = NULL;
555 		return (CLNT_RDMA_FAIL);
556 	}
557 
558 	return (CLNT_RDMA_SUCCESS);
559 }
560 
561 /* ARGSUSED */
562 static enum clnt_stat
clnt_rdma_kcallit(CLIENT * h,rpcproc_t procnum,xdrproc_t xdr_args,caddr_t argsp,xdrproc_t xdr_results,caddr_t resultsp,struct timeval wait)563 clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
564     caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
565     struct timeval wait)
566 {
567 	cku_private_t *p = htop(h);
568 
569 	int 	try_call_again;
570 	int	refresh_attempt = AUTH_REFRESH_COUNT;
571 	int 	status;
572 	int 	msglen;
573 
574 	XDR	*call_xdrp, callxdr; /* for xdrrdma encoding the RPC call */
575 	XDR	*reply_xdrp, replyxdr; /* for xdrrdma decoding the RPC reply */
576 	XDR 	*rdmahdr_o_xdrs, *rdmahdr_i_xdrs;
577 
578 	struct rpc_msg 	reply_msg;
579 	rdma_registry_t	*m;
580 
581 	struct clist *cl_sendlist;
582 	struct clist *cl_recvlist;
583 	struct clist *cl;
584 	struct clist *cl_rpcmsg;
585 	struct clist *cl_rdma_reply;
586 	struct clist *cl_rpcreply_wlist;
587 	struct clist *cl_long_reply;
588 	rdma_buf_t  rndup;
589 
590 	uint_t vers;
591 	uint_t op;
592 	uint_t off;
593 	uint32_t seg_array_len;
594 	uint_t long_reply_len;
595 	uint_t rpcsec_gss;
596 	uint_t gss_i_or_p;
597 
598 	CONN *conn = NULL;
599 	rdma_buf_t clmsg;
600 	rdma_buf_t rpcmsg;
601 	rdma_chunkinfo_lengths_t rcil;
602 
603 	clock_t	ticks;
604 	bool_t wlist_exists_reply;
605 
606 	uint32_t rdma_credit = rdma_bufs_rqst;
607 
608 	RCSTAT_INCR(rccalls);
609 
610 call_again:
611 
612 	bzero(&clmsg, sizeof (clmsg));
613 	bzero(&rpcmsg, sizeof (rpcmsg));
614 	bzero(&rndup, sizeof (rndup));
615 	try_call_again = 0;
616 	cl_sendlist = NULL;
617 	cl_recvlist = NULL;
618 	cl = NULL;
619 	cl_rpcmsg = NULL;
620 	cl_rdma_reply = NULL;
621 	call_xdrp = NULL;
622 	reply_xdrp = NULL;
623 	wlist_exists_reply  = FALSE;
624 	cl_rpcreply_wlist = NULL;
625 	cl_long_reply = NULL;
626 	rcil.rcil_len = 0;
627 	rcil.rcil_len_alt = 0;
628 	long_reply_len = 0;
629 
630 	rw_enter(&rdma_lock, RW_READER);
631 	m = (rdma_registry_t *)p->cku_rd_handle;
632 	if (m->r_mod_state == RDMA_MOD_INACTIVE) {
633 		/*
634 		 * If we didn't find a matching RDMA module in the registry
635 		 * then there is no transport.
636 		 */
637 		rw_exit(&rdma_lock);
638 		p->cku_err.re_status = RPC_CANTSEND;
639 		p->cku_err.re_errno = EIO;
640 		ticks = clnt_rdma_min_delay * drv_usectohz(1000000);
641 		if (h->cl_nosignal == TRUE) {
642 			delay(ticks);
643 		} else {
644 			if (delay_sig(ticks) == EINTR) {
645 				p->cku_err.re_status = RPC_INTR;
646 				p->cku_err.re_errno = EINTR;
647 			}
648 		}
649 		return (RPC_CANTSEND);
650 	}
651 	/*
652 	 * Get unique xid
653 	 */
654 	if (p->cku_xid == 0)
655 		p->cku_xid = alloc_xid();
656 
657 	status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_srcaddr,
658 	    &p->cku_addr, p->cku_addrfmly, p->cku_rd_handle, &conn);
659 	rw_exit(&rdma_lock);
660 
661 	/*
662 	 * If there is a problem with the connection reflect the issue
663 	 * back to the higher level to address, we MAY delay for a short
664 	 * period so that we are kind to the transport.
665 	 */
666 	if (conn == NULL) {
667 		/*
668 		 * Connect failed to server. Could be because of one
669 		 * of several things. In some cases we don't want
670 		 * the caller to retry immediately - delay before
671 		 * returning to caller.
672 		 */
673 		switch (status) {
674 		case RDMA_TIMEDOUT:
675 			/*
676 			 * Already timed out. No need to delay
677 			 * some more.
678 			 */
679 			p->cku_err.re_status = RPC_TIMEDOUT;
680 			p->cku_err.re_errno = ETIMEDOUT;
681 			break;
682 		case RDMA_INTR:
683 			/*
684 			 * Failed because of an signal. Very likely
685 			 * the caller will not retry.
686 			 */
687 			p->cku_err.re_status = RPC_INTR;
688 			p->cku_err.re_errno = EINTR;
689 			break;
690 		default:
691 			/*
692 			 * All other failures - server down or service
693 			 * down or temporary resource failure. Delay before
694 			 * returning to caller.
695 			 */
696 			ticks = clnt_rdma_min_delay * drv_usectohz(1000000);
697 			p->cku_err.re_status = RPC_CANTCONNECT;
698 			p->cku_err.re_errno = EIO;
699 
700 			if (h->cl_nosignal == TRUE) {
701 				delay(ticks);
702 			} else {
703 				if (delay_sig(ticks) == EINTR) {
704 					p->cku_err.re_status = RPC_INTR;
705 					p->cku_err.re_errno = EINTR;
706 				}
707 			}
708 			break;
709 		}
710 
711 		return (p->cku_err.re_status);
712 	}
713 
714 	if (p->cku_srcaddr.maxlen < conn->c_laddr.len) {
715 		if ((p->cku_srcaddr.maxlen != 0) &&
716 		    (p->cku_srcaddr.buf != NULL))
717 			kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
718 		p->cku_srcaddr.buf = kmem_zalloc(conn->c_laddr.maxlen,
719 		    KM_SLEEP);
720 		p->cku_srcaddr.maxlen = conn->c_laddr.maxlen;
721 	}
722 
723 	p->cku_srcaddr.len = conn->c_laddr.len;
724 	bcopy(conn->c_laddr.buf, p->cku_srcaddr.buf, conn->c_laddr.len);
725 
726 	clnt_check_credit(conn);
727 
728 	status = CLNT_RDMA_FAIL;
729 
730 	rpcsec_gss = gss_i_or_p = FALSE;
731 
732 	if (IS_RPCSEC_GSS(h)) {
733 		rpcsec_gss = TRUE;
734 		if (rpc_gss_get_service_type(h->cl_auth) ==
735 		    rpc_gss_svc_integrity ||
736 		    rpc_gss_get_service_type(h->cl_auth) ==
737 		    rpc_gss_svc_privacy)
738 			gss_i_or_p = TRUE;
739 	}
740 
741 	/*
742 	 * Try a regular RDMA message if RPCSEC_GSS is not being used
743 	 * or if RPCSEC_GSS is being used for authentication only.
744 	 */
745 	if (rpcsec_gss == FALSE ||
746 	    (rpcsec_gss == TRUE && gss_i_or_p == FALSE)) {
747 		/*
748 		 * Grab a send buffer for the request.  Try to
749 		 * encode it to see if it fits. If not, then it
750 		 * needs to be sent in a chunk.
751 		 */
752 		rpcmsg.type = SEND_BUFFER;
753 		if (rdma_buf_alloc(conn, &rpcmsg)) {
754 			DTRACE_PROBE(krpc__e__clntrdma__callit_nobufs);
755 			goto done;
756 		}
757 
758 		/* First try to encode into regular send buffer */
759 		op = RDMA_MSG;
760 
761 		call_xdrp = &callxdr;
762 
763 		xdrrdma_create(call_xdrp, rpcmsg.addr, rpcmsg.len,
764 		    rdma_minchunk, NULL, XDR_ENCODE, conn);
765 
766 		status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp,
767 		    xdr_args, argsp);
768 
769 		if (status != CLNT_RDMA_SUCCESS) {
770 			/* Clean up from previous encode attempt */
771 			rdma_buf_free(conn, &rpcmsg);
772 			XDR_DESTROY(call_xdrp);
773 		} else {
774 			XDR_CONTROL(call_xdrp, XDR_RDMA_GET_CHUNK_LEN, &rcil);
775 		}
776 	}
777 
778 	/* If the encode didn't work, then try a NOMSG */
779 	if (status != CLNT_RDMA_SUCCESS) {
780 
781 		msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT + MAX_AUTH_BYTES +
782 		    xdr_sizeof(xdr_args, argsp);
783 
784 		msglen = calc_length(msglen);
785 
786 		/* pick up the lengths for the reply buffer needed */
787 		(void) xdrrdma_sizeof(xdr_args, argsp, 0,
788 		    &rcil.rcil_len, &rcil.rcil_len_alt);
789 
790 		/*
791 		 * Construct a clist to describe the CHUNK_BUFFER
792 		 * for the rpcmsg.
793 		 */
794 		cl_rpcmsg = clist_alloc();
795 		cl_rpcmsg->c_len = msglen;
796 		cl_rpcmsg->rb_longbuf.type = RDMA_LONG_BUFFER;
797 		cl_rpcmsg->rb_longbuf.len = msglen;
798 		if (rdma_buf_alloc(conn, &cl_rpcmsg->rb_longbuf)) {
799 			clist_free(cl_rpcmsg);
800 			goto done;
801 		}
802 		cl_rpcmsg->w.c_saddr3 = cl_rpcmsg->rb_longbuf.addr;
803 
804 		op = RDMA_NOMSG;
805 		call_xdrp = &callxdr;
806 
807 		xdrrdma_create(call_xdrp, cl_rpcmsg->rb_longbuf.addr,
808 		    cl_rpcmsg->rb_longbuf.len, 0,
809 		    cl_rpcmsg, XDR_ENCODE, conn);
810 
811 		status = clnt_compose_rpcmsg(h, procnum, &cl_rpcmsg->rb_longbuf,
812 		    call_xdrp, xdr_args, argsp);
813 
814 		DTRACE_PROBE2(krpc__i__clntrdma__callit__longbuf, int, status,
815 		    int, msglen);
816 		if (status != CLNT_RDMA_SUCCESS) {
817 			p->cku_err.re_status = RPC_CANTENCODEARGS;
818 			p->cku_err.re_errno = EIO;
819 			DTRACE_PROBE(krpc__e__clntrdma__callit__composemsg);
820 			goto done;
821 		}
822 	}
823 
824 	/*
825 	 * During the XDR_ENCODE we may have "allocated" an RDMA READ or
826 	 * RDMA WRITE clist.
827 	 *
828 	 * First pull the RDMA READ chunk list from the XDR private
829 	 * area to keep it handy.
830 	 */
831 	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &cl);
832 
833 	if (gss_i_or_p) {
834 		long_reply_len = rcil.rcil_len + rcil.rcil_len_alt;
835 		long_reply_len += MAX_AUTH_BYTES;
836 	} else {
837 		long_reply_len = rcil.rcil_len;
838 	}
839 
840 	/*
841 	 * Update the chunk size information for the Long RPC msg.
842 	 */
843 	if (cl && op == RDMA_NOMSG)
844 		cl->c_len = p->cku_outsz;
845 
846 	/*
847 	 * Prepare the RDMA header. On success xdrs will hold the result
848 	 * of xdrmem_create() for a SEND_BUFFER.
849 	 */
850 	status = clnt_compose_rdma_header(conn, h, &clmsg,
851 	    &rdmahdr_o_xdrs, &op);
852 
853 	if (status != CLNT_RDMA_SUCCESS) {
854 		p->cku_err.re_status = RPC_CANTSEND;
855 		p->cku_err.re_errno = EIO;
856 		RCSTAT_INCR(rcnomem);
857 		DTRACE_PROBE(krpc__e__clntrdma__callit__nobufs2);
858 		goto done;
859 	}
860 
861 	/*
862 	 * Now insert the RDMA READ list iff present
863 	 */
864 	status = clnt_setup_rlist(conn, rdmahdr_o_xdrs, call_xdrp);
865 	if (status != CLNT_RDMA_SUCCESS) {
866 		DTRACE_PROBE(krpc__e__clntrdma__callit__clistreg);
867 		rdma_buf_free(conn, &clmsg);
868 		p->cku_err.re_status = RPC_CANTSEND;
869 		p->cku_err.re_errno = EIO;
870 		goto done;
871 	}
872 
873 	/*
874 	 * Setup RDMA WRITE chunk list for nfs read operation
875 	 * other operations will have a NULL which will result
876 	 * as a NULL list in the XDR stream.
877 	 */
878 	status = clnt_setup_wlist(conn, rdmahdr_o_xdrs, call_xdrp, &rndup);
879 	if (status != CLNT_RDMA_SUCCESS) {
880 		rdma_buf_free(conn, &clmsg);
881 		p->cku_err.re_status = RPC_CANTSEND;
882 		p->cku_err.re_errno = EIO;
883 		goto done;
884 	}
885 
886 	/*
887 	 * If NULL call and RPCSEC_GSS, provide a chunk such that
888 	 * large responses can flow back to the client.
889 	 * If RPCSEC_GSS with integrity or privacy is in use, get chunk.
890 	 */
891 	if ((procnum == 0 && rpcsec_gss == TRUE) ||
892 	    (rpcsec_gss == TRUE && gss_i_or_p == TRUE))
893 		long_reply_len += 1024;
894 
895 	status = clnt_setup_long_reply(conn, &cl_long_reply, long_reply_len);
896 
897 	DTRACE_PROBE2(krpc__i__clntrdma__callit__longreply, int, status,
898 	    int, long_reply_len);
899 
900 	if (status != CLNT_RDMA_SUCCESS) {
901 		rdma_buf_free(conn, &clmsg);
902 		p->cku_err.re_status = RPC_CANTSEND;
903 		p->cku_err.re_errno = EIO;
904 		goto done;
905 	}
906 
907 	/*
908 	 * XDR encode the RDMA_REPLY write chunk
909 	 */
910 	seg_array_len = (cl_long_reply ? 1 : 0);
911 	(void) xdr_encode_reply_wchunk(rdmahdr_o_xdrs, cl_long_reply,
912 	    seg_array_len);
913 
914 	/*
915 	 * Construct a clist in "sendlist" that represents what we
916 	 * will push over the wire.
917 	 *
918 	 * Start with the RDMA header and clist (if any)
919 	 */
920 	clist_add(&cl_sendlist, 0, XDR_GETPOS(rdmahdr_o_xdrs), &clmsg.handle,
921 	    clmsg.addr, NULL, NULL);
922 
923 	/*
924 	 * Put the RPC call message in  sendlist if small RPC
925 	 */
926 	if (op == RDMA_MSG) {
927 		clist_add(&cl_sendlist, 0, p->cku_outsz, &rpcmsg.handle,
928 		    rpcmsg.addr, NULL, NULL);
929 	} else {
930 		/* Long RPC already in chunk list */
931 		RCSTAT_INCR(rclongrpcs);
932 	}
933 
934 	/*
935 	 * Set up a reply buffer ready for the reply
936 	 */
937 	status = rdma_clnt_postrecv(conn, p->cku_xid);
938 	if (status != RDMA_SUCCESS) {
939 		rdma_buf_free(conn, &clmsg);
940 		p->cku_err.re_status = RPC_CANTSEND;
941 		p->cku_err.re_errno = EIO;
942 		goto done;
943 	}
944 
945 	/*
946 	 * sync the memory for dma
947 	 */
948 	if (cl != NULL) {
949 		status = clist_syncmem(conn, cl, CLIST_REG_SOURCE);
950 		if (status != RDMA_SUCCESS) {
951 			(void) rdma_clnt_postrecv_remove(conn, p->cku_xid);
952 			rdma_buf_free(conn, &clmsg);
953 			p->cku_err.re_status = RPC_CANTSEND;
954 			p->cku_err.re_errno = EIO;
955 			goto done;
956 		}
957 	}
958 
959 	/*
960 	 * Send the RDMA Header and RPC call message to the server
961 	 */
962 	status = RDMA_SEND(conn, cl_sendlist, p->cku_xid);
963 	if (status != RDMA_SUCCESS) {
964 		(void) rdma_clnt_postrecv_remove(conn, p->cku_xid);
965 		p->cku_err.re_status = RPC_CANTSEND;
966 		p->cku_err.re_errno = EIO;
967 		goto done;
968 	}
969 
970 	/*
971 	 * RDMA plugin now owns the send msg buffers.
972 	 * Clear them out and don't free them.
973 	 */
974 	clmsg.addr = NULL;
975 	if (rpcmsg.type == SEND_BUFFER)
976 		rpcmsg.addr = NULL;
977 
978 	/*
979 	 * Recv rpc reply
980 	 */
981 	status = RDMA_RECV(conn, &cl_recvlist, p->cku_xid);
982 
983 	/*
984 	 * Now check recv status
985 	 */
986 	if (status != 0) {
987 		if (status == RDMA_INTR) {
988 			p->cku_err.re_status = RPC_INTR;
989 			p->cku_err.re_errno = EINTR;
990 			RCSTAT_INCR(rcintrs);
991 		} else if (status == RPC_TIMEDOUT) {
992 			p->cku_err.re_status = RPC_TIMEDOUT;
993 			p->cku_err.re_errno = ETIMEDOUT;
994 			RCSTAT_INCR(rctimeouts);
995 		} else {
996 			p->cku_err.re_status = RPC_CANTRECV;
997 			p->cku_err.re_errno = EIO;
998 		}
999 		goto done;
1000 	}
1001 
1002 	/*
1003 	 * Process the reply message.
1004 	 *
1005 	 * First the chunk list (if any)
1006 	 */
1007 	rdmahdr_i_xdrs = &(p->cku_inxdr);
1008 	xdrmem_create(rdmahdr_i_xdrs,
1009 	    (caddr_t)(uintptr_t)cl_recvlist->w.c_saddr3,
1010 	    cl_recvlist->c_len, XDR_DECODE);
1011 
1012 	/*
1013 	 * Treat xid as opaque (xid is the first entity
1014 	 * in the rpc rdma message).
1015 	 * Skip xid and set the xdr position accordingly.
1016 	 */
1017 	XDR_SETPOS(rdmahdr_i_xdrs, sizeof (uint32_t));
1018 	(void) xdr_u_int(rdmahdr_i_xdrs, &vers);
1019 	(void) xdr_u_int(rdmahdr_i_xdrs, &rdma_credit);
1020 	(void) xdr_u_int(rdmahdr_i_xdrs, &op);
1021 	(void) xdr_do_clist(rdmahdr_i_xdrs, &cl);
1022 
1023 	clnt_update_credit(conn, rdma_credit);
1024 
1025 	wlist_exists_reply = FALSE;
1026 	if (! xdr_decode_wlist(rdmahdr_i_xdrs, &cl_rpcreply_wlist,
1027 	    &wlist_exists_reply)) {
1028 		DTRACE_PROBE(krpc__e__clntrdma__callit__wlist_decode);
1029 		p->cku_err.re_status = RPC_CANTDECODERES;
1030 		p->cku_err.re_errno = EIO;
1031 		goto done;
1032 	}
1033 
1034 	/*
1035 	 * The server shouldn't have sent a RDMA_SEND that
1036 	 * the client needs to RDMA_WRITE a reply back to
1037 	 * the server.  So silently ignoring what the
1038 	 * server returns in the rdma_reply section of the
1039 	 * header.
1040 	 */
1041 	(void) xdr_decode_reply_wchunk(rdmahdr_i_xdrs, &cl_rdma_reply);
1042 	off = xdr_getpos(rdmahdr_i_xdrs);
1043 
1044 	clnt_decode_long_reply(conn, cl_long_reply,
1045 	    cl_rdma_reply, &replyxdr, &reply_xdrp,
1046 	    cl, cl_recvlist, op, off);
1047 
1048 	if (reply_xdrp == NULL)
1049 		goto done;
1050 
1051 	if (wlist_exists_reply) {
1052 		XDR_CONTROL(reply_xdrp, XDR_RDMA_SET_WLIST, cl_rpcreply_wlist);
1053 	}
1054 
1055 	reply_msg.rm_direction = REPLY;
1056 	reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
1057 	reply_msg.acpted_rply.ar_stat = SUCCESS;
1058 	reply_msg.acpted_rply.ar_verf = _null_auth;
1059 
1060 	/*
1061 	 *  xdr_results will be done in AUTH_UNWRAP.
1062 	 */
1063 	reply_msg.acpted_rply.ar_results.where = NULL;
1064 	reply_msg.acpted_rply.ar_results.proc = xdr_void;
1065 
1066 	/*
1067 	 * Decode and validate the response.
1068 	 */
1069 	if (xdr_replymsg(reply_xdrp, &reply_msg)) {
1070 		enum clnt_stat re_status;
1071 
1072 		_seterr_reply(&reply_msg, &(p->cku_err));
1073 
1074 		re_status = p->cku_err.re_status;
1075 		if (re_status == RPC_SUCCESS) {
1076 			/*
1077 			 * Reply is good, check auth.
1078 			 */
1079 			if (!AUTH_VALIDATE(h->cl_auth,
1080 			    &reply_msg.acpted_rply.ar_verf)) {
1081 				p->cku_err.re_status = RPC_AUTHERROR;
1082 				p->cku_err.re_why = AUTH_INVALIDRESP;
1083 				RCSTAT_INCR(rcbadverfs);
1084 				DTRACE_PROBE(
1085 				    krpc__e__clntrdma__callit__authvalidate);
1086 			} else if (!AUTH_UNWRAP(h->cl_auth, reply_xdrp,
1087 			    xdr_results, resultsp)) {
1088 				p->cku_err.re_status = RPC_CANTDECODERES;
1089 				p->cku_err.re_errno = EIO;
1090 				DTRACE_PROBE(
1091 				    krpc__e__clntrdma__callit__authunwrap);
1092 			}
1093 		} else {
1094 			/* set errno in case we can't recover */
1095 			if (re_status != RPC_VERSMISMATCH &&
1096 			    re_status != RPC_AUTHERROR &&
1097 			    re_status != RPC_PROGVERSMISMATCH)
1098 				p->cku_err.re_errno = EIO;
1099 
1100 			if (re_status == RPC_AUTHERROR) {
1101 				if ((refresh_attempt > 0) &&
1102 				    AUTH_REFRESH(h->cl_auth, &reply_msg,
1103 				    p->cku_cred)) {
1104 					refresh_attempt--;
1105 					try_call_again = 1;
1106 					goto done;
1107 				}
1108 
1109 				try_call_again = 0;
1110 
1111 				/*
1112 				 * We have used the client handle to
1113 				 * do an AUTH_REFRESH and the RPC status may
1114 				 * be set to RPC_SUCCESS; Let's make sure to
1115 				 * set it to RPC_AUTHERROR.
1116 				 */
1117 				p->cku_err.re_status = RPC_AUTHERROR;
1118 
1119 				/*
1120 				 * Map recoverable and unrecoverable
1121 				 * authentication errors to appropriate
1122 				 * errno
1123 				 */
1124 				switch (p->cku_err.re_why) {
1125 				case AUTH_BADCRED:
1126 				case AUTH_BADVERF:
1127 				case AUTH_INVALIDRESP:
1128 				case AUTH_TOOWEAK:
1129 				case AUTH_FAILED:
1130 				case RPCSEC_GSS_NOCRED:
1131 				case RPCSEC_GSS_FAILED:
1132 					p->cku_err.re_errno = EACCES;
1133 					break;
1134 				case AUTH_REJECTEDCRED:
1135 				case AUTH_REJECTEDVERF:
1136 				default:
1137 					p->cku_err.re_errno = EIO;
1138 					break;
1139 				}
1140 			}
1141 			DTRACE_PROBE1(krpc__e__clntrdma__callit__rpcfailed,
1142 			    int, p->cku_err.re_why);
1143 		}
1144 	} else {
1145 		p->cku_err.re_status = RPC_CANTDECODERES;
1146 		p->cku_err.re_errno = EIO;
1147 		DTRACE_PROBE(krpc__e__clntrdma__callit__replymsg);
1148 	}
1149 
1150 done:
1151 	clnt_return_credit(conn);
1152 
1153 	if (cl_sendlist != NULL)
1154 		clist_free(cl_sendlist);
1155 
1156 	/*
1157 	 * If rpc reply is in a chunk, free it now.
1158 	 */
1159 	if (cl_long_reply) {
1160 		(void) clist_deregister(conn, cl_long_reply);
1161 		rdma_buf_free(conn, &cl_long_reply->rb_longbuf);
1162 		clist_free(cl_long_reply);
1163 	}
1164 
1165 	if (call_xdrp)
1166 		XDR_DESTROY(call_xdrp);
1167 
1168 	if (rndup.rb_private) {
1169 		rdma_buf_free(conn, &rndup);
1170 	}
1171 
1172 	if (reply_xdrp) {
1173 		(void) xdr_rpc_free_verifier(reply_xdrp, &reply_msg);
1174 		XDR_DESTROY(reply_xdrp);
1175 	}
1176 
1177 	if (cl_rdma_reply) {
1178 		clist_free(cl_rdma_reply);
1179 	}
1180 
1181 	if (cl_recvlist) {
1182 		rdma_buf_t	recvmsg = {0};
1183 		recvmsg.addr = (caddr_t)(uintptr_t)cl_recvlist->w.c_saddr3;
1184 		recvmsg.type = RECV_BUFFER;
1185 		RDMA_BUF_FREE(conn, &recvmsg);
1186 		clist_free(cl_recvlist);
1187 	}
1188 
1189 	RDMA_REL_CONN(conn);
1190 
1191 	if (try_call_again)
1192 		goto call_again;
1193 
1194 	if (p->cku_err.re_status != RPC_SUCCESS) {
1195 		RCSTAT_INCR(rcbadcalls);
1196 	}
1197 	return (p->cku_err.re_status);
1198 }
1199 
1200 
1201 static void
clnt_decode_long_reply(CONN * conn,struct clist * cl_long_reply,struct clist * cl_rdma_reply,XDR * xdrs,XDR ** rxdrp,struct clist * cl,struct clist * cl_recvlist,uint_t op,uint_t off)1202 clnt_decode_long_reply(CONN *conn,
1203     struct clist *cl_long_reply,
1204     struct clist *cl_rdma_reply, XDR *xdrs,
1205     XDR **rxdrp, struct clist *cl,
1206     struct clist *cl_recvlist,
1207     uint_t  op, uint_t off)
1208 {
1209 	if (op != RDMA_NOMSG) {
1210 		DTRACE_PROBE1(krpc__i__longrepl__rdmamsg__len,
1211 		    int, cl_recvlist->c_len - off);
1212 		xdrrdma_create(xdrs,
1213 		    (caddr_t)(uintptr_t)(cl_recvlist->w.c_saddr3 + off),
1214 		    cl_recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
1215 		*rxdrp = xdrs;
1216 		return;
1217 	}
1218 
1219 	/* op must be RDMA_NOMSG */
1220 	if (cl) {
1221 		DTRACE_PROBE(krpc__e__clntrdma__declongreply__serverreadlist);
1222 		return;
1223 	}
1224 
1225 	if (cl_long_reply->u.c_daddr) {
1226 		DTRACE_PROBE1(krpc__i__longrepl__rdmanomsg__len,
1227 		    int, cl_rdma_reply->c_len);
1228 
1229 		xdrrdma_create(xdrs, (caddr_t)cl_long_reply->u.c_daddr3,
1230 		    cl_rdma_reply->c_len, 0, NULL, XDR_DECODE, conn);
1231 
1232 		*rxdrp = xdrs;
1233 	}
1234 }
1235 
1236 static void
clnt_return_credit(CONN * conn)1237 clnt_return_credit(CONN *conn)
1238 {
1239 	rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
1240 
1241 	mutex_enter(&conn->c_lock);
1242 	cc_info->clnt_cc_in_flight_ops--;
1243 	cv_signal(&cc_info->clnt_cc_cv);
1244 	mutex_exit(&conn->c_lock);
1245 }
1246 
1247 static void
clnt_update_credit(CONN * conn,uint32_t rdma_credit)1248 clnt_update_credit(CONN *conn, uint32_t rdma_credit)
1249 {
1250 	rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
1251 
1252 	/*
1253 	 * If the granted has not altered, avoid taking the
1254 	 * mutex, to essentially do nothing..
1255 	 */
1256 	if (cc_info->clnt_cc_granted_ops == rdma_credit)
1257 		return;
1258 	/*
1259 	 * Get the granted number of buffers for credit control.
1260 	 */
1261 	mutex_enter(&conn->c_lock);
1262 	cc_info->clnt_cc_granted_ops = rdma_credit;
1263 	mutex_exit(&conn->c_lock);
1264 }
1265 
1266 static void
clnt_check_credit(CONN * conn)1267 clnt_check_credit(CONN *conn)
1268 {
1269 	rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
1270 
1271 	/*
1272 	 * Make sure we are not going over our allowed buffer use
1273 	 * (and make sure we have gotten a granted value before).
1274 	 */
1275 	mutex_enter(&conn->c_lock);
1276 	while (cc_info->clnt_cc_in_flight_ops >= cc_info->clnt_cc_granted_ops &&
1277 	    cc_info->clnt_cc_granted_ops != 0) {
1278 		/*
1279 		 * Client has maxed out its granted buffers due to
1280 		 * credit control.  Current handling is to block and wait.
1281 		 */
1282 		cv_wait(&cc_info->clnt_cc_cv, &conn->c_lock);
1283 	}
1284 	cc_info->clnt_cc_in_flight_ops++;
1285 	mutex_exit(&conn->c_lock);
1286 }
1287 
1288 /* ARGSUSED */
1289 static void
clnt_rdma_kabort(CLIENT * h)1290 clnt_rdma_kabort(CLIENT *h)
1291 {
1292 }
1293 
1294 static void
clnt_rdma_kerror(CLIENT * h,struct rpc_err * err)1295 clnt_rdma_kerror(CLIENT *h, struct rpc_err *err)
1296 {
1297 	struct cku_private *p = htop(h);
1298 	*err = p->cku_err;
1299 }
1300 
1301 static bool_t
clnt_rdma_kfreeres(CLIENT * h,xdrproc_t xdr_res,caddr_t res_ptr)1302 clnt_rdma_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
1303 {
1304 	struct cku_private *p = htop(h);
1305 	XDR *xdrs;
1306 
1307 	xdrs = &(p->cku_outxdr);
1308 	xdrs->x_op = XDR_FREE;
1309 	return ((*xdr_res)(xdrs, res_ptr));
1310 }
1311 
1312 /* ARGSUSED */
1313 static bool_t
clnt_rdma_kcontrol(CLIENT * h,int cmd,char * arg)1314 clnt_rdma_kcontrol(CLIENT *h, int cmd, char *arg)
1315 {
1316 	return (TRUE);
1317 }
1318 
1319 /* ARGSUSED */
1320 static int
clnt_rdma_ksettimers(CLIENT * h,struct rpc_timers * t,struct rpc_timers * all,int minimum,void (* feedback)(int,int,caddr_t),caddr_t arg,uint32_t xid)1321 clnt_rdma_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
1322 	int minimum, void(*feedback)(int, int, caddr_t), caddr_t arg,
1323 	uint32_t xid)
1324 {
1325 	RCSTAT_INCR(rctimers);
1326 	return (0);
1327 }
1328 
1329 int
rdma_reachable(int addr_type,struct netbuf * addr,struct knetconfig ** knconf)1330 rdma_reachable(int addr_type, struct netbuf *addr, struct knetconfig **knconf)
1331 {
1332 	rdma_registry_t	*rp;
1333 	void *handle = NULL;
1334 	struct knetconfig *knc;
1335 	char *pf, *p;
1336 	rdma_stat status;
1337 	int error = 0;
1338 
1339 	if (!INGLOBALZONE(curproc))
1340 		return (-1);
1341 
1342 	/*
1343 	 * modload the RDMA plugins if not already done.
1344 	 */
1345 	if (!rdma_modloaded) {
1346 		mutex_enter(&rdma_modload_lock);
1347 		if (!rdma_modloaded) {
1348 			error = rdma_modload();
1349 		}
1350 		mutex_exit(&rdma_modload_lock);
1351 		if (error)
1352 			return (-1);
1353 	}
1354 
1355 	if (!rdma_dev_available)
1356 		return (-1);
1357 
1358 	rw_enter(&rdma_lock, RW_READER);
1359 	rp = rdma_mod_head;
1360 	while (rp != NULL) {
1361 		if (rp->r_mod_state == RDMA_MOD_INACTIVE) {
1362 			rp = rp->r_next;
1363 			continue;
1364 		}
1365 		status = RDMA_REACHABLE(rp->r_mod->rdma_ops, addr_type, addr,
1366 		    &handle);
1367 		if (status == RDMA_SUCCESS) {
1368 			knc = kmem_zalloc(sizeof (struct knetconfig),
1369 			    KM_SLEEP);
1370 			knc->knc_semantics = NC_TPI_RDMA;
1371 			pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1372 			p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1373 			if (addr_type == AF_INET)
1374 				(void) strncpy(pf, NC_INET, KNC_STRSIZE);
1375 			else if (addr_type == AF_INET6)
1376 				(void) strncpy(pf, NC_INET6, KNC_STRSIZE);
1377 			pf[KNC_STRSIZE - 1] = '\0';
1378 
1379 			(void) strncpy(p, rp->r_mod->rdma_api, KNC_STRSIZE);
1380 			p[KNC_STRSIZE - 1] = '\0';
1381 
1382 			knc->knc_protofmly = pf;
1383 			knc->knc_proto = p;
1384 			knc->knc_rdev = (dev_t)rp;
1385 			*knconf = knc;
1386 			rw_exit(&rdma_lock);
1387 			return (0);
1388 		}
1389 		rp = rp->r_next;
1390 	}
1391 	rw_exit(&rdma_lock);
1392 	return (-1);
1393 }
1394