xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision 69914347)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
91 typedef struct rpcib_ipaddrs {
92 	void	*ri_list;	/* pointer to list of addresses */
93 	uint_t	ri_count;	/* number of addresses in list */
94 	uint_t	ri_size;	/* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96 
97 
98 typedef struct rpcib_ping {
99 	rib_hca_t  *hca;
100 	ibt_path_info_t path;
101 	ibt_ip_addr_t srcip;
102 	ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 		struct netbuf *d_svcaddr, CONN **conn);
121 
122 struct {
123 	kstat_named_t cache_limit;
124 	kstat_named_t cache_allocation;
125 	kstat_named_t cache_hits;
126 	kstat_named_t cache_misses;
127 	kstat_named_t cache_misses_above_the_limit;
128 } rpcib_kstat = {
129 	{"cache_limit",			KSTAT_DATA_UINT64 },
130 	{"cache_allocation",		KSTAT_DATA_UINT64 },
131 	{"cache_hits",			KSTAT_DATA_UINT64 },
132 	{"cache_misses",		KSTAT_DATA_UINT64 },
133 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 };
135 
136 /* rpcib cb_ops */
137 static struct cb_ops rpcib_cbops = {
138 	nulldev,		/* open */
139 	nulldev,		/* close */
140 	nodev,			/* strategy */
141 	nodev,			/* print */
142 	nodev,			/* dump */
143 	nodev,			/* read */
144 	nodev,			/* write */
145 	nodev,			/* ioctl */
146 	nodev,			/* devmap */
147 	nodev,			/* mmap */
148 	nodev,			/* segmap */
149 	nochpoll,		/* poll */
150 	ddi_prop_op,		/* prop_op */
151 	NULL,			/* stream */
152 	D_MP,			/* cb_flag */
153 	CB_REV,			/* rev */
154 	nodev,			/* int (*cb_aread)() */
155 	nodev			/* int (*cb_awrite)() */
156 };
157 
158 /*
159  * Device options
160  */
161 static struct dev_ops rpcib_ops = {
162 	DEVO_REV,		/* devo_rev, */
163 	0,			/* refcnt  */
164 	rpcib_getinfo,		/* info */
165 	nulldev,		/* identify */
166 	nulldev,		/* probe */
167 	rpcib_attach,		/* attach */
168 	rpcib_detach,		/* detach */
169 	nodev,			/* reset */
170 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
171 	NULL,			/* bus operations */
172 	NULL,			/* power */
173 	ddi_quiesce_not_needed,		/* quiesce */
174 };
175 
176 /*
177  * Module linkage information.
178  */
179 
180 static struct modldrv rib_modldrv = {
181 	&mod_driverops,		/* Driver module */
182 	"RPCIB plugin driver",	/* Driver name and version */
183 	&rpcib_ops,		/* Driver ops */
184 };
185 
186 static struct modlinkage rib_modlinkage = {
187 	MODREV_1,
188 	(void *)&rib_modldrv,
189 	NULL
190 };
191 
192 typedef struct rib_lrc_entry {
193 	struct rib_lrc_entry *forw;
194 	struct rib_lrc_entry *back;
195 	char *lrc_buf;
196 
197 	uint32_t lrc_len;
198 	void  *avl_node;
199 	bool_t registered;
200 
201 	struct mrc lrc_mhandle;
202 	bool_t lrc_on_freed_list;
203 } rib_lrc_entry_t;
204 
205 typedef	struct cache_struct	{
206 	rib_lrc_entry_t		r;
207 	uint32_t		len;
208 	uint32_t		elements;
209 	kmutex_t		node_lock;
210 	avl_node_t		avl_link;
211 } cache_avl_struct_t;
212 
213 uint64_t	cache_limit = 100 * 1024 * 1024;
214 static uint64_t	cache_watermark = 80 * 1024 * 1024;
215 static bool_t	stats_enabled = FALSE;
216 
217 static uint64_t max_unsignaled_rws = 5;
218 int nfs_rdma_port = NFS_RDMA_PORT;
219 
220 #define	RIBNETID_TCP	"tcp"
221 #define	RIBNETID_TCP6	"tcp6"
222 
223 /*
224  * rib_stat: private data pointer used when registering
225  *	with the IBTF.  It is returned to the consumer
226  *	in all callbacks.
227  */
228 static rpcib_state_t *rib_stat = NULL;
229 
230 #define	RNR_RETRIES	IBT_RNR_RETRY_1
231 #define	MAX_PORTS	2
232 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
233 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
234 
235 int preposted_rbufs = RDMA_BUFS_GRANT;
236 int send_threshold = 1;
237 
238 /*
239  * Old cards with Tavor driver have limited memory footprint
240  * when booted in 32bit. The rib_max_rbufs tunable can be
241  * tuned for more buffers if needed.
242  */
243 
244 #if !defined(_ELF64) && !defined(__sparc)
245 int rib_max_rbufs = MAX_BUFS;
246 #else
247 int rib_max_rbufs = 10 * MAX_BUFS;
248 #endif	/* !(_ELF64) && !(__sparc) */
249 
250 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
251 
252 /*
253  * State of the plugin.
254  * ACCEPT = accepting new connections and requests.
255  * NO_ACCEPT = not accepting new connection and requests.
256  * This should eventually move to rpcib_state_t structure, since this
257  * will tell in which state the plugin is for a particular type of service
258  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
259  * state for one and in no_accept state for the other.
260  */
261 int		plugin_state;
262 kmutex_t	plugin_state_lock;
263 
264 ldi_ident_t rpcib_li;
265 
266 /*
267  * RPCIB RDMATF operations
268  */
269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 static rdma_stat rib_disconnect(CONN *conn);
271 static void rib_listen(struct rdma_svc_data *rd);
272 static void rib_listen_stop(struct rdma_svc_data *rd);
273 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
274 	uint_t buflen, struct mrc *buf_handle);
275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 	struct mrc buf_handle);
277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 		struct mrc buf_handle);
281 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
282 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 	void *lrc);
284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 	caddr_t buf, int len, int cpu);
288 
289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
290 
291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
293 
294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
295 
296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
306 	int addr_type, void *, CONN **);
307 static rdma_stat rib_conn_release(CONN *conn);
308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
309 	rpcib_ping_t *, CONN **);
310 static rdma_stat rib_getinfo(rdma_info_t *info);
311 
312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
314 static void rib_destroy_cache(rib_hca_t *hca);
315 static	void	rib_server_side_cache_reclaim(void *argp);
316 static int avl_compare(const void *t1, const void *t2);
317 
318 static void rib_stop_services(rib_hca_t *);
319 static void rib_close_channels(rib_conn_list_t *);
320 static void rib_conn_close(void *);
321 static void rib_recv_rele(rib_qp_t *);
322 static rdma_stat rib_conn_release_locked(CONN *conn);
323 
324 /*
325  * RPCIB addressing operations
326  */
327 
328 /*
329  * RDMA operations the RPCIB module exports
330  */
331 static rdmaops_t rib_ops = {
332 	rib_reachable,
333 	rib_conn_get,
334 	rib_conn_release,
335 	rib_listen,
336 	rib_listen_stop,
337 	rib_registermem,
338 	rib_deregistermem,
339 	rib_registermemsync,
340 	rib_deregistermemsync,
341 	rib_syncmem,
342 	rib_reg_buf_alloc,
343 	rib_reg_buf_free,
344 	rib_send,
345 	rib_send_resp,
346 	rib_post_resp,
347 	rib_post_resp_remove,
348 	rib_post_recv,
349 	rib_recv,
350 	rib_read,
351 	rib_write,
352 	rib_getinfo,
353 };
354 
355 /*
356  * RDMATF RPCIB plugin details
357  */
358 static rdma_mod_t rib_mod = {
359 	"ibtf",		/* api name */
360 	RDMATF_VERS_1,
361 	0,
362 	&rib_ops,	/* rdma op vector for ibtf */
363 };
364 
365 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
366 static rdma_stat rib_qp_init(rib_qp_t *, int);
367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
373 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
375 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
378 	rib_qp_t **);
379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
380 	rib_qp_t **);
381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
383 static int rib_free_sendwait(struct send_wid *);
384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
386 static void rdma_done_rem_list(rib_qp_t *);
387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
388 
389 static void rib_async_handler(void *,
390 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
393 static int rib_free_svc_recv(struct svc_recv *);
394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
395 static void rib_free_wid(struct recv_wid *);
396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
397 static void rib_detach_hca(ibt_hca_hdl_t);
398 static void rib_close_a_channel(CONN *);
399 static void rib_send_hold(rib_qp_t *);
400 static void rib_send_rele(rib_qp_t *);
401 
402 /*
403  * Registration with IBTF as a consumer
404  */
405 static struct ibt_clnt_modinfo_s rib_modinfo = {
406 	IBTI_V_CURR,
407 	IBT_GENERIC,
408 	rib_async_handler,	/* async event handler */
409 	NULL,			/* Memory Region Handler */
410 	"nfs/ib"
411 };
412 
413 /*
414  * Global structure
415  */
416 
417 typedef struct rpcib_s {
418 	dev_info_t	*rpcib_dip;
419 	kmutex_t	rpcib_mutex;
420 } rpcib_t;
421 
422 rpcib_t rpcib;
423 
424 /*
425  * /etc/system controlled variable to control
426  * debugging in rpcib kernel module.
427  * Set it to values greater than 1 to control
428  * the amount of debugging messages required.
429  */
430 int rib_debug = 0;
431 
432 int
_init(void)433 _init(void)
434 {
435 	int error;
436 
437 	error = mod_install((struct modlinkage *)&rib_modlinkage);
438 	if (error != 0) {
439 		/*
440 		 * Could not load module
441 		 */
442 		return (error);
443 	}
444 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
445 	return (0);
446 }
447 
448 int
_fini()449 _fini()
450 {
451 	int status;
452 
453 	/*
454 	 * Remove module
455 	 */
456 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
457 		return (status);
458 	}
459 	mutex_destroy(&plugin_state_lock);
460 	return (0);
461 }
462 
463 int
_info(struct modinfo * modinfop)464 _info(struct modinfo *modinfop)
465 {
466 	return (mod_info(&rib_modlinkage, modinfop));
467 }
468 
469 /*
470  * rpcib_getinfo()
471  * Given the device number, return the devinfo pointer or the
472  * instance number.
473  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
474  */
475 
476 /*ARGSUSED*/
477 static int
rpcib_getinfo(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
479 {
480 	int ret = DDI_SUCCESS;
481 
482 	switch (cmd) {
483 	case DDI_INFO_DEVT2DEVINFO:
484 		if (rpcib.rpcib_dip != NULL)
485 			*result = rpcib.rpcib_dip;
486 		else {
487 			*result = NULL;
488 			ret = DDI_FAILURE;
489 		}
490 		break;
491 
492 	case DDI_INFO_DEVT2INSTANCE:
493 		*result = NULL;
494 		break;
495 
496 	default:
497 		ret = DDI_FAILURE;
498 	}
499 	return (ret);
500 }
501 
502 static void
rpcib_free_hca_list()503 rpcib_free_hca_list()
504 {
505 	rib_hca_t *hca, *hcap;
506 
507 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
508 	hca = rib_stat->hcas_list;
509 	rib_stat->hcas_list = NULL;
510 	rw_exit(&rib_stat->hcas_list_lock);
511 	while (hca != NULL) {
512 		rw_enter(&hca->state_lock, RW_WRITER);
513 		hcap = hca;
514 		hca = hca->next;
515 		rib_stat->nhca_inited--;
516 		rib_mod.rdma_count--;
517 		hcap->state = HCA_DETACHED;
518 		rw_exit(&hcap->state_lock);
519 		rib_stop_hca_services(hcap);
520 
521 		kmem_free(hcap, sizeof (*hcap));
522 	}
523 }
524 
525 static rdma_stat
rpcib_free_service_list()526 rpcib_free_service_list()
527 {
528 	rib_service_t *service;
529 	ibt_status_t ret;
530 
531 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
532 	while (rib_stat->service_list != NULL) {
533 		service = rib_stat->service_list;
534 		ret = ibt_unbind_all_services(service->srv_hdl);
535 		if (ret != IBT_SUCCESS) {
536 			rw_exit(&rib_stat->service_list_lock);
537 #ifdef DEBUG
538 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
539 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
540 #endif
541 			return (RDMA_FAILED);
542 		}
543 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
544 		    service->srv_hdl);
545 		if (ret != IBT_SUCCESS) {
546 			rw_exit(&rib_stat->service_list_lock);
547 #ifdef DEBUG
548 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
549 			    "ibt_deregister_service failed (%d)\n", (int)ret);
550 #endif
551 			return (RDMA_FAILED);
552 		}
553 		rib_stat->service_list = service->next;
554 		kmem_free(service, sizeof (rib_service_t));
555 	}
556 	rw_exit(&rib_stat->service_list_lock);
557 
558 	return (RDMA_SUCCESS);
559 }
560 
561 static int
rpcib_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
563 {
564 	ibt_status_t	ibt_status;
565 	rdma_stat	r_status;
566 
567 	switch (cmd) {
568 	case DDI_ATTACH:
569 		break;
570 	case DDI_RESUME:
571 		return (DDI_SUCCESS);
572 	default:
573 		return (DDI_FAILURE);
574 	}
575 
576 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
577 
578 	mutex_enter(&rpcib.rpcib_mutex);
579 	if (rpcib.rpcib_dip != NULL) {
580 		mutex_exit(&rpcib.rpcib_mutex);
581 		return (DDI_FAILURE);
582 	}
583 	rpcib.rpcib_dip = dip;
584 	mutex_exit(&rpcib.rpcib_mutex);
585 	/*
586 	 * Create the "rpcib" minor-node.
587 	 */
588 	if (ddi_create_minor_node(dip,
589 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
590 		/* Error message, no cmn_err as they print on console */
591 		return (DDI_FAILURE);
592 	}
593 
594 	if (rib_stat == NULL) {
595 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
596 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
597 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
598 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
599 	}
600 
601 	rib_stat->hca_count = ibt_get_hca_list(NULL);
602 	if (rib_stat->hca_count < 1) {
603 		mutex_destroy(&rib_stat->listen_lock);
604 		rw_destroy(&rib_stat->hcas_list_lock);
605 		mutex_destroy(&rib_stat->open_hca_lock);
606 		kmem_free(rib_stat, sizeof (*rib_stat));
607 		rib_stat = NULL;
608 		return (DDI_FAILURE);
609 	}
610 
611 	ibt_status = ibt_attach(&rib_modinfo, dip,
612 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
613 
614 	if (ibt_status != IBT_SUCCESS) {
615 		mutex_destroy(&rib_stat->listen_lock);
616 		rw_destroy(&rib_stat->hcas_list_lock);
617 		mutex_destroy(&rib_stat->open_hca_lock);
618 		kmem_free(rib_stat, sizeof (*rib_stat));
619 		rib_stat = NULL;
620 		return (DDI_FAILURE);
621 	}
622 
623 	rib_stat->service_list = NULL;
624 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
625 	mutex_enter(&rib_stat->open_hca_lock);
626 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
627 		mutex_exit(&rib_stat->open_hca_lock);
628 		goto open_fail;
629 	}
630 	mutex_exit(&rib_stat->open_hca_lock);
631 
632 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
633 	    DDI_PROP_SUCCESS) {
634 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
635 		    "failed.");
636 		goto register_fail;
637 	}
638 
639 	/*
640 	 * Register with rdmatf
641 	 */
642 	r_status = rdma_register_mod(&rib_mod);
643 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
644 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
645 		    "status = %d", r_status);
646 		goto register_fail;
647 	}
648 
649 	return (DDI_SUCCESS);
650 
651 register_fail:
652 
653 open_fail:
654 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
655 	rpcib_free_hca_list();
656 	(void) rpcib_free_service_list();
657 	mutex_destroy(&rib_stat->listen_lock);
658 	rw_destroy(&rib_stat->hcas_list_lock);
659 	mutex_destroy(&rib_stat->open_hca_lock);
660 	rw_destroy(&rib_stat->service_list_lock);
661 	kmem_free(rib_stat, sizeof (*rib_stat));
662 	rib_stat = NULL;
663 	return (DDI_FAILURE);
664 }
665 
666 /*ARGSUSED*/
667 static int
rpcib_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
669 {
670 	switch (cmd) {
671 
672 	case DDI_DETACH:
673 		break;
674 
675 	case DDI_SUSPEND:
676 	default:
677 		return (DDI_FAILURE);
678 	}
679 
680 	/*
681 	 * Detach the hca and free resources
682 	 */
683 	mutex_enter(&plugin_state_lock);
684 	plugin_state = NO_ACCEPT;
685 	mutex_exit(&plugin_state_lock);
686 
687 	if (rpcib_free_service_list() != RDMA_SUCCESS)
688 		return (DDI_FAILURE);
689 	rpcib_free_hca_list();
690 
691 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
692 	mutex_destroy(&rib_stat->listen_lock);
693 	rw_destroy(&rib_stat->hcas_list_lock);
694 	mutex_destroy(&rib_stat->open_hca_lock);
695 	rw_destroy(&rib_stat->service_list_lock);
696 
697 	kmem_free(rib_stat, sizeof (*rib_stat));
698 	rib_stat = NULL;
699 
700 	mutex_enter(&rpcib.rpcib_mutex);
701 	rpcib.rpcib_dip = NULL;
702 	mutex_exit(&rpcib.rpcib_mutex);
703 	mutex_destroy(&rpcib.rpcib_mutex);
704 	return (DDI_SUCCESS);
705 }
706 
707 
708 static void rib_rbufpool_free(rib_hca_t *, int);
709 static void rib_rbufpool_deregister(rib_hca_t *, int);
710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
712 static rdma_stat rib_rem_replylist(rib_qp_t *);
713 static int rib_remreply(rib_qp_t *, struct reply *);
714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
716 
717 
718 /*
719  * One CQ pair per HCA
720  */
721 static rdma_stat
rib_create_cq(rib_hca_t * hca,uint32_t cq_size,ibt_cq_handler_t cq_handler,rib_cq_t ** cqp)722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
723     rib_cq_t **cqp)
724 {
725 	rib_cq_t	*cq;
726 	ibt_cq_attr_t	cq_attr;
727 	uint32_t	real_size;
728 	ibt_status_t	status;
729 	rdma_stat	error = RDMA_SUCCESS;
730 
731 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
732 	cq->rib_hca = hca;
733 	bzero(&cq_attr, sizeof (cq_attr));
734 	cq_attr.cq_size = cq_size;
735 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
736 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
737 	    &real_size);
738 	if (status != IBT_SUCCESS) {
739 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
740 		    " status=%d", status);
741 		error = RDMA_FAILED;
742 		goto fail;
743 	}
744 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
745 
746 	/*
747 	 * Enable CQ callbacks. CQ Callbacks are single shot
748 	 * (e.g. you have to call ibt_enable_cq_notify()
749 	 * after each callback to get another one).
750 	 */
751 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
752 	if (status != IBT_SUCCESS) {
753 		cmn_err(CE_WARN, "rib_create_cq: "
754 		    "enable_cq_notify failed, status %d", status);
755 		error = RDMA_FAILED;
756 		goto fail;
757 	}
758 	*cqp = cq;
759 
760 	return (error);
761 fail:
762 	if (cq->rib_cq_hdl)
763 		(void) ibt_free_cq(cq->rib_cq_hdl);
764 	if (cq)
765 		kmem_free(cq, sizeof (rib_cq_t));
766 	return (error);
767 }
768 
769 /*
770  * rpcib_find_hca
771  *
772  * Caller should have already locked the hcas_lock before calling
773  * this function.
774  */
775 static rib_hca_t *
rpcib_find_hca(rpcib_state_t * ribstat,ib_guid_t guid)776 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
777 {
778 	rib_hca_t *hca = ribstat->hcas_list;
779 
780 	while (hca && hca->hca_guid != guid)
781 		hca = hca->next;
782 
783 	return (hca);
784 }
785 
786 static rdma_stat
rpcib_open_hcas(rpcib_state_t * ribstat)787 rpcib_open_hcas(rpcib_state_t *ribstat)
788 {
789 	rib_hca_t		*hca;
790 	ibt_status_t		ibt_status;
791 	rdma_stat		status;
792 	ibt_hca_portinfo_t	*pinfop;
793 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
794 	uint_t			size, cq_size;
795 	int			i;
796 	kstat_t *ksp;
797 	cache_avl_struct_t example_avl_node;
798 	char rssc_name[32];
799 	int old_nhca_inited = ribstat->nhca_inited;
800 	ib_guid_t		*hca_guids;
801 
802 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
803 
804 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
805 	if (ribstat->hca_count == 0)
806 		return (RDMA_FAILED);
807 
808 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
809 	/*
810 	 * Open a hca and setup for RDMA
811 	 */
812 	for (i = 0; i < ribstat->hca_count; i++) {
813 		if (rpcib_find_hca(ribstat, hca_guids[i]))
814 			continue;
815 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
816 
817 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
818 		    hca_guids[i], &hca->hca_hdl);
819 		if (ibt_status != IBT_SUCCESS) {
820 			kmem_free(hca, sizeof (rib_hca_t));
821 			continue;
822 		}
823 		hca->hca_guid = hca_guids[i];
824 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
825 		hca->state = HCA_INITED;
826 
827 		/*
828 		 * query HCA info
829 		 */
830 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
831 		if (ibt_status != IBT_SUCCESS) {
832 			goto fail1;
833 		}
834 
835 		/*
836 		 * One PD (Protection Domain) per HCA.
837 		 * A qp is allowed to access a memory region
838 		 * only when it's in the same PD as that of
839 		 * the memory region.
840 		 */
841 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
842 		if (ibt_status != IBT_SUCCESS) {
843 			goto fail1;
844 		}
845 
846 		/*
847 		 * query HCA ports
848 		 */
849 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
850 		    0, &pinfop, &hca->hca_nports, &size);
851 		if (ibt_status != IBT_SUCCESS) {
852 			goto fail2;
853 		}
854 		hca->hca_ports = pinfop;
855 		hca->hca_pinfosz = size;
856 		pinfop = NULL;
857 
858 		cq_size = DEF_CQ_SIZE; /* default cq size */
859 		/*
860 		 * Create 2 pairs of cq's (1 pair for client
861 		 * and the other pair for server) on this hca.
862 		 * If number of qp's gets too large, then several
863 		 * cq's will be needed.
864 		 */
865 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
866 		    &hca->svc_rcq);
867 		if (status != RDMA_SUCCESS) {
868 			goto fail3;
869 		}
870 
871 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
872 		    &hca->svc_scq);
873 		if (status != RDMA_SUCCESS) {
874 			goto fail3;
875 		}
876 
877 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
878 		    &hca->clnt_rcq);
879 		if (status != RDMA_SUCCESS) {
880 			goto fail3;
881 		}
882 
883 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
884 		    &hca->clnt_scq);
885 		if (status != RDMA_SUCCESS) {
886 			goto fail3;
887 		}
888 
889 		/*
890 		 * Create buffer pools.
891 		 * Note rib_rbuf_create also allocates memory windows.
892 		 */
893 		hca->recv_pool = rib_rbufpool_create(hca,
894 		    RECV_BUFFER, rib_max_rbufs);
895 		if (hca->recv_pool == NULL) {
896 			goto fail3;
897 		}
898 
899 		hca->send_pool = rib_rbufpool_create(hca,
900 		    SEND_BUFFER, rib_max_rbufs);
901 		if (hca->send_pool == NULL) {
902 			rib_rbufpool_destroy(hca, RECV_BUFFER);
903 			goto fail3;
904 		}
905 
906 		if (hca->server_side_cache == NULL) {
907 			(void) sprintf(rssc_name,
908 			    "rib_srvr_cache_%llx",
909 			    (long long unsigned int) hca->hca_guid);
910 			hca->server_side_cache = kmem_cache_create(
911 			    rssc_name,
912 			    sizeof (cache_avl_struct_t), 0,
913 			    NULL,
914 			    NULL,
915 			    rib_server_side_cache_reclaim,
916 			    hca, NULL, 0);
917 		}
918 
919 		avl_create(&hca->avl_tree,
920 		    avl_compare,
921 		    sizeof (cache_avl_struct_t),
922 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
923 		    (uint_t)(uintptr_t)&example_avl_node);
924 
925 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
926 		    hca->iblock);
927 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
928 		rw_init(&hca->avl_rw_lock,
929 		    NULL, RW_DRIVER, hca->iblock);
930 		mutex_init(&hca->cache_allocation_lock,
931 		    NULL, MUTEX_DRIVER, NULL);
932 		hca->avl_init = TRUE;
933 
934 		/* Create kstats for the cache */
935 		ASSERT(INGLOBALZONE(curproc));
936 
937 		if (!stats_enabled) {
938 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
939 			    KSTAT_TYPE_NAMED,
940 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
941 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
942 			    GLOBAL_ZONEID);
943 			if (ksp) {
944 				ksp->ks_data = (void *) &rpcib_kstat;
945 				ksp->ks_update = rpcib_cache_kstat_update;
946 				kstat_install(ksp);
947 				stats_enabled = TRUE;
948 			}
949 		}
950 		if (hca->cleanup_helper == NULL) {
951 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
952 
953 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
954 			    (unsigned long long int) hca->hca_guid);
955 			hca->cleanup_helper = ddi_taskq_create(NULL,
956 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
957 		}
958 
959 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
960 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
961 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
962 		    hca->iblock);
963 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
964 		    hca->iblock);
965 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
966 		hca->inuse = TRUE;
967 
968 		hca->next = ribstat->hcas_list;
969 		ribstat->hcas_list = hca;
970 		ribstat->nhca_inited++;
971 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
972 		continue;
973 
974 fail3:
975 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
976 fail2:
977 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
978 fail1:
979 		(void) ibt_close_hca(hca->hca_hdl);
980 		kmem_free(hca, sizeof (rib_hca_t));
981 	}
982 	rw_exit(&ribstat->hcas_list_lock);
983 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
984 	rib_mod.rdma_count = rib_stat->nhca_inited;
985 
986 	/*
987 	 * return success if at least one new hca has been configured.
988 	 */
989 	if (ribstat->nhca_inited != old_nhca_inited)
990 		return (RDMA_SUCCESS);
991 	else
992 		return (RDMA_FAILED);
993 }
994 
995 /*
996  * Callback routines
997  */
998 
999 /*
1000  * SCQ handlers
1001  */
1002 /* ARGSUSED */
1003 static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl,void * arg)1004 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1005 {
1006 	ibt_status_t	ibt_status;
1007 	ibt_wc_t	wc;
1008 	struct send_wid	*wd;
1009 	CONN		*conn;
1010 	rib_qp_t	*qp;
1011 	int		i;
1012 
1013 	/*
1014 	 * Re-enable cq notify here to avoid missing any
1015 	 * completion queue notification.
1016 	 */
1017 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1018 
1019 	ibt_status = IBT_SUCCESS;
1020 	while (ibt_status != IBT_CQ_EMPTY) {
1021 		bzero(&wc, sizeof (wc));
1022 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1023 		if (ibt_status != IBT_SUCCESS)
1024 			return;
1025 
1026 		/*
1027 		 * Got a send completion
1028 		 */
1029 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1030 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1031 			qp = wd->qp;
1032 			conn = qptoc(qp);
1033 
1034 			mutex_enter(&wd->sendwait_lock);
1035 			switch (wc.wc_status) {
1036 			case IBT_WC_SUCCESS:
1037 				wd->status = RDMA_SUCCESS;
1038 				break;
1039 			default:
1040 /*
1041  *    RC Send Q Error Code		Local state     Remote State
1042  *    ====================		===========     ============
1043  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1044  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1045  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1046  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1047  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1048  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1049  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1050  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1051  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1052  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1053  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1054  */
1055 				/*
1056 				 * Channel in error state. Set connection to
1057 				 * ERROR and cleanup will happen either from
1058 				 * conn_release  or from rib_conn_get
1059 				 */
1060 				wd->status = RDMA_FAILED;
1061 				mutex_enter(&conn->c_lock);
1062 				if (conn->c_state != C_DISCONN_PEND)
1063 					conn->c_state = C_ERROR_CONN;
1064 				mutex_exit(&conn->c_lock);
1065 				break;
1066 			}
1067 
1068 			if (wd->cv_sig == 1) {
1069 				/*
1070 				 * Notify poster
1071 				 */
1072 				cv_signal(&wd->wait_cv);
1073 				mutex_exit(&wd->sendwait_lock);
1074 			} else {
1075 				/*
1076 				 * Poster not waiting for notification.
1077 				 * Free the send buffers and send_wid
1078 				 */
1079 				for (i = 0; i < wd->nsbufs; i++) {
1080 					rib_rbuf_free(qptoc(wd->qp),
1081 					    SEND_BUFFER,
1082 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1083 				}
1084 
1085 				/* decrement the send ref count */
1086 				rib_send_rele(qp);
1087 
1088 				mutex_exit(&wd->sendwait_lock);
1089 				(void) rib_free_sendwait(wd);
1090 			}
1091 		}
1092 	}
1093 }
1094 
1095 /* ARGSUSED */
1096 static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl,void * arg)1097 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1098 {
1099 	ibt_status_t	ibt_status;
1100 	ibt_wc_t	wc;
1101 	struct send_wid	*wd;
1102 	rib_qp_t	*qp;
1103 	CONN		*conn;
1104 	int		i;
1105 
1106 	/*
1107 	 * Re-enable cq notify here to avoid missing any
1108 	 * completion queue notification.
1109 	 */
1110 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1111 
1112 	ibt_status = IBT_SUCCESS;
1113 	while (ibt_status != IBT_CQ_EMPTY) {
1114 		bzero(&wc, sizeof (wc));
1115 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1116 		if (ibt_status != IBT_SUCCESS)
1117 			return;
1118 
1119 		/*
1120 		 * Got a send completion
1121 		 */
1122 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1123 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1124 			qp = wd->qp;
1125 			conn = qptoc(qp);
1126 			mutex_enter(&wd->sendwait_lock);
1127 
1128 			switch (wc.wc_status) {
1129 			case IBT_WC_SUCCESS:
1130 				wd->status = RDMA_SUCCESS;
1131 				break;
1132 			default:
1133 				/*
1134 				 * Channel in error state. Set connection to
1135 				 * ERROR and cleanup will happen either from
1136 				 * conn_release  or conn timeout.
1137 				 */
1138 				wd->status = RDMA_FAILED;
1139 				mutex_enter(&conn->c_lock);
1140 				if (conn->c_state != C_DISCONN_PEND)
1141 					conn->c_state = C_ERROR_CONN;
1142 				mutex_exit(&conn->c_lock);
1143 				break;
1144 			}
1145 
1146 			if (wd->cv_sig == 1) {
1147 				/*
1148 				 * Update completion status and notify poster
1149 				 */
1150 				cv_signal(&wd->wait_cv);
1151 				mutex_exit(&wd->sendwait_lock);
1152 			} else {
1153 				/*
1154 				 * Poster not waiting for notification.
1155 				 * Free the send buffers and send_wid
1156 				 */
1157 				for (i = 0; i < wd->nsbufs; i++) {
1158 					rib_rbuf_free(qptoc(wd->qp),
1159 					    SEND_BUFFER,
1160 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1161 				}
1162 
1163 				/* decrement the send ref count */
1164 				rib_send_rele(qp);
1165 
1166 				mutex_exit(&wd->sendwait_lock);
1167 				(void) rib_free_sendwait(wd);
1168 			}
1169 		}
1170 	}
1171 }
1172 
1173 /*
1174  * RCQ handler
1175  */
1176 /* ARGSUSED */
1177 static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl,void * arg)1178 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1179 {
1180 	rib_qp_t	*qp;
1181 	ibt_status_t	ibt_status;
1182 	ibt_wc_t	wc;
1183 	struct recv_wid	*rwid;
1184 
1185 	/*
1186 	 * Re-enable cq notify here to avoid missing any
1187 	 * completion queue notification.
1188 	 */
1189 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1190 
1191 	ibt_status = IBT_SUCCESS;
1192 	while (ibt_status != IBT_CQ_EMPTY) {
1193 		bzero(&wc, sizeof (wc));
1194 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1195 		if (ibt_status != IBT_SUCCESS)
1196 			return;
1197 
1198 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1199 		qp = rwid->qp;
1200 
1201 		if (wc.wc_status == IBT_WC_SUCCESS) {
1202 			XDR	inxdrs, *xdrs;
1203 			uint_t	xid, vers, op, find_xid = 0;
1204 			struct reply	*r;
1205 			CONN *conn = qptoc(qp);
1206 			uint32_t rdma_credit = 0;
1207 
1208 			xdrs = &inxdrs;
1209 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1210 			    wc.wc_bytes_xfer, XDR_DECODE);
1211 			/*
1212 			 * Treat xid as opaque (xid is the first entity
1213 			 * in the rpc rdma message).
1214 			 */
1215 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1216 
1217 			/* Skip xid and set the xdr position accordingly. */
1218 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1219 			(void) xdr_u_int(xdrs, &vers);
1220 			(void) xdr_u_int(xdrs, &rdma_credit);
1221 			(void) xdr_u_int(xdrs, &op);
1222 			XDR_DESTROY(xdrs);
1223 
1224 			if (vers != RPCRDMA_VERS) {
1225 				/*
1226 				 * Invalid RPC/RDMA version. Cannot
1227 				 * interoperate.  Set connection to
1228 				 * ERROR state and bail out.
1229 				 */
1230 				mutex_enter(&conn->c_lock);
1231 				if (conn->c_state != C_DISCONN_PEND)
1232 					conn->c_state = C_ERROR_CONN;
1233 				mutex_exit(&conn->c_lock);
1234 				rib_rbuf_free(conn, RECV_BUFFER,
1235 				    (void *)(uintptr_t)rwid->addr);
1236 				rib_free_wid(rwid);
1237 				rib_recv_rele(qp);
1238 				continue;
1239 			}
1240 
1241 			mutex_enter(&qp->replylist_lock);
1242 			for (r = qp->replylist; r != NULL; r = r->next) {
1243 				if (r->xid == xid) {
1244 					find_xid = 1;
1245 					switch (op) {
1246 					case RDMA_MSG:
1247 					case RDMA_NOMSG:
1248 					case RDMA_MSGP:
1249 						r->status = RDMA_SUCCESS;
1250 						r->vaddr_cq = rwid->addr;
1251 						r->bytes_xfer =
1252 						    wc.wc_bytes_xfer;
1253 						cv_signal(&r->wait_cv);
1254 						break;
1255 					default:
1256 						rib_rbuf_free(qptoc(qp),
1257 						    RECV_BUFFER,
1258 						    (void *)(uintptr_t)
1259 						    rwid->addr);
1260 						break;
1261 					}
1262 					break;
1263 				}
1264 			}
1265 			mutex_exit(&qp->replylist_lock);
1266 			if (find_xid == 0) {
1267 				/* RPC caller not waiting for reply */
1268 
1269 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1270 				    int, xid);
1271 
1272 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1273 				    (void *)(uintptr_t)rwid->addr);
1274 			}
1275 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1276 			CONN *conn = qptoc(qp);
1277 
1278 			/*
1279 			 * Connection being flushed. Just free
1280 			 * the posted buffer
1281 			 */
1282 			rib_rbuf_free(conn, RECV_BUFFER,
1283 			    (void *)(uintptr_t)rwid->addr);
1284 		} else {
1285 			CONN *conn = qptoc(qp);
1286 /*
1287  *  RC Recv Q Error Code		Local state     Remote State
1288  *  ====================		===========     ============
1289  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1290  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1291  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1292  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1293  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1294  *  IBT_WC_WR_FLUSHED_ERR               None            None
1295  */
1296 			/*
1297 			 * Channel in error state. Set connection
1298 			 * in ERROR state.
1299 			 */
1300 			mutex_enter(&conn->c_lock);
1301 			if (conn->c_state != C_DISCONN_PEND)
1302 				conn->c_state = C_ERROR_CONN;
1303 			mutex_exit(&conn->c_lock);
1304 			rib_rbuf_free(conn, RECV_BUFFER,
1305 			    (void *)(uintptr_t)rwid->addr);
1306 		}
1307 		rib_free_wid(rwid);
1308 		rib_recv_rele(qp);
1309 	}
1310 }
1311 
1312 /* Server side */
1313 /* ARGSUSED */
1314 static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl,void * arg)1315 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1316 {
1317 	rdma_recv_data_t *rdp;
1318 	rib_qp_t	*qp;
1319 	ibt_status_t	ibt_status;
1320 	ibt_wc_t	wc;
1321 	struct svc_recv	*s_recvp;
1322 	CONN		*conn;
1323 	mblk_t		*mp;
1324 
1325 	/*
1326 	 * Re-enable cq notify here to avoid missing any
1327 	 * completion queue notification.
1328 	 */
1329 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1330 
1331 	ibt_status = IBT_SUCCESS;
1332 	while (ibt_status != IBT_CQ_EMPTY) {
1333 		bzero(&wc, sizeof (wc));
1334 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1335 		if (ibt_status != IBT_SUCCESS)
1336 			return;
1337 
1338 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1339 		qp = s_recvp->qp;
1340 		conn = qptoc(qp);
1341 
1342 		if (wc.wc_status == IBT_WC_SUCCESS) {
1343 			XDR	inxdrs, *xdrs;
1344 			uint_t	xid, vers, op;
1345 			uint32_t rdma_credit;
1346 
1347 			xdrs = &inxdrs;
1348 			/* s_recvp->vaddr stores data */
1349 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1350 			    wc.wc_bytes_xfer, XDR_DECODE);
1351 
1352 			/*
1353 			 * Treat xid as opaque (xid is the first entity
1354 			 * in the rpc rdma message).
1355 			 */
1356 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1357 			/* Skip xid and set the xdr position accordingly. */
1358 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1359 			if (!xdr_u_int(xdrs, &vers) ||
1360 			    !xdr_u_int(xdrs, &rdma_credit) ||
1361 			    !xdr_u_int(xdrs, &op)) {
1362 				rib_rbuf_free(conn, RECV_BUFFER,
1363 				    (void *)(uintptr_t)s_recvp->vaddr);
1364 				XDR_DESTROY(xdrs);
1365 				rib_recv_rele(qp);
1366 				(void) rib_free_svc_recv(s_recvp);
1367 				continue;
1368 			}
1369 			XDR_DESTROY(xdrs);
1370 
1371 			if (vers != RPCRDMA_VERS) {
1372 				/*
1373 				 * Invalid RPC/RDMA version.
1374 				 * Drop rpc rdma message.
1375 				 */
1376 				rib_rbuf_free(conn, RECV_BUFFER,
1377 				    (void *)(uintptr_t)s_recvp->vaddr);
1378 				rib_recv_rele(qp);
1379 				(void) rib_free_svc_recv(s_recvp);
1380 				continue;
1381 			}
1382 			/*
1383 			 * Is this for RDMA_DONE?
1384 			 */
1385 			if (op == RDMA_DONE) {
1386 				rib_rbuf_free(conn, RECV_BUFFER,
1387 				    (void *)(uintptr_t)s_recvp->vaddr);
1388 				/*
1389 				 * Wake up the thread waiting on
1390 				 * a RDMA_DONE for xid
1391 				 */
1392 				mutex_enter(&qp->rdlist_lock);
1393 				rdma_done_notify(qp, xid);
1394 				mutex_exit(&qp->rdlist_lock);
1395 				rib_recv_rele(qp);
1396 				(void) rib_free_svc_recv(s_recvp);
1397 				continue;
1398 			}
1399 
1400 			mutex_enter(&plugin_state_lock);
1401 			mutex_enter(&conn->c_lock);
1402 			if ((plugin_state == ACCEPT) &&
1403 			    (conn->c_state == C_CONNECTED)) {
1404 				conn->c_ref++;
1405 				mutex_exit(&conn->c_lock);
1406 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1407 				    == NULL)
1408 					(void) strwaitbuf(
1409 					    sizeof (*rdp), BPRI_LO);
1410 				/*
1411 				 * Plugin is in accept state, hence the master
1412 				 * transport queue for this is still accepting
1413 				 * requests. Hence we can call svc_queuereq to
1414 				 * queue this recieved msg.
1415 				 */
1416 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1417 				rdp->conn = conn;
1418 				rdp->rpcmsg.addr =
1419 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1420 				rdp->rpcmsg.type = RECV_BUFFER;
1421 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1422 				rdp->status = wc.wc_status;
1423 				mp->b_wptr += sizeof (*rdp);
1424 				(void) svc_queuereq((queue_t *)rib_stat->q, mp,
1425 				    FALSE);
1426 				mutex_exit(&plugin_state_lock);
1427 			} else {
1428 				/*
1429 				 * The master transport for this is going
1430 				 * away and the queue is not accepting anymore
1431 				 * requests for krpc, so don't do anything, just
1432 				 * free the msg.
1433 				 */
1434 				mutex_exit(&conn->c_lock);
1435 				mutex_exit(&plugin_state_lock);
1436 				rib_rbuf_free(conn, RECV_BUFFER,
1437 				    (void *)(uintptr_t)s_recvp->vaddr);
1438 			}
1439 		} else {
1440 			rib_rbuf_free(conn, RECV_BUFFER,
1441 			    (void *)(uintptr_t)s_recvp->vaddr);
1442 		}
1443 		rib_recv_rele(qp);
1444 		(void) rib_free_svc_recv(s_recvp);
1445 	}
1446 }
1447 
1448 static void
rib_attach_hca()1449 rib_attach_hca()
1450 {
1451 	mutex_enter(&rib_stat->open_hca_lock);
1452 	(void) rpcib_open_hcas(rib_stat);
1453 	rib_listen(NULL);
1454 	mutex_exit(&rib_stat->open_hca_lock);
1455 }
1456 
1457 /*
1458  * Handles DR event of IBT_HCA_DETACH_EVENT.
1459  */
1460 /* ARGSUSED */
1461 static void
rib_async_handler(void * clnt_private,ibt_hca_hdl_t hca_hdl,ibt_async_code_t code,ibt_async_event_t * event)1462 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1463     ibt_async_code_t code, ibt_async_event_t *event)
1464 {
1465 	switch (code) {
1466 	case IBT_HCA_ATTACH_EVENT:
1467 		rib_attach_hca();
1468 		break;
1469 	case IBT_HCA_DETACH_EVENT:
1470 		rib_detach_hca(hca_hdl);
1471 #ifdef DEBUG
1472 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1473 #endif
1474 		break;
1475 	case IBT_EVENT_PORT_UP:
1476 		/*
1477 		 * A port is up. We should call rib_listen() since there is
1478 		 * a chance that rib_listen() may have failed during
1479 		 * rib_attach_hca() because the port had not been up yet.
1480 		 */
1481 		rib_listen(NULL);
1482 #ifdef DEBUG
1483 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1484 #endif
1485 		break;
1486 #ifdef DEBUG
1487 	case IBT_EVENT_PATH_MIGRATED:
1488 		cmn_err(CE_NOTE, "rib_async_handler(): "
1489 		    "IBT_EVENT_PATH_MIGRATED\n");
1490 		break;
1491 	case IBT_EVENT_SQD:
1492 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1493 		break;
1494 	case IBT_EVENT_COM_EST:
1495 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1496 		break;
1497 	case IBT_ERROR_CATASTROPHIC_CHAN:
1498 		cmn_err(CE_NOTE, "rib_async_handler(): "
1499 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1500 		break;
1501 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1502 		cmn_err(CE_NOTE, "rib_async_handler(): "
1503 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1504 		break;
1505 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1506 		cmn_err(CE_NOTE, "rib_async_handler(): "
1507 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1508 		break;
1509 	case IBT_ERROR_PATH_MIGRATE_REQ:
1510 		cmn_err(CE_NOTE, "rib_async_handler(): "
1511 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1512 		break;
1513 	case IBT_ERROR_CQ:
1514 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1515 		break;
1516 	case IBT_ERROR_PORT_DOWN:
1517 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1518 		break;
1519 	case IBT_ASYNC_OPAQUE1:
1520 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1521 		break;
1522 	case IBT_ASYNC_OPAQUE2:
1523 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1524 		break;
1525 	case IBT_ASYNC_OPAQUE3:
1526 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1527 		break;
1528 	case IBT_ASYNC_OPAQUE4:
1529 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1530 		break;
1531 #endif
1532 	default:
1533 		break;
1534 	}
1535 }
1536 
1537 /*
1538  * Client's reachable function.
1539  */
1540 static rdma_stat
rib_reachable(int addr_type,struct netbuf * raddr,void ** handle)1541 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1542 {
1543 	rdma_stat	status;
1544 	rpcib_ping_t	rpt;
1545 	struct netbuf	saddr;
1546 	CONN		*conn;
1547 
1548 	bzero(&saddr, sizeof (struct netbuf));
1549 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1550 
1551 	if (status == RDMA_SUCCESS) {
1552 		*handle = (void *)rpt.hca;
1553 		/* release the reference */
1554 		(void) rib_conn_release(conn);
1555 		return (RDMA_SUCCESS);
1556 	} else {
1557 		*handle = NULL;
1558 		DTRACE_PROBE(rpcib__i__pingfailed);
1559 		return (RDMA_FAILED);
1560 	}
1561 }
1562 
1563 /* Client side qp creation */
1564 static rdma_stat
rib_clnt_create_chan(rib_hca_t * hca,struct netbuf * raddr,rib_qp_t ** qp)1565 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1566 {
1567 	rib_qp_t	*kqp = NULL;
1568 	CONN		*conn;
1569 	rdma_clnt_cred_ctrl_t *cc_info;
1570 
1571 	ASSERT(qp != NULL);
1572 	*qp = NULL;
1573 
1574 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1575 	conn = qptoc(kqp);
1576 	kqp->hca = hca;
1577 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1578 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1579 
1580 	kqp->mode = RIB_CLIENT;
1581 	kqp->chan_flags = IBT_BLOCKING;
1582 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1583 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1584 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1585 	/*
1586 	 * Initialize
1587 	 */
1588 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1589 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1592 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1595 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1597 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1598 	/*
1599 	 * Initialize the client credit control
1600 	 * portion of the rdmaconn struct.
1601 	 */
1602 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1603 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1604 	cc_info->clnt_cc_granted_ops = 0;
1605 	cc_info->clnt_cc_in_flight_ops = 0;
1606 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1607 
1608 	*qp = kqp;
1609 	return (RDMA_SUCCESS);
1610 }
1611 
1612 /* Server side qp creation */
1613 static rdma_stat
rib_svc_create_chan(rib_hca_t * hca,caddr_t q,uint8_t port,rib_qp_t ** qp)1614 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1615 {
1616 	rib_qp_t	*kqp = NULL;
1617 	ibt_chan_sizes_t	chan_sizes;
1618 	ibt_rc_chan_alloc_args_t	qp_attr;
1619 	ibt_status_t		ibt_status;
1620 	rdma_srv_cred_ctrl_t *cc_info;
1621 
1622 	*qp = NULL;
1623 
1624 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1625 	kqp->hca = hca;
1626 	kqp->port_num = port;
1627 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1628 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1629 
1630 	/*
1631 	 * Create the qp handle
1632 	 */
1633 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1634 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1635 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1636 	qp_attr.rc_pd = hca->pd_hdl;
1637 	qp_attr.rc_hca_port_num = port;
1638 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1639 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1640 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1641 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1642 	qp_attr.rc_clone_chan = NULL;
1643 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1644 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1645 
1646 	rw_enter(&hca->state_lock, RW_READER);
1647 	if (hca->state != HCA_DETACHED) {
1648 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1649 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1650 		    &chan_sizes);
1651 	} else {
1652 		rw_exit(&hca->state_lock);
1653 		goto fail;
1654 	}
1655 	rw_exit(&hca->state_lock);
1656 
1657 	if (ibt_status != IBT_SUCCESS) {
1658 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1659 		    int, ibt_status);
1660 		goto fail;
1661 	}
1662 
1663 	kqp->mode = RIB_SERVER;
1664 	kqp->chan_flags = IBT_BLOCKING;
1665 	kqp->q = q;	/* server ONLY */
1666 
1667 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1668 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1669 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1670 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1672 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1674 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1676 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1677 	/*
1678 	 * Set the private data area to qp to be used in callbacks
1679 	 */
1680 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1681 	kqp->rdmaconn.c_state = C_CONNECTED;
1682 
1683 	/*
1684 	 * Initialize the server credit control
1685 	 * portion of the rdmaconn struct.
1686 	 */
1687 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1688 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1689 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1690 	cc_info->srv_cc_cur_buffers_used = 0;
1691 	cc_info->srv_cc_posted = preposted_rbufs;
1692 
1693 	*qp = kqp;
1694 
1695 	return (RDMA_SUCCESS);
1696 fail:
1697 	if (kqp)
1698 		kmem_free(kqp, sizeof (rib_qp_t));
1699 
1700 	return (RDMA_FAILED);
1701 }
1702 
1703 /* ARGSUSED */
1704 ibt_cm_status_t
rib_clnt_cm_handler(void * clnt_hdl,ibt_cm_event_t * event,ibt_cm_return_args_t * ret_args,void * priv_data,ibt_priv_data_len_t len)1705 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1706     ibt_cm_return_args_t *ret_args, void *priv_data,
1707     ibt_priv_data_len_t len)
1708 {
1709 	rib_hca_t	*hca;
1710 
1711 	hca = (rib_hca_t *)clnt_hdl;
1712 
1713 	switch (event->cm_type) {
1714 
1715 	/* got a connection close event */
1716 	case IBT_CM_EVENT_CONN_CLOSED:
1717 	{
1718 		CONN	*conn;
1719 		rib_qp_t *qp;
1720 
1721 		/* check reason why connection was closed */
1722 		switch (event->cm_event.closed) {
1723 		case IBT_CM_CLOSED_DREP_RCVD:
1724 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1725 		case IBT_CM_CLOSED_DUP:
1726 		case IBT_CM_CLOSED_ABORT:
1727 		case IBT_CM_CLOSED_ALREADY:
1728 			/*
1729 			 * These cases indicate the local end initiated
1730 			 * the closing of the channel. Nothing to do here.
1731 			 */
1732 			break;
1733 		default:
1734 			/*
1735 			 * Reason for CONN_CLOSED event must be one of
1736 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1737 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1738 			 * the remote end is closing the channel. In these
1739 			 * cases free the channel and transition to error
1740 			 * state
1741 			 */
1742 			qp = ibt_get_chan_private(event->cm_channel);
1743 			conn = qptoc(qp);
1744 			mutex_enter(&conn->c_lock);
1745 			if (conn->c_state == C_DISCONN_PEND) {
1746 				mutex_exit(&conn->c_lock);
1747 				break;
1748 			}
1749 
1750 			conn->c_state = C_ERROR_CONN;
1751 
1752 			/*
1753 			 * Free the conn if c_ref is down to 0 already
1754 			 */
1755 			if (conn->c_ref == 0) {
1756 				/*
1757 				 * Remove from list and free conn
1758 				 */
1759 				conn->c_state = C_DISCONN_PEND;
1760 				mutex_exit(&conn->c_lock);
1761 				rw_enter(&hca->state_lock, RW_READER);
1762 				if (hca->state != HCA_DETACHED)
1763 					(void) rib_disconnect_channel(conn,
1764 					    &hca->cl_conn_list);
1765 				rw_exit(&hca->state_lock);
1766 			} else {
1767 				/*
1768 				 * conn will be freed when c_ref goes to 0.
1769 				 * Indicate to cleaning thread not to close
1770 				 * the connection, but just free the channel.
1771 				 */
1772 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1773 				mutex_exit(&conn->c_lock);
1774 			}
1775 #ifdef DEBUG
1776 			if (rib_debug)
1777 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1778 				    "(CONN_CLOSED) channel disconnected");
1779 #endif
1780 			break;
1781 		}
1782 		break;
1783 	}
1784 	default:
1785 		break;
1786 	}
1787 	return (IBT_CM_ACCEPT);
1788 }
1789 
1790 /*
1791  * Connect to the server.
1792  */
1793 rdma_stat
rib_conn_to_srv(rib_hca_t * hca,rib_qp_t * qp,rpcib_ping_t * rptp)1794 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1795 {
1796 	ibt_chan_open_args_t	chan_args;	/* channel args */
1797 	ibt_chan_sizes_t	chan_sizes;
1798 	ibt_rc_chan_alloc_args_t	qp_attr;
1799 	ibt_status_t		ibt_status;
1800 	ibt_rc_returns_t	ret_args;	/* conn reject info */
1801 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1802 	ibt_ip_cm_info_t	ipcm_info;
1803 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1804 
1805 
1806 	(void) bzero(&chan_args, sizeof (chan_args));
1807 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1808 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1809 
1810 	ipcm_info.src_addr.family = rptp->srcip.family;
1811 	switch (ipcm_info.src_addr.family) {
1812 	case AF_INET:
1813 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1814 		break;
1815 	case AF_INET6:
1816 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1817 		break;
1818 	}
1819 
1820 	ipcm_info.dst_addr.family = rptp->srcip.family;
1821 	switch (ipcm_info.dst_addr.family) {
1822 	case AF_INET:
1823 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1824 		break;
1825 	case AF_INET6:
1826 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1827 		break;
1828 	}
1829 
1830 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1831 
1832 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1833 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1834 
1835 	if (ibt_status != IBT_SUCCESS) {
1836 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1837 		return (-1);
1838 	}
1839 
1840 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1841 	/* Alloc a RC channel */
1842 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1843 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1844 	qp_attr.rc_pd = hca->pd_hdl;
1845 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1846 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1847 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1848 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1849 	qp_attr.rc_clone_chan = NULL;
1850 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1851 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1852 
1853 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1854 	chan_args.oc_path = &rptp->path;
1855 
1856 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1857 	chan_args.oc_cm_clnt_private = (void *)hca;
1858 	chan_args.oc_rdma_ra_out = 4;
1859 	chan_args.oc_rdma_ra_in = 4;
1860 	chan_args.oc_path_retry_cnt = 2;
1861 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1862 	chan_args.oc_priv_data = cmp_ip_pvt;
1863 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1864 
1865 refresh:
1866 	rw_enter(&hca->state_lock, RW_READER);
1867 	if (hca->state != HCA_DETACHED) {
1868 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1869 		    IBT_ACHAN_NO_FLAGS,
1870 		    &qp_attr, &qp->qp_hdl,
1871 		    &chan_sizes);
1872 	} else {
1873 		rw_exit(&hca->state_lock);
1874 		return (RDMA_FAILED);
1875 	}
1876 	rw_exit(&hca->state_lock);
1877 
1878 	if (ibt_status != IBT_SUCCESS) {
1879 		DTRACE_PROBE1(rpcib__i_conntosrv,
1880 		    int, ibt_status);
1881 		return (RDMA_FAILED);
1882 	}
1883 
1884 	/* Connect to the Server */
1885 	(void) bzero(&ret_args, sizeof (ret_args));
1886 	mutex_enter(&qp->cb_lock);
1887 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1888 	    IBT_BLOCKING, &chan_args, &ret_args);
1889 	if (ibt_status != IBT_SUCCESS) {
1890 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1891 		    int, ibt_status, int, ret_args.rc_status);
1892 
1893 		(void) ibt_free_channel(qp->qp_hdl);
1894 		qp->qp_hdl = NULL;
1895 		mutex_exit(&qp->cb_lock);
1896 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1897 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1898 			/*
1899 			 * Got IBT_CM_CONN_STALE probably because of stale
1900 			 * data on the passive end of a channel that existed
1901 			 * prior to reboot. Retry establishing a channel
1902 			 * REFRESH_ATTEMPTS times, during which time the
1903 			 * stale conditions on the server might clear up.
1904 			 */
1905 			goto refresh;
1906 		}
1907 		return (RDMA_FAILED);
1908 	}
1909 	mutex_exit(&qp->cb_lock);
1910 	/*
1911 	 * Set the private data area to qp to be used in callbacks
1912 	 */
1913 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1914 	return (RDMA_SUCCESS);
1915 }
1916 
1917 rdma_stat
rib_ping_srv(int addr_type,struct netbuf * raddr,rpcib_ping_t * rptp)1918 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1919 {
1920 	uint_t			i, addr_count;
1921 	ibt_status_t		ibt_status;
1922 	uint8_t			num_paths_p;
1923 	ibt_ip_path_attr_t	ipattr;
1924 	ibt_path_ip_src_t	srcip;
1925 	rpcib_ipaddrs_t		addrs4;
1926 	rpcib_ipaddrs_t		addrs6;
1927 	struct sockaddr_in	*sinp;
1928 	struct sockaddr_in6	*sin6p;
1929 	rdma_stat		retval = RDMA_FAILED;
1930 	rib_hca_t *hca;
1931 
1932 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1933 		return (RDMA_INVAL);
1934 	ASSERT(raddr->buf != NULL);
1935 
1936 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1937 
1938 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1939 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1940 		retval = RDMA_FAILED;
1941 		goto done2;
1942 	}
1943 
1944 	if (addr_type == AF_INET) {
1945 		addr_count = addrs4.ri_count;
1946 		sinp = (struct sockaddr_in *)raddr->buf;
1947 		rptp->dstip.family = AF_INET;
1948 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1949 		sinp = addrs4.ri_list;
1950 	} else {
1951 		addr_count = addrs6.ri_count;
1952 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1953 		rptp->dstip.family = AF_INET6;
1954 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1955 		sin6p = addrs6.ri_list;
1956 	}
1957 
1958 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1959 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1960 		rw_enter(&hca->state_lock, RW_READER);
1961 		if (hca->state == HCA_DETACHED) {
1962 			rw_exit(&hca->state_lock);
1963 			continue;
1964 		}
1965 
1966 		ipattr.ipa_dst_ip	= &rptp->dstip;
1967 		ipattr.ipa_hca_guid	= hca->hca_guid;
1968 		ipattr.ipa_ndst		= 1;
1969 		ipattr.ipa_max_paths	= 1;
1970 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1971 		for (i = 0; i < addr_count; i++) {
1972 			num_paths_p = 0;
1973 			if (addr_type == AF_INET) {
1974 				ipattr.ipa_src_ip.un.ip4addr =
1975 				    sinp[i].sin_addr.s_addr;
1976 			} else {
1977 				ipattr.ipa_src_ip.un.ip6addr =
1978 				    sin6p[i].sin6_addr;
1979 			}
1980 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1981 
1982 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1983 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1984 			    &num_paths_p, &srcip);
1985 			if (ibt_status == IBT_SUCCESS &&
1986 			    num_paths_p != 0 &&
1987 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1988 				rptp->hca = hca;
1989 				rw_exit(&hca->state_lock);
1990 				if (addr_type == AF_INET) {
1991 					rptp->srcip.family = AF_INET;
1992 					rptp->srcip.un.ip4addr =
1993 					    srcip.ip_primary.un.ip4addr;
1994 				} else {
1995 					rptp->srcip.family = AF_INET6;
1996 					rptp->srcip.un.ip6addr =
1997 					    srcip.ip_primary.un.ip6addr;
1998 
1999 				}
2000 				retval = RDMA_SUCCESS;
2001 				goto done1;
2002 			}
2003 		}
2004 		rw_exit(&hca->state_lock);
2005 	}
2006 done1:
2007 	rw_exit(&rib_stat->hcas_list_lock);
2008 done2:
2009 	if (addrs4.ri_size > 0)
2010 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2011 	if (addrs6.ri_size > 0)
2012 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2013 	return (retval);
2014 }
2015 
2016 /*
2017  * Close channel, remove from connection list and
2018  * free up resources allocated for that channel.
2019  */
2020 rdma_stat
rib_disconnect_channel(CONN * conn,rib_conn_list_t * conn_list)2021 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2022 {
2023 	rib_qp_t	*qp = ctoqp(conn);
2024 	rib_hca_t	*hca;
2025 
2026 	mutex_enter(&conn->c_lock);
2027 	if (conn->c_timeout != NULL) {
2028 		mutex_exit(&conn->c_lock);
2029 		(void) untimeout(conn->c_timeout);
2030 		mutex_enter(&conn->c_lock);
2031 	}
2032 
2033 	while (conn->c_flags & C_CLOSE_PENDING) {
2034 		cv_wait(&conn->c_cv, &conn->c_lock);
2035 	}
2036 	mutex_exit(&conn->c_lock);
2037 
2038 	/*
2039 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2040 	 */
2041 	hca = qp->hca;
2042 	if (conn_list != NULL)
2043 		(void) rib_rm_conn(conn, conn_list);
2044 
2045 	/*
2046 	 * There is only one case where we get here with
2047 	 * qp_hdl = NULL, which is during connection setup on
2048 	 * the client. In such a case there are no posted
2049 	 * send/recv buffers.
2050 	 */
2051 	if (qp->qp_hdl != NULL) {
2052 		mutex_enter(&qp->posted_rbufs_lock);
2053 		while (qp->n_posted_rbufs)
2054 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2055 		mutex_exit(&qp->posted_rbufs_lock);
2056 
2057 		mutex_enter(&qp->send_rbufs_lock);
2058 		while (qp->n_send_rbufs)
2059 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2060 		mutex_exit(&qp->send_rbufs_lock);
2061 
2062 		(void) ibt_free_channel(qp->qp_hdl);
2063 			qp->qp_hdl = NULL;
2064 	}
2065 
2066 	ASSERT(qp->rdlist == NULL);
2067 
2068 	if (qp->replylist != NULL) {
2069 		(void) rib_rem_replylist(qp);
2070 	}
2071 
2072 	cv_destroy(&qp->cb_conn_cv);
2073 	cv_destroy(&qp->posted_rbufs_cv);
2074 	cv_destroy(&qp->send_rbufs_cv);
2075 	mutex_destroy(&qp->cb_lock);
2076 	mutex_destroy(&qp->replylist_lock);
2077 	mutex_destroy(&qp->posted_rbufs_lock);
2078 	mutex_destroy(&qp->send_rbufs_lock);
2079 	mutex_destroy(&qp->rdlist_lock);
2080 
2081 	cv_destroy(&conn->c_cv);
2082 	mutex_destroy(&conn->c_lock);
2083 
2084 	if (conn->c_raddr.buf != NULL) {
2085 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2086 	}
2087 	if (conn->c_laddr.buf != NULL) {
2088 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2089 	}
2090 	if (conn->c_netid != NULL) {
2091 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2092 	}
2093 	if (conn->c_addrmask.buf != NULL) {
2094 		kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2095 	}
2096 
2097 	/*
2098 	 * Credit control cleanup.
2099 	 */
2100 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2101 		rdma_clnt_cred_ctrl_t *cc_info;
2102 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2103 		cv_destroy(&cc_info->clnt_cc_cv);
2104 	}
2105 
2106 	kmem_free(qp, sizeof (rib_qp_t));
2107 
2108 	/*
2109 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2110 	 * then the hca is no longer being used.
2111 	 */
2112 	if (conn_list != NULL) {
2113 		rw_enter(&hca->state_lock, RW_READER);
2114 		if (hca->state == HCA_DETACHED) {
2115 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2116 			if (hca->srv_conn_list.conn_hd == NULL) {
2117 				rw_enter(&hca->cl_conn_list.conn_lock,
2118 				    RW_READER);
2119 
2120 				if (hca->cl_conn_list.conn_hd == NULL) {
2121 					mutex_enter(&hca->inuse_lock);
2122 					hca->inuse = FALSE;
2123 					cv_signal(&hca->cb_cv);
2124 					mutex_exit(&hca->inuse_lock);
2125 				}
2126 				rw_exit(&hca->cl_conn_list.conn_lock);
2127 			}
2128 			rw_exit(&hca->srv_conn_list.conn_lock);
2129 		}
2130 		rw_exit(&hca->state_lock);
2131 	}
2132 
2133 	return (RDMA_SUCCESS);
2134 }
2135 
2136 /*
2137  * All sends are done under the protection of
2138  * the wdesc->sendwait_lock. n_send_rbufs count
2139  * is protected using the send_rbufs_lock.
2140  * lock ordering is:
2141  * sendwait_lock -> send_rbufs_lock
2142  */
2143 
2144 void
rib_send_hold(rib_qp_t * qp)2145 rib_send_hold(rib_qp_t *qp)
2146 {
2147 	mutex_enter(&qp->send_rbufs_lock);
2148 	qp->n_send_rbufs++;
2149 	mutex_exit(&qp->send_rbufs_lock);
2150 }
2151 
2152 void
rib_send_rele(rib_qp_t * qp)2153 rib_send_rele(rib_qp_t *qp)
2154 {
2155 	mutex_enter(&qp->send_rbufs_lock);
2156 	qp->n_send_rbufs--;
2157 	if (qp->n_send_rbufs == 0)
2158 		cv_signal(&qp->send_rbufs_cv);
2159 	mutex_exit(&qp->send_rbufs_lock);
2160 }
2161 
2162 void
rib_recv_rele(rib_qp_t * qp)2163 rib_recv_rele(rib_qp_t *qp)
2164 {
2165 	mutex_enter(&qp->posted_rbufs_lock);
2166 	qp->n_posted_rbufs--;
2167 	if (qp->n_posted_rbufs == 0)
2168 		cv_signal(&qp->posted_rbufs_cv);
2169 	mutex_exit(&qp->posted_rbufs_lock);
2170 }
2171 
2172 /*
2173  * Wait for send completion notification. Only on receiving a
2174  * notification be it a successful or error completion, free the
2175  * send_wid.
2176  */
2177 static rdma_stat
rib_sendwait(rib_qp_t * qp,struct send_wid * wd)2178 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2179 {
2180 	clock_t timout, cv_wait_ret;
2181 	rdma_stat error = RDMA_SUCCESS;
2182 	int	i;
2183 
2184 	/*
2185 	 * Wait for send to complete
2186 	 */
2187 	ASSERT(wd != NULL);
2188 	mutex_enter(&wd->sendwait_lock);
2189 	if (wd->status == (uint_t)SEND_WAIT) {
2190 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2191 		    ddi_get_lbolt();
2192 
2193 		if (qp->mode == RIB_SERVER) {
2194 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2195 			    &wd->sendwait_lock, timout)) > 0 &&
2196 			    wd->status == (uint_t)SEND_WAIT)
2197 				;
2198 			switch (cv_wait_ret) {
2199 			case -1:	/* timeout */
2200 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2201 
2202 				wd->cv_sig = 0;		/* no signal needed */
2203 				error = RDMA_TIMEDOUT;
2204 				break;
2205 			default:	/* got send completion */
2206 				break;
2207 			}
2208 		} else {
2209 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2210 			    &wd->sendwait_lock, timout)) > 0 &&
2211 			    wd->status == (uint_t)SEND_WAIT)
2212 				;
2213 			switch (cv_wait_ret) {
2214 			case -1:	/* timeout */
2215 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2216 
2217 				wd->cv_sig = 0;		/* no signal needed */
2218 				error = RDMA_TIMEDOUT;
2219 				break;
2220 			case 0:		/* interrupted */
2221 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2222 
2223 				wd->cv_sig = 0;		/* no signal needed */
2224 				error = RDMA_INTR;
2225 				break;
2226 			default:	/* got send completion */
2227 				break;
2228 			}
2229 		}
2230 	}
2231 
2232 	if (wd->status != (uint_t)SEND_WAIT) {
2233 		/* got send completion */
2234 		if (wd->status != RDMA_SUCCESS) {
2235 			switch (wd->status) {
2236 			case RDMA_CONNLOST:
2237 				error = RDMA_CONNLOST;
2238 				break;
2239 			default:
2240 				error = RDMA_FAILED;
2241 				break;
2242 			}
2243 		}
2244 		for (i = 0; i < wd->nsbufs; i++) {
2245 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2246 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2247 		}
2248 
2249 		rib_send_rele(qp);
2250 
2251 		mutex_exit(&wd->sendwait_lock);
2252 		(void) rib_free_sendwait(wd);
2253 
2254 	} else {
2255 		mutex_exit(&wd->sendwait_lock);
2256 	}
2257 	return (error);
2258 }
2259 
2260 static struct send_wid *
rib_init_sendwait(uint32_t xid,int cv_sig,rib_qp_t * qp)2261 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2262 {
2263 	struct send_wid	*wd;
2264 
2265 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2266 	wd->xid = xid;
2267 	wd->cv_sig = cv_sig;
2268 	wd->qp = qp;
2269 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2270 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2271 	wd->status = (uint_t)SEND_WAIT;
2272 
2273 	return (wd);
2274 }
2275 
2276 static int
rib_free_sendwait(struct send_wid * wdesc)2277 rib_free_sendwait(struct send_wid *wdesc)
2278 {
2279 	cv_destroy(&wdesc->wait_cv);
2280 	mutex_destroy(&wdesc->sendwait_lock);
2281 	kmem_free(wdesc, sizeof (*wdesc));
2282 
2283 	return (0);
2284 }
2285 
2286 static rdma_stat
rib_rem_rep(rib_qp_t * qp,struct reply * rep)2287 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2288 {
2289 	mutex_enter(&qp->replylist_lock);
2290 	if (rep != NULL) {
2291 		(void) rib_remreply(qp, rep);
2292 		mutex_exit(&qp->replylist_lock);
2293 		return (RDMA_SUCCESS);
2294 	}
2295 	mutex_exit(&qp->replylist_lock);
2296 	return (RDMA_FAILED);
2297 }
2298 
2299 /*
2300  * Send buffers are freed here only in case of error in posting
2301  * on QP. If the post succeeded, the send buffers are freed upon
2302  * send completion in rib_sendwait() or in the scq_handler.
2303  */
2304 rdma_stat
rib_send_and_wait(CONN * conn,struct clist * cl,uint32_t msgid,int send_sig,int cv_sig,caddr_t * swid)2305 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2306     int send_sig, int cv_sig, caddr_t *swid)
2307 {
2308 	struct send_wid	*wdesc;
2309 	struct clist	*clp;
2310 	ibt_status_t	ibt_status = IBT_SUCCESS;
2311 	rdma_stat	ret = RDMA_SUCCESS;
2312 	ibt_send_wr_t	tx_wr;
2313 	int		i, nds;
2314 	ibt_wr_ds_t	sgl[DSEG_MAX];
2315 	uint_t		total_msg_size;
2316 	rib_qp_t	*qp;
2317 
2318 	qp = ctoqp(conn);
2319 
2320 	ASSERT(cl != NULL);
2321 
2322 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2323 
2324 	nds = 0;
2325 	total_msg_size = 0;
2326 	clp = cl;
2327 	while (clp != NULL) {
2328 		if (nds >= DSEG_MAX) {
2329 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2330 			return (RDMA_FAILED);
2331 		}
2332 		sgl[nds].ds_va = clp->w.c_saddr;
2333 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2334 		sgl[nds].ds_len = clp->c_len;
2335 		total_msg_size += clp->c_len;
2336 		clp = clp->c_next;
2337 		nds++;
2338 	}
2339 
2340 	if (send_sig) {
2341 		/* Set SEND_SIGNAL flag. */
2342 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2343 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2344 		*swid = (caddr_t)wdesc;
2345 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2346 		mutex_enter(&wdesc->sendwait_lock);
2347 		wdesc->nsbufs = nds;
2348 		for (i = 0; i < nds; i++) {
2349 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2350 		}
2351 	} else {
2352 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2353 		*swid = NULL;
2354 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2355 	}
2356 
2357 	tx_wr.wr_opcode = IBT_WRC_SEND;
2358 	tx_wr.wr_trans = IBT_RC_SRV;
2359 	tx_wr.wr_nds = nds;
2360 	tx_wr.wr_sgl = sgl;
2361 
2362 	mutex_enter(&conn->c_lock);
2363 	if (conn->c_state == C_CONNECTED) {
2364 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2365 	}
2366 	if (conn->c_state != C_CONNECTED ||
2367 	    ibt_status != IBT_SUCCESS) {
2368 		if (conn->c_state != C_DISCONN_PEND)
2369 			conn->c_state = C_ERROR_CONN;
2370 		mutex_exit(&conn->c_lock);
2371 		if (send_sig) {
2372 			for (i = 0; i < nds; i++) {
2373 				rib_rbuf_free(conn, SEND_BUFFER,
2374 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2375 			}
2376 			mutex_exit(&wdesc->sendwait_lock);
2377 			(void) rib_free_sendwait(wdesc);
2378 		}
2379 		return (RDMA_CONNLOST);
2380 	}
2381 
2382 	mutex_exit(&conn->c_lock);
2383 
2384 	if (send_sig) {
2385 		rib_send_hold(qp);
2386 		mutex_exit(&wdesc->sendwait_lock);
2387 		if (cv_sig) {
2388 			/*
2389 			 * cv_wait for send to complete.
2390 			 * We can fail due to a timeout or signal or
2391 			 * unsuccessful send.
2392 			 */
2393 			ret = rib_sendwait(qp, wdesc);
2394 
2395 			return (ret);
2396 		}
2397 	}
2398 
2399 	return (RDMA_SUCCESS);
2400 }
2401 
2402 
2403 rdma_stat
rib_send(CONN * conn,struct clist * cl,uint32_t msgid)2404 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2405 {
2406 	rdma_stat	ret;
2407 	caddr_t		wd;
2408 
2409 	/* send-wait & cv_signal */
2410 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2411 	return (ret);
2412 }
2413 
2414 /*
2415  * Deprecated/obsolete interface not used currently
2416  * but earlier used for READ-READ protocol.
2417  * Send RPC reply and wait for RDMA_DONE.
2418  */
2419 rdma_stat
rib_send_resp(CONN * conn,struct clist * cl,uint32_t msgid)2420 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2421 {
2422 	rdma_stat ret = RDMA_SUCCESS;
2423 	struct rdma_done_list *rd;
2424 	clock_t cv_wait_ret;
2425 	caddr_t *wid = NULL;
2426 	rib_qp_t *qp = ctoqp(conn);
2427 
2428 	mutex_enter(&qp->rdlist_lock);
2429 	rd = rdma_done_add(qp, msgid);
2430 
2431 	/* No cv_signal (whether send-wait or no-send-wait) */
2432 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2433 
2434 	if (ret != RDMA_SUCCESS) {
2435 		rdma_done_rm(qp, rd);
2436 	} else {
2437 		/*
2438 		 * Wait for RDMA_DONE from remote end
2439 		 */
2440 		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2441 		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2442 		    TR_CLOCK_TICK);
2443 
2444 		rdma_done_rm(qp, rd);
2445 
2446 		if (cv_wait_ret < 0) {
2447 			ret = RDMA_TIMEDOUT;
2448 		}
2449 	}
2450 
2451 	mutex_exit(&qp->rdlist_lock);
2452 	return (ret);
2453 }
2454 
2455 static struct recv_wid *
rib_create_wid(rib_qp_t * qp,ibt_wr_ds_t * sgl,uint32_t msgid)2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2457 {
2458 	struct recv_wid	*rwid;
2459 
2460 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2461 	rwid->xid = msgid;
2462 	rwid->addr = sgl->ds_va;
2463 	rwid->qp = qp;
2464 
2465 	return (rwid);
2466 }
2467 
2468 static void
rib_free_wid(struct recv_wid * rwid)2469 rib_free_wid(struct recv_wid *rwid)
2470 {
2471 	kmem_free(rwid, sizeof (struct recv_wid));
2472 }
2473 
2474 rdma_stat
rib_clnt_post(CONN * conn,struct clist * cl,uint32_t msgid)2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2476 {
2477 	rib_qp_t	*qp = ctoqp(conn);
2478 	struct clist	*clp = cl;
2479 	struct reply	*rep;
2480 	struct recv_wid	*rwid;
2481 	int		nds;
2482 	ibt_wr_ds_t	sgl[DSEG_MAX];
2483 	ibt_recv_wr_t	recv_wr;
2484 	rdma_stat	ret;
2485 	ibt_status_t	ibt_status;
2486 
2487 	/*
2488 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2489 	 */
2490 
2491 	nds = 0;
2492 	while (cl != NULL) {
2493 		if (nds >= DSEG_MAX) {
2494 			ret = RDMA_FAILED;
2495 			goto done;
2496 		}
2497 		sgl[nds].ds_va = cl->w.c_saddr;
2498 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2499 		sgl[nds].ds_len = cl->c_len;
2500 		cl = cl->c_next;
2501 		nds++;
2502 	}
2503 
2504 	if (nds != 1) {
2505 		ret = RDMA_FAILED;
2506 		goto done;
2507 	}
2508 
2509 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2510 	recv_wr.wr_nds = nds;
2511 	recv_wr.wr_sgl = sgl;
2512 
2513 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2514 	if (rwid) {
2515 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2516 	} else {
2517 		ret = RDMA_NORESOURCE;
2518 		goto done;
2519 	}
2520 	rep = rib_addreplylist(qp, msgid);
2521 	if (!rep) {
2522 		rib_free_wid(rwid);
2523 		ret = RDMA_NORESOURCE;
2524 		goto done;
2525 	}
2526 
2527 	mutex_enter(&conn->c_lock);
2528 
2529 	if (conn->c_state == C_CONNECTED) {
2530 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2531 	}
2532 
2533 	if (conn->c_state != C_CONNECTED ||
2534 	    ibt_status != IBT_SUCCESS) {
2535 		if (conn->c_state != C_DISCONN_PEND)
2536 			conn->c_state = C_ERROR_CONN;
2537 		mutex_exit(&conn->c_lock);
2538 		rib_free_wid(rwid);
2539 		(void) rib_rem_rep(qp, rep);
2540 		ret = RDMA_CONNLOST;
2541 		goto done;
2542 	}
2543 
2544 	mutex_enter(&qp->posted_rbufs_lock);
2545 	qp->n_posted_rbufs++;
2546 	mutex_exit(&qp->posted_rbufs_lock);
2547 
2548 	mutex_exit(&conn->c_lock);
2549 	return (RDMA_SUCCESS);
2550 
2551 done:
2552 	while (clp != NULL) {
2553 		rib_rbuf_free(conn, RECV_BUFFER,
2554 		    (void *)(uintptr_t)clp->w.c_saddr3);
2555 		clp = clp->c_next;
2556 	}
2557 	return (ret);
2558 }
2559 
2560 rdma_stat
rib_svc_post(CONN * conn,struct clist * cl)2561 rib_svc_post(CONN* conn, struct clist *cl)
2562 {
2563 	rib_qp_t	*qp = ctoqp(conn);
2564 	struct svc_recv	*s_recvp;
2565 	int		nds;
2566 	ibt_wr_ds_t	sgl[DSEG_MAX];
2567 	ibt_recv_wr_t	recv_wr;
2568 	ibt_status_t	ibt_status;
2569 
2570 	nds = 0;
2571 	while (cl != NULL) {
2572 		if (nds >= DSEG_MAX) {
2573 			return (RDMA_FAILED);
2574 		}
2575 		sgl[nds].ds_va = cl->w.c_saddr;
2576 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2577 		sgl[nds].ds_len = cl->c_len;
2578 		cl = cl->c_next;
2579 		nds++;
2580 	}
2581 
2582 	if (nds != 1) {
2583 		rib_rbuf_free(conn, RECV_BUFFER,
2584 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2585 
2586 		return (RDMA_FAILED);
2587 	}
2588 
2589 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2590 	recv_wr.wr_nds = nds;
2591 	recv_wr.wr_sgl = sgl;
2592 
2593 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2594 	/* Use s_recvp's addr as wr id */
2595 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2596 	mutex_enter(&conn->c_lock);
2597 	if (conn->c_state == C_CONNECTED) {
2598 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2599 	}
2600 	if (conn->c_state != C_CONNECTED ||
2601 	    ibt_status != IBT_SUCCESS) {
2602 		if (conn->c_state != C_DISCONN_PEND)
2603 			conn->c_state = C_ERROR_CONN;
2604 		mutex_exit(&conn->c_lock);
2605 		rib_rbuf_free(conn, RECV_BUFFER,
2606 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2607 		(void) rib_free_svc_recv(s_recvp);
2608 
2609 		return (RDMA_CONNLOST);
2610 	}
2611 	mutex_exit(&conn->c_lock);
2612 
2613 	return (RDMA_SUCCESS);
2614 }
2615 
2616 /* Client */
2617 rdma_stat
rib_post_resp(CONN * conn,struct clist * cl,uint32_t msgid)2618 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2619 {
2620 	return (rib_clnt_post(conn, cl, msgid));
2621 }
2622 
2623 /* Client */
2624 rdma_stat
rib_post_resp_remove(CONN * conn,uint32_t msgid)2625 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2626 {
2627 	rib_qp_t	*qp = ctoqp(conn);
2628 	struct reply	*rep;
2629 
2630 	mutex_enter(&qp->replylist_lock);
2631 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2632 		if (rep->xid == msgid) {
2633 			if (rep->vaddr_cq) {
2634 				rib_rbuf_free(conn, RECV_BUFFER,
2635 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2636 			}
2637 			(void) rib_remreply(qp, rep);
2638 			break;
2639 		}
2640 	}
2641 	mutex_exit(&qp->replylist_lock);
2642 
2643 	return (RDMA_SUCCESS);
2644 }
2645 
2646 /* Server */
2647 rdma_stat
rib_post_recv(CONN * conn,struct clist * cl)2648 rib_post_recv(CONN *conn, struct clist *cl)
2649 {
2650 	rib_qp_t	*qp = ctoqp(conn);
2651 
2652 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2653 		mutex_enter(&qp->posted_rbufs_lock);
2654 		qp->n_posted_rbufs++;
2655 		mutex_exit(&qp->posted_rbufs_lock);
2656 		return (RDMA_SUCCESS);
2657 	}
2658 	return (RDMA_FAILED);
2659 }
2660 
2661 /*
2662  * Client side only interface to "recv" the rpc reply buf
2663  * posted earlier by rib_post_resp(conn, cl, msgid).
2664  */
2665 rdma_stat
rib_recv(CONN * conn,struct clist ** clp,uint32_t msgid)2666 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2667 {
2668 	struct reply *rep = NULL;
2669 	clock_t timout, cv_wait_ret;
2670 	rdma_stat ret = RDMA_SUCCESS;
2671 	rib_qp_t *qp = ctoqp(conn);
2672 
2673 	/*
2674 	 * Find the reply structure for this msgid
2675 	 */
2676 	mutex_enter(&qp->replylist_lock);
2677 
2678 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2679 		if (rep->xid == msgid)
2680 			break;
2681 	}
2682 
2683 	if (rep != NULL) {
2684 		/*
2685 		 * If message not yet received, wait.
2686 		 */
2687 		if (rep->status == (uint_t)REPLY_WAIT) {
2688 			timout = ddi_get_lbolt() +
2689 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2690 
2691 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2692 			    &qp->replylist_lock, timout)) > 0 &&
2693 			    rep->status == (uint_t)REPLY_WAIT)
2694 				;
2695 
2696 			switch (cv_wait_ret) {
2697 			case -1:	/* timeout */
2698 				ret = RDMA_TIMEDOUT;
2699 				break;
2700 			case 0:
2701 				ret = RDMA_INTR;
2702 				break;
2703 			default:
2704 				break;
2705 			}
2706 		}
2707 
2708 		if (rep->status == RDMA_SUCCESS) {
2709 			struct clist *cl = NULL;
2710 
2711 			/*
2712 			 * Got message successfully
2713 			 */
2714 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2715 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2716 			*clp = cl;
2717 		} else {
2718 			if (rep->status != (uint_t)REPLY_WAIT) {
2719 				/*
2720 				 * Got error in reply message. Free
2721 				 * recv buffer here.
2722 				 */
2723 				ret = rep->status;
2724 				rib_rbuf_free(conn, RECV_BUFFER,
2725 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2726 			}
2727 		}
2728 		(void) rib_remreply(qp, rep);
2729 	} else {
2730 		/*
2731 		 * No matching reply structure found for given msgid on the
2732 		 * reply wait list.
2733 		 */
2734 		ret = RDMA_INVAL;
2735 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2736 	}
2737 
2738 	/*
2739 	 * Done.
2740 	 */
2741 	mutex_exit(&qp->replylist_lock);
2742 	return (ret);
2743 }
2744 
2745 /*
2746  * RDMA write a buffer to the remote address.
2747  */
2748 rdma_stat
rib_write(CONN * conn,struct clist * cl,int wait)2749 rib_write(CONN *conn, struct clist *cl, int wait)
2750 {
2751 	ibt_send_wr_t	tx_wr;
2752 	int		cv_sig;
2753 	ibt_wr_ds_t	sgl[DSEG_MAX];
2754 	struct send_wid	*wdesc;
2755 	ibt_status_t	ibt_status;
2756 	rdma_stat	ret = RDMA_SUCCESS;
2757 	rib_qp_t	*qp = ctoqp(conn);
2758 	uint64_t	n_writes = 0;
2759 
2760 	if (cl == NULL) {
2761 		return (RDMA_FAILED);
2762 	}
2763 
2764 	while ((cl != NULL)) {
2765 		if (cl->c_len > 0) {
2766 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2767 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2768 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2769 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2770 			sgl[0].ds_va = cl->w.c_saddr;
2771 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2772 			sgl[0].ds_len = cl->c_len;
2773 
2774 			if (wait) {
2775 				cv_sig = 1;
2776 			} else {
2777 				if (n_writes > max_unsignaled_rws) {
2778 					n_writes = 0;
2779 					cv_sig = 1;
2780 				} else {
2781 					cv_sig = 0;
2782 				}
2783 			}
2784 
2785 			if (cv_sig) {
2786 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2787 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2788 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2789 				mutex_enter(&wdesc->sendwait_lock);
2790 			} else {
2791 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2792 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2793 			}
2794 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2795 			tx_wr.wr_trans = IBT_RC_SRV;
2796 			tx_wr.wr_nds = 1;
2797 			tx_wr.wr_sgl = sgl;
2798 
2799 			mutex_enter(&conn->c_lock);
2800 			if (conn->c_state == C_CONNECTED) {
2801 				ibt_status =
2802 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2803 			}
2804 			if (conn->c_state != C_CONNECTED ||
2805 			    ibt_status != IBT_SUCCESS) {
2806 				if (conn->c_state != C_DISCONN_PEND)
2807 					conn->c_state = C_ERROR_CONN;
2808 				mutex_exit(&conn->c_lock);
2809 				if (cv_sig) {
2810 					mutex_exit(&wdesc->sendwait_lock);
2811 					(void) rib_free_sendwait(wdesc);
2812 				}
2813 				return (RDMA_CONNLOST);
2814 			}
2815 
2816 			mutex_exit(&conn->c_lock);
2817 
2818 			/*
2819 			 * Wait for send to complete
2820 			 */
2821 			if (cv_sig) {
2822 
2823 				rib_send_hold(qp);
2824 				mutex_exit(&wdesc->sendwait_lock);
2825 
2826 				ret = rib_sendwait(qp, wdesc);
2827 				if (ret != 0)
2828 					return (ret);
2829 			}
2830 			n_writes ++;
2831 		}
2832 		cl = cl->c_next;
2833 	}
2834 	return (RDMA_SUCCESS);
2835 }
2836 
2837 /*
2838  * RDMA Read a buffer from the remote address.
2839  */
2840 rdma_stat
rib_read(CONN * conn,struct clist * cl,int wait)2841 rib_read(CONN *conn, struct clist *cl, int wait)
2842 {
2843 	ibt_send_wr_t	rx_wr;
2844 	int		cv_sig = 0;
2845 	ibt_wr_ds_t	sgl;
2846 	struct send_wid	*wdesc;
2847 	ibt_status_t	ibt_status = IBT_SUCCESS;
2848 	rdma_stat	ret = RDMA_SUCCESS;
2849 	rib_qp_t	*qp = ctoqp(conn);
2850 
2851 	if (cl == NULL) {
2852 		return (RDMA_FAILED);
2853 	}
2854 
2855 	while (cl != NULL) {
2856 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2857 		/*
2858 		 * Remote address is at the head chunk item in list.
2859 		 */
2860 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2861 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2862 
2863 		sgl.ds_va = cl->u.c_daddr;
2864 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2865 		sgl.ds_len = cl->c_len;
2866 
2867 		/*
2868 		 * If there are multiple chunks to be read, and
2869 		 * wait is set, ask for signal only for the last chunk
2870 		 * and wait only on the last chunk. The completion of
2871 		 * RDMA_READ on last chunk ensures that reads on all
2872 		 * previous chunks are also completed.
2873 		 */
2874 		if (wait && (cl->c_next == NULL)) {
2875 			cv_sig = 1;
2876 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2877 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2878 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2879 			mutex_enter(&wdesc->sendwait_lock);
2880 		} else {
2881 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2882 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2883 		}
2884 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2885 		rx_wr.wr_trans = IBT_RC_SRV;
2886 		rx_wr.wr_nds = 1;
2887 		rx_wr.wr_sgl = &sgl;
2888 
2889 		mutex_enter(&conn->c_lock);
2890 		if (conn->c_state == C_CONNECTED) {
2891 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2892 		}
2893 		if (conn->c_state != C_CONNECTED ||
2894 		    ibt_status != IBT_SUCCESS) {
2895 			if (conn->c_state != C_DISCONN_PEND)
2896 				conn->c_state = C_ERROR_CONN;
2897 			mutex_exit(&conn->c_lock);
2898 			if (wait && (cl->c_next == NULL)) {
2899 				mutex_exit(&wdesc->sendwait_lock);
2900 				(void) rib_free_sendwait(wdesc);
2901 			}
2902 			return (RDMA_CONNLOST);
2903 		}
2904 
2905 		mutex_exit(&conn->c_lock);
2906 
2907 		/*
2908 		 * Wait for send to complete if this is the
2909 		 * last item in the list.
2910 		 */
2911 		if (wait && cl->c_next == NULL) {
2912 			rib_send_hold(qp);
2913 			mutex_exit(&wdesc->sendwait_lock);
2914 
2915 			ret = rib_sendwait(qp, wdesc);
2916 
2917 			if (ret != 0)
2918 				return (ret);
2919 		}
2920 		cl = cl->c_next;
2921 	}
2922 	return (RDMA_SUCCESS);
2923 }
2924 
2925 /*
2926  * rib_srv_cm_handler()
2927  *    Connection Manager callback to handle RC connection requests.
2928  */
2929 /* ARGSUSED */
2930 static ibt_cm_status_t
rib_srv_cm_handler(void * any,ibt_cm_event_t * event,ibt_cm_return_args_t * ret_args,void * priv_data,ibt_priv_data_len_t len)2931 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2932     ibt_cm_return_args_t *ret_args, void *priv_data,
2933     ibt_priv_data_len_t len)
2934 {
2935 	queue_t		*q;
2936 	rib_qp_t	*qp;
2937 	rib_hca_t	*hca;
2938 	rdma_stat	status = RDMA_SUCCESS;
2939 	int		i;
2940 	struct clist	cl;
2941 	rdma_buf_t	rdbuf = {0};
2942 	void		*buf = NULL;
2943 	CONN		*conn;
2944 	ibt_ip_cm_info_t	ipinfo;
2945 	struct sockaddr_in *s;
2946 	struct sockaddr_in6 *s6;
2947 	int sin_size = sizeof (struct sockaddr_in);
2948 	int in_size = sizeof (struct in_addr);
2949 	int sin6_size = sizeof (struct sockaddr_in6);
2950 
2951 	ASSERT(any != NULL);
2952 	ASSERT(event != NULL);
2953 
2954 	hca = (rib_hca_t *)any;
2955 
2956 	/* got a connection request */
2957 	switch (event->cm_type) {
2958 	case IBT_CM_EVENT_REQ_RCV:
2959 		/*
2960 		 * If the plugin is in the NO_ACCEPT state, bail out.
2961 		 */
2962 		mutex_enter(&plugin_state_lock);
2963 		if (plugin_state == NO_ACCEPT) {
2964 			mutex_exit(&plugin_state_lock);
2965 			return (IBT_CM_REJECT);
2966 		}
2967 		mutex_exit(&plugin_state_lock);
2968 
2969 		/*
2970 		 * Need to send a MRA MAD to CM so that it does not
2971 		 * timeout on us.
2972 		 */
2973 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2974 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2975 
2976 		mutex_enter(&rib_stat->open_hca_lock);
2977 		q = rib_stat->q;
2978 		mutex_exit(&rib_stat->open_hca_lock);
2979 
2980 		status = rib_svc_create_chan(hca, (caddr_t)q,
2981 		    event->cm_event.req.req_prim_hca_port, &qp);
2982 
2983 		if (status) {
2984 			return (IBT_CM_REJECT);
2985 		}
2986 
2987 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2988 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2989 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2990 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2991 
2992 		/*
2993 		 * Pre-posts RECV buffers
2994 		 */
2995 		conn = qptoc(qp);
2996 		for (i = 0; i < preposted_rbufs; i++) {
2997 			bzero(&rdbuf, sizeof (rdbuf));
2998 			rdbuf.type = RECV_BUFFER;
2999 			buf = rib_rbuf_alloc(conn, &rdbuf);
3000 			if (buf == NULL) {
3001 				/*
3002 				 * A connection is not established yet.
3003 				 * Just flush the channel. Buffers
3004 				 * posted till now will error out with
3005 				 * IBT_WC_WR_FLUSHED_ERR.
3006 				 */
3007 				(void) ibt_flush_channel(qp->qp_hdl);
3008 				(void) rib_disconnect_channel(conn, NULL);
3009 				return (IBT_CM_REJECT);
3010 			}
3011 
3012 			bzero(&cl, sizeof (cl));
3013 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3014 			cl.c_len = rdbuf.len;
3015 			cl.c_smemhandle.mrc_lmr =
3016 			    rdbuf.handle.mrc_lmr; /* lkey */
3017 			cl.c_next = NULL;
3018 			status = rib_post_recv(conn, &cl);
3019 			if (status != RDMA_SUCCESS) {
3020 				/*
3021 				 * A connection is not established yet.
3022 				 * Just flush the channel. Buffers
3023 				 * posted till now will error out with
3024 				 * IBT_WC_WR_FLUSHED_ERR.
3025 				 */
3026 				(void) ibt_flush_channel(qp->qp_hdl);
3027 				(void) rib_disconnect_channel(conn, NULL);
3028 				return (IBT_CM_REJECT);
3029 			}
3030 		}
3031 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
3032 
3033 		/*
3034 		 * Get the address translation
3035 		 */
3036 		rw_enter(&hca->state_lock, RW_READER);
3037 		if (hca->state == HCA_DETACHED) {
3038 			rw_exit(&hca->state_lock);
3039 			return (IBT_CM_REJECT);
3040 		}
3041 		rw_exit(&hca->state_lock);
3042 
3043 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3044 
3045 		if (ibt_get_ip_data(event->cm_priv_data_len,
3046 		    event->cm_priv_data,
3047 		    &ipinfo) != IBT_SUCCESS) {
3048 
3049 			return (IBT_CM_REJECT);
3050 		}
3051 
3052 		switch (ipinfo.src_addr.family) {
3053 		case AF_INET:
3054 
3055 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3056 			    KM_SLEEP);
3057 			(void) strcpy(conn->c_netid, RIBNETID_TCP);
3058 
3059 			conn->c_raddr.maxlen =
3060 			    conn->c_raddr.len = sin_size;
3061 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3062 
3063 			s = (struct sockaddr_in *)conn->c_raddr.buf;
3064 			s->sin_family = AF_INET;
3065 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3066 			    &s->sin_addr, in_size);
3067 
3068 			conn->c_laddr.maxlen =
3069 			    conn->c_laddr.len = sin_size;
3070 			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3071 
3072 			s = (struct sockaddr_in *)conn->c_laddr.buf;
3073 			s->sin_family = AF_INET;
3074 			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3075 			    &s->sin_addr, in_size);
3076 
3077 			conn->c_addrmask.maxlen = conn->c_addrmask.len =
3078 			    sizeof (struct sockaddr_in);
3079 			conn->c_addrmask.buf =
3080 			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3081 			((struct sockaddr_in *)
3082 			    conn->c_addrmask.buf)->sin_addr.s_addr =
3083 			    (uint32_t)~0;
3084 			((struct sockaddr_in *)
3085 			    conn->c_addrmask.buf)->sin_family =
3086 			    (sa_family_t)~0;
3087 			break;
3088 
3089 		case AF_INET6:
3090 
3091 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3092 			    KM_SLEEP);
3093 			(void) strcpy(conn->c_netid, RIBNETID_TCP6);
3094 
3095 			conn->c_raddr.maxlen =
3096 			    conn->c_raddr.len = sin6_size;
3097 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3098 
3099 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3100 			s6->sin6_family = AF_INET6;
3101 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3102 			    &s6->sin6_addr,
3103 			    sizeof (struct in6_addr));
3104 
3105 			conn->c_laddr.maxlen =
3106 			    conn->c_laddr.len = sin6_size;
3107 			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3108 
3109 			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3110 			s6->sin6_family = AF_INET6;
3111 			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3112 			    &s6->sin6_addr,
3113 			    sizeof (struct in6_addr));
3114 
3115 			conn->c_addrmask.maxlen = conn->c_addrmask.len =
3116 			    sizeof (struct sockaddr_in6);
3117 			conn->c_addrmask.buf =
3118 			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3119 			(void) memset(&((struct sockaddr_in6 *)
3120 			    conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3121 			    sizeof (struct in6_addr));
3122 			((struct sockaddr_in6 *)
3123 			    conn->c_addrmask.buf)->sin6_family =
3124 			    (sa_family_t)~0;
3125 			break;
3126 
3127 		default:
3128 			return (IBT_CM_REJECT);
3129 		}
3130 
3131 		break;
3132 
3133 	case IBT_CM_EVENT_CONN_CLOSED:
3134 	{
3135 		CONN		*conn;
3136 		rib_qp_t	*qp;
3137 
3138 		switch (event->cm_event.closed) {
3139 		case IBT_CM_CLOSED_DREP_RCVD:
3140 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
3141 		case IBT_CM_CLOSED_DUP:
3142 		case IBT_CM_CLOSED_ABORT:
3143 		case IBT_CM_CLOSED_ALREADY:
3144 			/*
3145 			 * These cases indicate the local end initiated
3146 			 * the closing of the channel. Nothing to do here.
3147 			 */
3148 			break;
3149 		default:
3150 			/*
3151 			 * Reason for CONN_CLOSED event must be one of
3152 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3153 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
3154 			 * the remote end is closing the channel. In these
3155 			 * cases free the channel and transition to error
3156 			 * state
3157 			 */
3158 			qp = ibt_get_chan_private(event->cm_channel);
3159 			conn = qptoc(qp);
3160 			mutex_enter(&conn->c_lock);
3161 			if (conn->c_state == C_DISCONN_PEND) {
3162 				mutex_exit(&conn->c_lock);
3163 				break;
3164 			}
3165 			conn->c_state = C_ERROR_CONN;
3166 
3167 			/*
3168 			 * Free the conn if c_ref goes down to 0
3169 			 */
3170 			if (conn->c_ref == 0) {
3171 				/*
3172 				 * Remove from list and free conn
3173 				 */
3174 				conn->c_state = C_DISCONN_PEND;
3175 				mutex_exit(&conn->c_lock);
3176 				(void) rib_disconnect_channel(conn,
3177 				    &hca->srv_conn_list);
3178 			} else {
3179 				/*
3180 				 * conn will be freed when c_ref goes to 0.
3181 				 * Indicate to cleaning thread not to close
3182 				 * the connection, but just free the channel.
3183 				 */
3184 				conn->c_flags |= C_CLOSE_NOTNEEDED;
3185 				mutex_exit(&conn->c_lock);
3186 			}
3187 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3188 			break;
3189 		}
3190 		break;
3191 	}
3192 	case IBT_CM_EVENT_CONN_EST:
3193 		/*
3194 		 * RTU received, hence connection established.
3195 		 */
3196 		if (rib_debug > 1)
3197 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3198 			    "(CONN_EST) channel established");
3199 		break;
3200 
3201 	default:
3202 		if (rib_debug > 2) {
3203 			/* Let CM handle the following events. */
3204 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3205 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3206 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3207 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3208 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3209 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3210 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3211 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3212 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3213 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3214 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3215 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3216 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3217 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3218 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3219 			}
3220 		}
3221 		return (IBT_CM_DEFAULT);
3222 	}
3223 
3224 	/* accept all other CM messages (i.e. let the CM handle them) */
3225 	return (IBT_CM_ACCEPT);
3226 }
3227 
3228 static rdma_stat
rib_register_service(rib_hca_t * hca,int service_type,uint8_t protocol_num,in_port_t dst_port)3229 rib_register_service(rib_hca_t *hca, int service_type,
3230     uint8_t protocol_num, in_port_t dst_port)
3231 {
3232 	ibt_srv_desc_t		sdesc;
3233 	ibt_hca_portinfo_t	*port_infop;
3234 	ib_svc_id_t		srv_id;
3235 	ibt_srv_hdl_t		srv_hdl;
3236 	uint_t			port_size;
3237 	uint_t			pki, i, num_ports, nbinds;
3238 	ibt_status_t		ibt_status;
3239 	rib_service_t		*service;
3240 	ib_pkey_t		pkey;
3241 
3242 	/*
3243 	 * Query all ports for the given HCA
3244 	 */
3245 	rw_enter(&hca->state_lock, RW_READER);
3246 	if (hca->state != HCA_DETACHED) {
3247 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3248 		    &num_ports, &port_size);
3249 		rw_exit(&hca->state_lock);
3250 	} else {
3251 		rw_exit(&hca->state_lock);
3252 		return (RDMA_FAILED);
3253 	}
3254 	if (ibt_status != IBT_SUCCESS) {
3255 		return (RDMA_FAILED);
3256 	}
3257 
3258 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3259 	    int, num_ports);
3260 
3261 	for (i = 0; i < num_ports; i++) {
3262 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3263 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3264 			    int, i+1);
3265 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3266 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3267 			    int, i+1);
3268 		}
3269 	}
3270 
3271 	/*
3272 	 * Get all the IP addresses on this system to register the
3273 	 * given "service type" on all DNS recognized IP addrs.
3274 	 * Each service type such as NFS will have all the systems
3275 	 * IP addresses as its different names. For now the only
3276 	 * type of service we support in RPCIB is NFS.
3277 	 */
3278 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3279 	/*
3280 	 * Start registering and binding service to active
3281 	 * on active ports on this HCA.
3282 	 */
3283 	nbinds = 0;
3284 	for (service = rib_stat->service_list;
3285 	    service && (service->srv_type != service_type);
3286 	    service = service->next)
3287 		;
3288 
3289 	if (service == NULL) {
3290 		/*
3291 		 * We use IP addresses as the service names for
3292 		 * service registration.  Register each of them
3293 		 * with CM to obtain a svc_id and svc_hdl.  We do not
3294 		 * register the service with machine's loopback address.
3295 		 */
3296 		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3297 		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3298 		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3299 		sdesc.sd_handler = rib_srv_cm_handler;
3300 		sdesc.sd_flags = 0;
3301 		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3302 		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3303 		    1, &srv_hdl, &srv_id);
3304 		if ((ibt_status != IBT_SUCCESS) &&
3305 		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3306 			rw_exit(&rib_stat->service_list_lock);
3307 			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3308 			    int, ibt_status);
3309 			ibt_free_portinfo(port_infop, port_size);
3310 			return (RDMA_FAILED);
3311 		}
3312 
3313 		/*
3314 		 * Allocate and prepare a service entry
3315 		 */
3316 		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3317 
3318 		service->srv_type = service_type;
3319 		service->srv_hdl = srv_hdl;
3320 		service->srv_id = srv_id;
3321 
3322 		service->next = rib_stat->service_list;
3323 		rib_stat->service_list = service;
3324 		DTRACE_PROBE1(rpcib__i__regservice__new__service,
3325 		    int, service->srv_type);
3326 	} else {
3327 		srv_hdl = service->srv_hdl;
3328 		srv_id = service->srv_id;
3329 		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3330 		    int, service->srv_type);
3331 	}
3332 
3333 	for (i = 0; i < num_ports; i++) {
3334 		ibt_sbind_hdl_t		sbp;
3335 		rib_hca_service_t	*hca_srv;
3336 		ib_gid_t		gid;
3337 
3338 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3339 			continue;
3340 
3341 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3342 			pkey = port_infop[i].p_pkey_tbl[pki];
3343 
3344 			rw_enter(&hca->bound_services_lock, RW_READER);
3345 			gid = port_infop[i].p_sgid_tbl[0];
3346 			for (hca_srv = hca->bound_services; hca_srv;
3347 			    hca_srv = hca_srv->next) {
3348 				if ((hca_srv->srv_id == service->srv_id) &&
3349 				    (hca_srv->gid.gid_prefix ==
3350 				    gid.gid_prefix) &&
3351 				    (hca_srv->gid.gid_guid == gid.gid_guid))
3352 					break;
3353 			}
3354 			rw_exit(&hca->bound_services_lock);
3355 			if (hca_srv != NULL) {
3356 				/*
3357 				 * port is alreay bound the the service
3358 				 */
3359 				DTRACE_PROBE1(
3360 				    rpcib__i__regservice__already__bound,
3361 				    int, i+1);
3362 				nbinds++;
3363 				continue;
3364 			}
3365 
3366 			if ((pkey & IBSRM_HB) &&
3367 			    (pkey != IB_PKEY_INVALID_FULL)) {
3368 
3369 				sbp = NULL;
3370 				ibt_status = ibt_bind_service(srv_hdl,
3371 				    gid, NULL, hca, &sbp);
3372 
3373 				if (ibt_status == IBT_SUCCESS) {
3374 					hca_srv = kmem_zalloc(
3375 					    sizeof (rib_hca_service_t),
3376 					    KM_SLEEP);
3377 					hca_srv->srv_id = srv_id;
3378 					hca_srv->gid = gid;
3379 					hca_srv->sbind_hdl = sbp;
3380 
3381 					rw_enter(&hca->bound_services_lock,
3382 					    RW_WRITER);
3383 					hca_srv->next = hca->bound_services;
3384 					hca->bound_services = hca_srv;
3385 					rw_exit(&hca->bound_services_lock);
3386 					nbinds++;
3387 				}
3388 
3389 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3390 				    int, ibt_status);
3391 			}
3392 		}
3393 	}
3394 	rw_exit(&rib_stat->service_list_lock);
3395 
3396 	ibt_free_portinfo(port_infop, port_size);
3397 
3398 	if (nbinds == 0) {
3399 		return (RDMA_FAILED);
3400 	} else {
3401 		/*
3402 		 * Put this plugin into accept state, since atleast
3403 		 * one registration was successful.
3404 		 */
3405 		mutex_enter(&plugin_state_lock);
3406 		plugin_state = ACCEPT;
3407 		mutex_exit(&plugin_state_lock);
3408 		return (RDMA_SUCCESS);
3409 	}
3410 }
3411 
3412 void
rib_listen(struct rdma_svc_data * rd)3413 rib_listen(struct rdma_svc_data *rd)
3414 {
3415 	rdma_stat status;
3416 	int n_listening = 0;
3417 	rib_hca_t *hca;
3418 
3419 	mutex_enter(&rib_stat->listen_lock);
3420 	/*
3421 	 * if rd parameter is NULL then it means that rib_stat->q is
3422 	 * already initialized by a call from RDMA and we just want to
3423 	 * add a newly attached HCA to the same listening state as other
3424 	 * HCAs.
3425 	 */
3426 	if (rd == NULL) {
3427 		if (rib_stat->q == NULL) {
3428 			mutex_exit(&rib_stat->listen_lock);
3429 			return;
3430 		}
3431 	} else {
3432 		rib_stat->q = &rd->q;
3433 	}
3434 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3435 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3436 		/*
3437 		 * First check if a hca is still attached
3438 		 */
3439 		rw_enter(&hca->state_lock, RW_READER);
3440 		if (hca->state != HCA_INITED) {
3441 			rw_exit(&hca->state_lock);
3442 			continue;
3443 		}
3444 		rw_exit(&hca->state_lock);
3445 
3446 		/*
3447 		 * Right now the only service type is NFS. Hence
3448 		 * force feed this value. Ideally to communicate
3449 		 * the service type it should be passed down in
3450 		 * rdma_svc_data.
3451 		 */
3452 		status = rib_register_service(hca, NFS,
3453 		    IPPROTO_TCP, nfs_rdma_port);
3454 		if (status == RDMA_SUCCESS)
3455 			n_listening++;
3456 	}
3457 	rw_exit(&rib_stat->hcas_list_lock);
3458 
3459 	/*
3460 	 * Service active on an HCA, check rd->err_code for more
3461 	 * explainable errors.
3462 	 */
3463 	if (rd) {
3464 		if (n_listening > 0) {
3465 			rd->active = 1;
3466 			rd->err_code = RDMA_SUCCESS;
3467 		} else {
3468 			rd->active = 0;
3469 			rd->err_code = RDMA_FAILED;
3470 		}
3471 	}
3472 	mutex_exit(&rib_stat->listen_lock);
3473 }
3474 
3475 /* XXXX */
3476 /* ARGSUSED */
3477 static void
rib_listen_stop(struct rdma_svc_data * svcdata)3478 rib_listen_stop(struct rdma_svc_data *svcdata)
3479 {
3480 	rib_hca_t		*hca;
3481 
3482 	mutex_enter(&rib_stat->listen_lock);
3483 	/*
3484 	 * KRPC called the RDMATF to stop the listeners, this means
3485 	 * stop sending incomming or recieved requests to KRPC master
3486 	 * transport handle for RDMA-IB. This is also means that the
3487 	 * master transport handle, responsible for us, is going away.
3488 	 */
3489 	mutex_enter(&plugin_state_lock);
3490 	plugin_state = NO_ACCEPT;
3491 	if (svcdata != NULL)
3492 		svcdata->active = 0;
3493 	mutex_exit(&plugin_state_lock);
3494 
3495 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3496 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3497 		/*
3498 		 * First check if a hca is still attached
3499 		 */
3500 		rw_enter(&hca->state_lock, RW_READER);
3501 		if (hca->state == HCA_DETACHED) {
3502 			rw_exit(&hca->state_lock);
3503 			continue;
3504 		}
3505 		rib_close_channels(&hca->srv_conn_list);
3506 		rib_stop_services(hca);
3507 		rw_exit(&hca->state_lock);
3508 	}
3509 	rw_exit(&rib_stat->hcas_list_lock);
3510 
3511 	/*
3512 	 * Avoid rib_listen() using the stale q field.
3513 	 * This could happen if a port goes up after all services
3514 	 * are already unregistered.
3515 	 */
3516 	rib_stat->q = NULL;
3517 	mutex_exit(&rib_stat->listen_lock);
3518 }
3519 
3520 /*
3521  * Traverse the HCA's service list to unbind and deregister services.
3522  * For each bound service of HCA to be removed, first find the corresponding
3523  * service handle (srv_hdl) and then unbind the service by calling
3524  * ibt_unbind_service().
3525  */
3526 static void
rib_stop_services(rib_hca_t * hca)3527 rib_stop_services(rib_hca_t *hca)
3528 {
3529 	rib_hca_service_t *srv_list, *to_remove;
3530 
3531 	/*
3532 	 * unbind and deregister the services for this service type.
3533 	 * Right now there is only one service type. In future it will
3534 	 * be passed down to this function.
3535 	 */
3536 	rw_enter(&hca->bound_services_lock, RW_READER);
3537 	srv_list = hca->bound_services;
3538 	hca->bound_services = NULL;
3539 	rw_exit(&hca->bound_services_lock);
3540 
3541 	while (srv_list != NULL) {
3542 		rib_service_t *sc;
3543 
3544 		to_remove = srv_list;
3545 		srv_list = to_remove->next;
3546 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3547 		for (sc = rib_stat->service_list;
3548 		    sc && (sc->srv_id != to_remove->srv_id);
3549 		    sc = sc->next)
3550 			;
3551 		/*
3552 		 * if sc is NULL then the service doesn't exist anymore,
3553 		 * probably just removed completely through rib_stat.
3554 		 */
3555 		if (sc != NULL)
3556 			(void) ibt_unbind_service(sc->srv_hdl,
3557 			    to_remove->sbind_hdl);
3558 		rw_exit(&rib_stat->service_list_lock);
3559 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3560 	}
3561 }
3562 
3563 static struct svc_recv *
rib_init_svc_recv(rib_qp_t * qp,ibt_wr_ds_t * sgl)3564 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3565 {
3566 	struct svc_recv	*recvp;
3567 
3568 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3569 	recvp->vaddr = sgl->ds_va;
3570 	recvp->qp = qp;
3571 	recvp->bytes_xfer = 0;
3572 	return (recvp);
3573 }
3574 
3575 static int
rib_free_svc_recv(struct svc_recv * recvp)3576 rib_free_svc_recv(struct svc_recv *recvp)
3577 {
3578 	kmem_free(recvp, sizeof (*recvp));
3579 
3580 	return (0);
3581 }
3582 
3583 static struct reply *
rib_addreplylist(rib_qp_t * qp,uint32_t msgid)3584 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3585 {
3586 	struct reply	*rep;
3587 
3588 
3589 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3590 	if (rep == NULL) {
3591 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3592 		return (NULL);
3593 	}
3594 	rep->xid = msgid;
3595 	rep->vaddr_cq = 0;
3596 	rep->bytes_xfer = 0;
3597 	rep->status = (uint_t)REPLY_WAIT;
3598 	rep->prev = NULL;
3599 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3600 
3601 	mutex_enter(&qp->replylist_lock);
3602 	if (qp->replylist) {
3603 		rep->next = qp->replylist;
3604 		qp->replylist->prev = rep;
3605 	}
3606 	qp->rep_list_size++;
3607 
3608 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3609 	    int, qp->rep_list_size);
3610 
3611 	qp->replylist = rep;
3612 	mutex_exit(&qp->replylist_lock);
3613 
3614 	return (rep);
3615 }
3616 
3617 static rdma_stat
rib_rem_replylist(rib_qp_t * qp)3618 rib_rem_replylist(rib_qp_t *qp)
3619 {
3620 	struct reply	*r, *n;
3621 
3622 	mutex_enter(&qp->replylist_lock);
3623 	for (r = qp->replylist; r != NULL; r = n) {
3624 		n = r->next;
3625 		(void) rib_remreply(qp, r);
3626 	}
3627 	mutex_exit(&qp->replylist_lock);
3628 
3629 	return (RDMA_SUCCESS);
3630 }
3631 
3632 static int
rib_remreply(rib_qp_t * qp,struct reply * rep)3633 rib_remreply(rib_qp_t *qp, struct reply *rep)
3634 {
3635 
3636 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3637 	if (rep->prev) {
3638 		rep->prev->next = rep->next;
3639 	}
3640 	if (rep->next) {
3641 		rep->next->prev = rep->prev;
3642 	}
3643 	if (qp->replylist == rep)
3644 		qp->replylist = rep->next;
3645 
3646 	cv_destroy(&rep->wait_cv);
3647 	qp->rep_list_size--;
3648 
3649 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3650 	    int, qp->rep_list_size);
3651 
3652 	kmem_free(rep, sizeof (*rep));
3653 
3654 	return (0);
3655 }
3656 
3657 rdma_stat
rib_registermem(CONN * conn,caddr_t adsp,caddr_t buf,uint_t buflen,struct mrc * buf_handle)3658 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3659     struct mrc *buf_handle)
3660 {
3661 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3662 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3663 	rdma_stat	status;
3664 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3665 
3666 	/*
3667 	 * Note: ALL buffer pools use the same memory type RDMARW.
3668 	 */
3669 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3670 	if (status == RDMA_SUCCESS) {
3671 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3672 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3673 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3674 	} else {
3675 		buf_handle->mrc_linfo = (uintptr_t)NULL;
3676 		buf_handle->mrc_lmr = 0;
3677 		buf_handle->mrc_rmr = 0;
3678 	}
3679 	return (status);
3680 }
3681 
3682 static rdma_stat
rib_reg_mem(rib_hca_t * hca,caddr_t adsp,caddr_t buf,uint_t size,ibt_mr_flags_t spec,ibt_mr_hdl_t * mr_hdlp,ibt_mr_desc_t * mr_descp)3683 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3684     ibt_mr_flags_t spec,
3685     ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3686 {
3687 	ibt_mr_attr_t	mem_attr;
3688 	ibt_status_t	ibt_status;
3689 	mem_attr.mr_vaddr = (uintptr_t)buf;
3690 	mem_attr.mr_len = (ib_msglen_t)size;
3691 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3692 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3693 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3694 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3695 
3696 	rw_enter(&hca->state_lock, RW_READER);
3697 	if (hca->state != HCA_DETACHED) {
3698 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3699 		    &mem_attr, mr_hdlp, mr_descp);
3700 		rw_exit(&hca->state_lock);
3701 	} else {
3702 		rw_exit(&hca->state_lock);
3703 		return (RDMA_FAILED);
3704 	}
3705 
3706 	if (ibt_status != IBT_SUCCESS) {
3707 		return (RDMA_FAILED);
3708 	}
3709 	return (RDMA_SUCCESS);
3710 }
3711 
3712 rdma_stat
rib_registermemsync(CONN * conn,caddr_t adsp,caddr_t buf,uint_t buflen,struct mrc * buf_handle,RIB_SYNCMEM_HANDLE * sync_handle,void * lrc)3713 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3714     struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3715 {
3716 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3717 	rib_lrc_entry_t *l;
3718 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3719 	rdma_stat	status;
3720 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3721 
3722 	/*
3723 	 * Non-coherent memory registration.
3724 	 */
3725 	l = (rib_lrc_entry_t *)lrc;
3726 	if (l) {
3727 		if (l->registered) {
3728 			buf_handle->mrc_linfo =
3729 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3730 			buf_handle->mrc_lmr =
3731 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3732 			buf_handle->mrc_rmr =
3733 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3734 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3735 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3736 			return (RDMA_SUCCESS);
3737 		} else {
3738 			/* Always register the whole buffer */
3739 			buf = (caddr_t)l->lrc_buf;
3740 			buflen = l->lrc_len;
3741 		}
3742 	}
3743 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3744 
3745 	if (status == RDMA_SUCCESS) {
3746 		if (l) {
3747 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3748 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3749 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3750 			l->registered		 = TRUE;
3751 		}
3752 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3753 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3754 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3755 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3756 	} else {
3757 		buf_handle->mrc_linfo = (uintptr_t)NULL;
3758 		buf_handle->mrc_lmr = 0;
3759 		buf_handle->mrc_rmr = 0;
3760 	}
3761 	return (status);
3762 }
3763 
3764 /* ARGSUSED */
3765 rdma_stat
rib_deregistermem(CONN * conn,caddr_t buf,struct mrc buf_handle)3766 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3767 {
3768 	rib_hca_t *hca = (ctoqp(conn))->hca;
3769 	/*
3770 	 * Allow memory deregistration even if HCA is
3771 	 * getting detached. Need all outstanding
3772 	 * memory registrations to be deregistered
3773 	 * before HCA_DETACH_EVENT can be accepted.
3774 	 */
3775 	(void) ibt_deregister_mr(hca->hca_hdl,
3776 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3777 	return (RDMA_SUCCESS);
3778 }
3779 
3780 /* ARGSUSED */
3781 rdma_stat
rib_deregistermemsync(CONN * conn,caddr_t buf,struct mrc buf_handle,RIB_SYNCMEM_HANDLE sync_handle,void * lrc)3782 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3783     RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3784 {
3785 	rib_lrc_entry_t *l;
3786 	l = (rib_lrc_entry_t *)lrc;
3787 	if (l)
3788 		if (l->registered)
3789 			return (RDMA_SUCCESS);
3790 
3791 	(void) rib_deregistermem(conn, buf, buf_handle);
3792 
3793 	return (RDMA_SUCCESS);
3794 }
3795 
3796 /* ARGSUSED */
3797 rdma_stat
rib_syncmem(CONN * conn,RIB_SYNCMEM_HANDLE shandle,caddr_t buf,int len,int cpu)3798 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3799     int len, int cpu)
3800 {
3801 	ibt_status_t	status;
3802 	rib_hca_t *hca = (ctoqp(conn))->hca;
3803 	ibt_mr_sync_t	mr_segment;
3804 
3805 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3806 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3807 	mr_segment.ms_len = (ib_memlen_t)len;
3808 	if (cpu) {
3809 		/* make incoming data visible to memory */
3810 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3811 	} else {
3812 		/* make memory changes visible to IO */
3813 		mr_segment.ms_flags = IBT_SYNC_READ;
3814 	}
3815 	rw_enter(&hca->state_lock, RW_READER);
3816 	if (hca->state != HCA_DETACHED) {
3817 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3818 		rw_exit(&hca->state_lock);
3819 	} else {
3820 		rw_exit(&hca->state_lock);
3821 		return (RDMA_FAILED);
3822 	}
3823 
3824 	if (status == IBT_SUCCESS)
3825 		return (RDMA_SUCCESS);
3826 	else {
3827 		return (RDMA_FAILED);
3828 	}
3829 }
3830 
3831 /*
3832  * XXXX	????
3833  */
3834 static rdma_stat
rib_getinfo(rdma_info_t * info)3835 rib_getinfo(rdma_info_t *info)
3836 {
3837 	/*
3838 	 * XXXX	Hack!
3839 	 */
3840 	info->addrlen = 16;
3841 	info->mts = 1000000;
3842 	info->mtu = 1000000;
3843 
3844 	return (RDMA_SUCCESS);
3845 }
3846 
3847 rib_bufpool_t *
rib_rbufpool_create(rib_hca_t * hca,int ptype,int num)3848 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3849 {
3850 	rib_bufpool_t	*rbp = NULL;
3851 	bufpool_t	*bp = NULL;
3852 	caddr_t		buf;
3853 	ibt_mr_attr_t	mem_attr;
3854 	ibt_status_t	ibt_status;
3855 	int		i, j;
3856 
3857 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3858 
3859 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3860 	    num * sizeof (void *), KM_SLEEP);
3861 
3862 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3863 	bp->numelems = num;
3864 
3865 
3866 	switch (ptype) {
3867 	case SEND_BUFFER:
3868 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3869 		bp->rsize = RPC_MSG_SZ;
3870 		break;
3871 	case RECV_BUFFER:
3872 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3873 		bp->rsize = RPC_BUF_SIZE;
3874 		break;
3875 	default:
3876 		goto fail;
3877 	}
3878 
3879 	/*
3880 	 * Register the pool.
3881 	 */
3882 	bp->bufsize = num * bp->rsize;
3883 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3884 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3885 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3886 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3887 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3888 	rw_enter(&hca->state_lock, RW_READER);
3889 
3890 	if (hca->state == HCA_DETACHED) {
3891 		rw_exit(&hca->state_lock);
3892 		goto fail;
3893 	}
3894 
3895 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3896 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3897 		mem_attr.mr_vaddr = (uintptr_t)buf;
3898 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3899 		mem_attr.mr_as = NULL;
3900 		ibt_status = ibt_register_mr(hca->hca_hdl,
3901 		    hca->pd_hdl, &mem_attr,
3902 		    &rbp->mr_hdl[i],
3903 		    &rbp->mr_desc[i]);
3904 		if (ibt_status != IBT_SUCCESS) {
3905 			for (j = 0; j < i; j++) {
3906 				(void) ibt_deregister_mr(hca->hca_hdl,
3907 				    rbp->mr_hdl[j]);
3908 			}
3909 			rw_exit(&hca->state_lock);
3910 			goto fail;
3911 		}
3912 	}
3913 	rw_exit(&hca->state_lock);
3914 	buf = (caddr_t)bp->buf;
3915 	for (i = 0; i < num; i++, buf += bp->rsize) {
3916 		bp->buflist[i] = (void *)buf;
3917 	}
3918 	bp->buffree = num - 1;	/* no. of free buffers */
3919 	rbp->bpool = bp;
3920 
3921 	return (rbp);
3922 fail:
3923 	if (bp) {
3924 		if (bp->buf)
3925 			kmem_free(bp->buf, bp->bufsize);
3926 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3927 	}
3928 	if (rbp) {
3929 		if (rbp->mr_hdl)
3930 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3931 		if (rbp->mr_desc)
3932 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3933 		kmem_free(rbp, sizeof (rib_bufpool_t));
3934 	}
3935 	return (NULL);
3936 }
3937 
3938 static void
rib_rbufpool_deregister(rib_hca_t * hca,int ptype)3939 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3940 {
3941 	int i;
3942 	rib_bufpool_t *rbp = NULL;
3943 	bufpool_t *bp;
3944 
3945 	/*
3946 	 * Obtain pool address based on type of pool
3947 	 */
3948 	switch (ptype) {
3949 		case SEND_BUFFER:
3950 			rbp = hca->send_pool;
3951 			break;
3952 		case RECV_BUFFER:
3953 			rbp = hca->recv_pool;
3954 			break;
3955 		default:
3956 			return;
3957 	}
3958 	if (rbp == NULL)
3959 		return;
3960 
3961 	bp = rbp->bpool;
3962 
3963 	/*
3964 	 * Deregister the pool memory and free it.
3965 	 */
3966 	for (i = 0; i < bp->numelems; i++) {
3967 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3968 	}
3969 }
3970 
3971 static void
rib_rbufpool_free(rib_hca_t * hca,int ptype)3972 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3973 {
3974 
3975 	rib_bufpool_t *rbp = NULL;
3976 	bufpool_t *bp;
3977 
3978 	/*
3979 	 * Obtain pool address based on type of pool
3980 	 */
3981 	switch (ptype) {
3982 		case SEND_BUFFER:
3983 			rbp = hca->send_pool;
3984 			break;
3985 		case RECV_BUFFER:
3986 			rbp = hca->recv_pool;
3987 			break;
3988 		default:
3989 			return;
3990 	}
3991 	if (rbp == NULL)
3992 		return;
3993 
3994 	bp = rbp->bpool;
3995 
3996 	/*
3997 	 * Free the pool memory.
3998 	 */
3999 	if (rbp->mr_hdl)
4000 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4001 
4002 	if (rbp->mr_desc)
4003 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4004 	if (bp->buf)
4005 		kmem_free(bp->buf, bp->bufsize);
4006 	mutex_destroy(&bp->buflock);
4007 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4008 	kmem_free(rbp, sizeof (rib_bufpool_t));
4009 }
4010 
4011 void
rib_rbufpool_destroy(rib_hca_t * hca,int ptype)4012 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4013 {
4014 	/*
4015 	 * Deregister the pool memory and free it.
4016 	 */
4017 	rib_rbufpool_deregister(hca, ptype);
4018 	rib_rbufpool_free(hca, ptype);
4019 }
4020 
4021 /*
4022  * Fetch a buffer from the pool of type specified in rdbuf->type.
4023  */
4024 static rdma_stat
rib_reg_buf_alloc(CONN * conn,rdma_buf_t * rdbuf)4025 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4026 {
4027 	rib_lrc_entry_t *rlep;
4028 
4029 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4030 		rlep = rib_get_cache_buf(conn, rdbuf->len);
4031 		rdbuf->rb_private =  (caddr_t)rlep;
4032 		rdbuf->addr = rlep->lrc_buf;
4033 		rdbuf->handle = rlep->lrc_mhandle;
4034 		return (RDMA_SUCCESS);
4035 	}
4036 
4037 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4038 	if (rdbuf->addr) {
4039 		switch (rdbuf->type) {
4040 		case SEND_BUFFER:
4041 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4042 			break;
4043 		case RECV_BUFFER:
4044 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4045 			break;
4046 		default:
4047 			rdbuf->len = 0;
4048 		}
4049 		return (RDMA_SUCCESS);
4050 	} else
4051 		return (RDMA_FAILED);
4052 }
4053 
4054 /*
4055  * Fetch a buffer of specified type.
4056  * Note that rdbuf->handle is mw's rkey.
4057  */
4058 static void *
rib_rbuf_alloc(CONN * conn,rdma_buf_t * rdbuf)4059 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4060 {
4061 	rib_qp_t	*qp = ctoqp(conn);
4062 	rib_hca_t	*hca = qp->hca;
4063 	rdma_btype	ptype = rdbuf->type;
4064 	void		*buf;
4065 	rib_bufpool_t	*rbp = NULL;
4066 	bufpool_t	*bp;
4067 	int		i;
4068 
4069 	/*
4070 	 * Obtain pool address based on type of pool
4071 	 */
4072 	switch (ptype) {
4073 	case SEND_BUFFER:
4074 		rbp = hca->send_pool;
4075 		break;
4076 	case RECV_BUFFER:
4077 		rbp = hca->recv_pool;
4078 		break;
4079 	default:
4080 		return (NULL);
4081 	}
4082 	if (rbp == NULL)
4083 		return (NULL);
4084 
4085 	bp = rbp->bpool;
4086 
4087 	mutex_enter(&bp->buflock);
4088 	if (bp->buffree < 0) {
4089 		mutex_exit(&bp->buflock);
4090 		return (NULL);
4091 	}
4092 
4093 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4094 	buf = bp->buflist[bp->buffree];
4095 	rdbuf->addr = buf;
4096 	rdbuf->len = bp->rsize;
4097 	for (i = bp->numelems - 1; i >= 0; i--) {
4098 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4099 			rdbuf->handle.mrc_rmr =
4100 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4101 			rdbuf->handle.mrc_linfo =
4102 			    (uintptr_t)rbp->mr_hdl[i];
4103 			rdbuf->handle.mrc_lmr =
4104 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4105 			bp->buffree--;
4106 
4107 			mutex_exit(&bp->buflock);
4108 
4109 			return (buf);
4110 		}
4111 	}
4112 
4113 	mutex_exit(&bp->buflock);
4114 
4115 	return (NULL);
4116 }
4117 
4118 static void
rib_reg_buf_free(CONN * conn,rdma_buf_t * rdbuf)4119 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4120 {
4121 
4122 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4123 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4124 		rdbuf->rb_private = NULL;
4125 		return;
4126 	}
4127 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4128 }
4129 
4130 static void
rib_rbuf_free(CONN * conn,int ptype,void * buf)4131 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4132 {
4133 	rib_qp_t *qp = ctoqp(conn);
4134 	rib_hca_t *hca = qp->hca;
4135 	rib_bufpool_t *rbp = NULL;
4136 	bufpool_t *bp;
4137 
4138 	/*
4139 	 * Obtain pool address based on type of pool
4140 	 */
4141 	switch (ptype) {
4142 	case SEND_BUFFER:
4143 		rbp = hca->send_pool;
4144 		break;
4145 	case RECV_BUFFER:
4146 		rbp = hca->recv_pool;
4147 		break;
4148 	default:
4149 		return;
4150 	}
4151 	if (rbp == NULL)
4152 		return;
4153 
4154 	bp = rbp->bpool;
4155 
4156 	mutex_enter(&bp->buflock);
4157 	if (++bp->buffree >= bp->numelems) {
4158 		/*
4159 		 * Should never happen
4160 		 */
4161 		bp->buffree--;
4162 	} else {
4163 		bp->buflist[bp->buffree] = buf;
4164 	}
4165 	mutex_exit(&bp->buflock);
4166 }
4167 
4168 static rdma_stat
rib_add_connlist(CONN * cn,rib_conn_list_t * connlist)4169 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4170 {
4171 	rw_enter(&connlist->conn_lock, RW_WRITER);
4172 	if (connlist->conn_hd) {
4173 		cn->c_next = connlist->conn_hd;
4174 		connlist->conn_hd->c_prev = cn;
4175 	}
4176 	connlist->conn_hd = cn;
4177 	rw_exit(&connlist->conn_lock);
4178 
4179 	return (RDMA_SUCCESS);
4180 }
4181 
4182 static rdma_stat
rib_rm_conn(CONN * cn,rib_conn_list_t * connlist)4183 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4184 {
4185 	rw_enter(&connlist->conn_lock, RW_WRITER);
4186 	if (cn->c_prev) {
4187 		cn->c_prev->c_next = cn->c_next;
4188 	}
4189 	if (cn->c_next) {
4190 		cn->c_next->c_prev = cn->c_prev;
4191 	}
4192 	if (connlist->conn_hd == cn)
4193 		connlist->conn_hd = cn->c_next;
4194 	rw_exit(&connlist->conn_lock);
4195 
4196 	return (RDMA_SUCCESS);
4197 }
4198 
4199 /* ARGSUSED */
4200 static rdma_stat
rib_conn_get(struct netbuf * s_svcaddr,struct netbuf * d_svcaddr,int addr_type,void * handle,CONN ** conn)4201 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4202     int addr_type, void *handle, CONN **conn)
4203 {
4204 	rdma_stat status;
4205 	rpcib_ping_t rpt;
4206 
4207 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4208 	return (status);
4209 }
4210 
4211 /*
4212  * rib_find_hca_connection
4213  *
4214  * if there is an existing connection to the specified address then
4215  * it will be returned in conn, otherwise conn will be set to NULL.
4216  * Also cleans up any connection that is in error state.
4217  */
4218 static int
rib_find_hca_connection(rib_hca_t * hca,struct netbuf * s_svcaddr,struct netbuf * d_svcaddr,CONN ** conn)4219 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4220     struct netbuf *d_svcaddr, CONN **conn)
4221 {
4222 	CONN *cn;
4223 	clock_t cv_stat, timout;
4224 
4225 	*conn = NULL;
4226 again:
4227 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4228 	cn = hca->cl_conn_list.conn_hd;
4229 	while (cn != NULL) {
4230 		/*
4231 		 * First, clear up any connection in the ERROR state
4232 		 */
4233 		mutex_enter(&cn->c_lock);
4234 		if (cn->c_state == C_ERROR_CONN) {
4235 			if (cn->c_ref == 0) {
4236 				/*
4237 				 * Remove connection from list and destroy it.
4238 				 */
4239 				cn->c_state = C_DISCONN_PEND;
4240 				mutex_exit(&cn->c_lock);
4241 				rw_exit(&hca->cl_conn_list.conn_lock);
4242 				rib_conn_close((void *)cn);
4243 				goto again;
4244 			}
4245 			mutex_exit(&cn->c_lock);
4246 			cn = cn->c_next;
4247 			continue;
4248 		}
4249 		if (cn->c_state == C_DISCONN_PEND) {
4250 			mutex_exit(&cn->c_lock);
4251 			cn = cn->c_next;
4252 			continue;
4253 		}
4254 
4255 		/*
4256 		 * source address is only checked for if there is one,
4257 		 * this is the case for retries.
4258 		 */
4259 		if ((cn->c_raddr.len == d_svcaddr->len) &&
4260 		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4261 		    d_svcaddr->len) == 0) &&
4262 		    ((s_svcaddr->len == 0) ||
4263 		    ((cn->c_laddr.len == s_svcaddr->len) &&
4264 		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4265 		    s_svcaddr->len) == 0)))) {
4266 			/*
4267 			 * Our connection. Give up conn list lock
4268 			 * as we are done traversing the list.
4269 			 */
4270 			rw_exit(&hca->cl_conn_list.conn_lock);
4271 			if (cn->c_state == C_CONNECTED) {
4272 				cn->c_ref++;	/* sharing a conn */
4273 				mutex_exit(&cn->c_lock);
4274 				*conn = cn;
4275 				return (RDMA_SUCCESS);
4276 			}
4277 			if (cn->c_state == C_CONN_PEND) {
4278 				/*
4279 				 * Hold a reference to this conn before
4280 				 * we give up the lock.
4281 				 */
4282 				cn->c_ref++;
4283 				timout =  ddi_get_lbolt() +
4284 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4285 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4286 				    &cn->c_lock, timout)) > 0 &&
4287 				    cn->c_state == C_CONN_PEND)
4288 					;
4289 				if (cv_stat == 0) {
4290 					(void) rib_conn_release_locked(cn);
4291 					return (RDMA_INTR);
4292 				}
4293 				if (cv_stat < 0) {
4294 					(void) rib_conn_release_locked(cn);
4295 					return (RDMA_TIMEDOUT);
4296 				}
4297 				if (cn->c_state == C_CONNECTED) {
4298 					*conn = cn;
4299 					mutex_exit(&cn->c_lock);
4300 					return (RDMA_SUCCESS);
4301 				} else {
4302 					(void) rib_conn_release_locked(cn);
4303 					return (RDMA_TIMEDOUT);
4304 				}
4305 			}
4306 		}
4307 		mutex_exit(&cn->c_lock);
4308 		cn = cn->c_next;
4309 	}
4310 	rw_exit(&hca->cl_conn_list.conn_lock);
4311 	*conn = NULL;
4312 	return (RDMA_FAILED);
4313 }
4314 
4315 /*
4316  * Connection management.
4317  * IBTF does not support recycling of channels. So connections are only
4318  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4319  * C_DISCONN_PEND state. No C_IDLE state.
4320  * C_CONN_PEND state: Connection establishment in progress to the server.
4321  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4322  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4323  * only in this state.
4324  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4325  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4326  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4327  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4328  * c_ref drops to 0 (this indicates that RPC has no more references to this
4329  * connection), the connection should be destroyed. A connection transitions
4330  * into this state when it is being destroyed.
4331  */
4332 /* ARGSUSED */
4333 static rdma_stat
rib_connect(struct netbuf * s_svcaddr,struct netbuf * d_svcaddr,int addr_type,rpcib_ping_t * rpt,CONN ** conn)4334 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4335     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4336 {
4337 	CONN *cn;
4338 	int status;
4339 	rib_hca_t *hca;
4340 	rib_qp_t *qp;
4341 	int s_addr_len;
4342 	char *s_addr_buf;
4343 
4344 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4345 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4346 		rw_enter(&hca->state_lock, RW_READER);
4347 		if (hca->state != HCA_DETACHED) {
4348 			status = rib_find_hca_connection(hca, s_svcaddr,
4349 			    d_svcaddr, conn);
4350 			rw_exit(&hca->state_lock);
4351 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4352 				rw_exit(&rib_stat->hcas_list_lock);
4353 				return (status);
4354 			}
4355 		} else
4356 			rw_exit(&hca->state_lock);
4357 	}
4358 	rw_exit(&rib_stat->hcas_list_lock);
4359 
4360 	/*
4361 	 * No existing connection found, establish a new connection.
4362 	 */
4363 	bzero(rpt, sizeof (rpcib_ping_t));
4364 
4365 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4366 	if (status != RDMA_SUCCESS) {
4367 		return (RDMA_FAILED);
4368 	}
4369 	hca = rpt->hca;
4370 
4371 	if (rpt->srcip.family == AF_INET) {
4372 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4373 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4374 	} else if (rpt->srcip.family == AF_INET6) {
4375 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4376 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4377 	} else {
4378 		return (RDMA_FAILED);
4379 	}
4380 
4381 	/*
4382 	 * Channel to server doesn't exist yet, create one.
4383 	 */
4384 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4385 		return (RDMA_FAILED);
4386 	}
4387 	cn = qptoc(qp);
4388 	cn->c_state = C_CONN_PEND;
4389 	cn->c_ref = 1;
4390 
4391 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4392 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4393 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4394 
4395 	if (rpt->srcip.family == AF_INET) {
4396 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4397 		(void) strcpy(cn->c_netid, RIBNETID_TCP);
4398 
4399 		cn->c_addrmask.len = cn->c_addrmask.maxlen =
4400 		    sizeof (struct sockaddr_in);
4401 		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4402 
4403 		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4404 		    (uint32_t)~0;
4405 		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4406 		    (ushort_t)~0;
4407 
4408 	} else {
4409 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4410 		(void) strcpy(cn->c_netid, RIBNETID_TCP6);
4411 
4412 		cn->c_addrmask.len = cn->c_addrmask.maxlen =
4413 		    sizeof (struct sockaddr_in6);
4414 		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4415 
4416 		(void) memset(
4417 		    &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4418 		    (uchar_t)~0, sizeof (struct in6_addr));
4419 		((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4420 		    (sa_family_t)~0;
4421 	}
4422 
4423 	/*
4424 	 * Add to conn list.
4425 	 * We had given up the READER lock. In the time since then,
4426 	 * another thread might have created the connection we are
4427 	 * trying here. But for now, that is quiet alright - there
4428 	 * might be two connections between a pair of hosts instead
4429 	 * of one. If we really want to close that window,
4430 	 * then need to check the list after acquiring the
4431 	 * WRITER lock.
4432 	 */
4433 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4434 	status = rib_conn_to_srv(hca, qp, rpt);
4435 	mutex_enter(&cn->c_lock);
4436 
4437 	if (cn->c_flags & C_CLOSE_PENDING) {
4438 		/*
4439 		 * This handles a case where the module or
4440 		 * HCA detached in the time a connection is
4441 		 * established. In such a case close the
4442 		 * connection immediately if this is the
4443 		 * only reference.
4444 		 */
4445 		if (cn->c_ref == 1) {
4446 			cn->c_ref--;
4447 			cn->c_state = C_DISCONN_PEND;
4448 			mutex_exit(&cn->c_lock);
4449 			rib_conn_close((void *)cn);
4450 			return (RDMA_FAILED);
4451 		}
4452 
4453 		/*
4454 		 * Connection to be closed later when c_ref = 0
4455 		 */
4456 		status = RDMA_FAILED;
4457 	}
4458 
4459 	if (status == RDMA_SUCCESS) {
4460 		cn->c_state = C_CONNECTED;
4461 		*conn = cn;
4462 	} else {
4463 		cn->c_state = C_ERROR_CONN;
4464 		cn->c_ref--;
4465 	}
4466 	cv_signal(&cn->c_cv);
4467 	mutex_exit(&cn->c_lock);
4468 	return (status);
4469 }
4470 
4471 static void
rib_conn_close(void * rarg)4472 rib_conn_close(void *rarg)
4473 {
4474 	CONN *conn = (CONN *)rarg;
4475 	rib_qp_t *qp = ctoqp(conn);
4476 
4477 	mutex_enter(&conn->c_lock);
4478 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4479 
4480 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4481 
4482 		/*
4483 		 * Live connection in CONNECTED state.
4484 		 */
4485 		if (conn->c_state == C_CONNECTED) {
4486 			conn->c_state = C_ERROR_CONN;
4487 		}
4488 		mutex_exit(&conn->c_lock);
4489 
4490 		rib_close_a_channel(conn);
4491 
4492 		mutex_enter(&conn->c_lock);
4493 		conn->c_flags &= ~C_CLOSE_PENDING;
4494 	}
4495 
4496 	mutex_exit(&conn->c_lock);
4497 
4498 	if (qp->mode == RIB_SERVER)
4499 		(void) rib_disconnect_channel(conn,
4500 		    &qp->hca->srv_conn_list);
4501 	else
4502 		(void) rib_disconnect_channel(conn,
4503 		    &qp->hca->cl_conn_list);
4504 }
4505 
4506 static void
rib_conn_timeout_call(void * carg)4507 rib_conn_timeout_call(void *carg)
4508 {
4509 	time_t idle_time;
4510 	CONN *conn = (CONN *)carg;
4511 	rib_hca_t *hca = ctoqp(conn)->hca;
4512 	int error;
4513 
4514 	mutex_enter(&conn->c_lock);
4515 	if ((conn->c_ref > 0) ||
4516 	    (conn->c_state == C_DISCONN_PEND)) {
4517 		conn->c_timeout = NULL;
4518 		mutex_exit(&conn->c_lock);
4519 		return;
4520 	}
4521 
4522 	idle_time = (gethrestime_sec() - conn->c_last_used);
4523 
4524 	if ((idle_time <= rib_conn_timeout) &&
4525 	    (conn->c_state != C_ERROR_CONN)) {
4526 		/*
4527 		 * There was activity after the last timeout.
4528 		 * Extend the conn life. Unless the conn is
4529 		 * already in error state.
4530 		 */
4531 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4532 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4533 		mutex_exit(&conn->c_lock);
4534 		return;
4535 	}
4536 
4537 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4538 	    (void *)conn, DDI_NOSLEEP);
4539 
4540 	/*
4541 	 * If taskq dispatch fails above, then reset the timeout
4542 	 * to try again after 10 secs.
4543 	 */
4544 
4545 	if (error != DDI_SUCCESS) {
4546 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4547 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4548 		mutex_exit(&conn->c_lock);
4549 		return;
4550 	}
4551 
4552 	conn->c_state = C_DISCONN_PEND;
4553 	mutex_exit(&conn->c_lock);
4554 }
4555 
4556 static rdma_stat
rib_conn_release(CONN * conn)4557 rib_conn_release(CONN *conn)
4558 {
4559 	mutex_enter(&conn->c_lock);
4560 	return (rib_conn_release_locked(conn));
4561 }
4562 
4563 /*
4564  * Expects conn->c_lock to be held on entry.
4565  * c_lock released on return
4566  */
4567 static rdma_stat
rib_conn_release_locked(CONN * conn)4568 rib_conn_release_locked(CONN *conn)
4569 {
4570 	conn->c_ref--;
4571 
4572 	conn->c_last_used = gethrestime_sec();
4573 	if (conn->c_ref > 0) {
4574 		mutex_exit(&conn->c_lock);
4575 		return (RDMA_SUCCESS);
4576 	}
4577 
4578 	/*
4579 	 * If a conn is C_ERROR_CONN, close the channel.
4580 	 */
4581 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4582 		conn->c_state = C_DISCONN_PEND;
4583 		mutex_exit(&conn->c_lock);
4584 		rib_conn_close((void *)conn);
4585 		return (RDMA_SUCCESS);
4586 	}
4587 
4588 	/*
4589 	 * c_ref == 0, set a timeout for conn release
4590 	 */
4591 
4592 	if (conn->c_timeout == NULL) {
4593 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4594 		    SEC_TO_TICK(rib_conn_timeout));
4595 	}
4596 
4597 	mutex_exit(&conn->c_lock);
4598 	return (RDMA_SUCCESS);
4599 }
4600 
4601 /*
4602  * Add at front of list
4603  */
4604 static struct rdma_done_list *
rdma_done_add(rib_qp_t * qp,uint32_t xid)4605 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4606 {
4607 	struct rdma_done_list *rd;
4608 
4609 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4610 
4611 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4612 	rd->xid = xid;
4613 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4614 
4615 	rd->prev = NULL;
4616 	rd->next = qp->rdlist;
4617 	if (qp->rdlist != NULL)
4618 		qp->rdlist->prev = rd;
4619 	qp->rdlist = rd;
4620 
4621 	return (rd);
4622 }
4623 
4624 static void
rdma_done_rm(rib_qp_t * qp,struct rdma_done_list * rd)4625 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4626 {
4627 	struct rdma_done_list *r;
4628 
4629 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4630 
4631 	r = rd->next;
4632 	if (r != NULL) {
4633 		r->prev = rd->prev;
4634 	}
4635 
4636 	r = rd->prev;
4637 	if (r != NULL) {
4638 		r->next = rd->next;
4639 	} else {
4640 		qp->rdlist = rd->next;
4641 	}
4642 
4643 	cv_destroy(&rd->rdma_done_cv);
4644 	kmem_free(rd, sizeof (*rd));
4645 }
4646 
4647 static void
rdma_done_rem_list(rib_qp_t * qp)4648 rdma_done_rem_list(rib_qp_t *qp)
4649 {
4650 	struct rdma_done_list	*r, *n;
4651 
4652 	mutex_enter(&qp->rdlist_lock);
4653 	for (r = qp->rdlist; r != NULL; r = n) {
4654 		n = r->next;
4655 		rdma_done_rm(qp, r);
4656 	}
4657 	mutex_exit(&qp->rdlist_lock);
4658 }
4659 
4660 static void
rdma_done_notify(rib_qp_t * qp,uint32_t xid)4661 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4662 {
4663 	struct rdma_done_list *r = qp->rdlist;
4664 
4665 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4666 
4667 	while (r) {
4668 		if (r->xid == xid) {
4669 			cv_signal(&r->rdma_done_cv);
4670 			return;
4671 		} else {
4672 			r = r->next;
4673 		}
4674 	}
4675 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4676 	    int, xid);
4677 }
4678 
4679 /*
4680  * Expects conn->c_lock to be held by the caller.
4681  */
4682 
4683 static void
rib_close_a_channel(CONN * conn)4684 rib_close_a_channel(CONN *conn)
4685 {
4686 	rib_qp_t	*qp;
4687 	qp = ctoqp(conn);
4688 
4689 	if (qp->qp_hdl == NULL) {
4690 		/* channel already freed */
4691 		return;
4692 	}
4693 
4694 	/*
4695 	 * Call ibt_close_rc_channel in blocking mode
4696 	 * with no callbacks.
4697 	 */
4698 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4699 	    NULL, 0, NULL, NULL, 0);
4700 }
4701 
4702 /*
4703  * Goes through all connections and closes the channel
4704  * This will cause all the WRs on those channels to be
4705  * flushed.
4706  */
4707 static void
rib_close_channels(rib_conn_list_t * connlist)4708 rib_close_channels(rib_conn_list_t *connlist)
4709 {
4710 	CONN		*conn, *tmp;
4711 
4712 	rw_enter(&connlist->conn_lock, RW_READER);
4713 	conn = connlist->conn_hd;
4714 	while (conn != NULL) {
4715 		mutex_enter(&conn->c_lock);
4716 		tmp = conn->c_next;
4717 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4718 
4719 			if (conn->c_state == C_CONN_PEND) {
4720 				conn->c_flags |= C_CLOSE_PENDING;
4721 				goto next;
4722 			}
4723 
4724 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4725 
4726 			/*
4727 			 * Live connection in CONNECTED state.
4728 			 */
4729 			if (conn->c_state == C_CONNECTED)
4730 				conn->c_state = C_ERROR_CONN;
4731 			mutex_exit(&conn->c_lock);
4732 
4733 			rib_close_a_channel(conn);
4734 
4735 			mutex_enter(&conn->c_lock);
4736 			conn->c_flags &= ~C_CLOSE_PENDING;
4737 			/* Signal a pending rib_disconnect_channel() */
4738 			cv_signal(&conn->c_cv);
4739 		}
4740 next:
4741 		mutex_exit(&conn->c_lock);
4742 		conn = tmp;
4743 	}
4744 	rw_exit(&connlist->conn_lock);
4745 }
4746 
4747 /*
4748  * Frees up all connections that are no longer being referenced
4749  */
4750 static void
rib_purge_connlist(rib_conn_list_t * connlist)4751 rib_purge_connlist(rib_conn_list_t *connlist)
4752 {
4753 	CONN		*conn;
4754 
4755 top:
4756 	rw_enter(&connlist->conn_lock, RW_READER);
4757 	conn = connlist->conn_hd;
4758 	while (conn != NULL) {
4759 		mutex_enter(&conn->c_lock);
4760 
4761 		/*
4762 		 * At this point connection is either in ERROR
4763 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4764 		 * then some other thread is culling that connection.
4765 		 * If not and if c_ref is 0, then destroy the connection.
4766 		 */
4767 		if (conn->c_ref == 0 &&
4768 		    conn->c_state != C_DISCONN_PEND) {
4769 			/*
4770 			 * Cull the connection
4771 			 */
4772 			conn->c_state = C_DISCONN_PEND;
4773 			mutex_exit(&conn->c_lock);
4774 			rw_exit(&connlist->conn_lock);
4775 			(void) rib_disconnect_channel(conn, connlist);
4776 			goto top;
4777 		} else {
4778 			/*
4779 			 * conn disconnect already scheduled or will
4780 			 * happen from conn_release when c_ref drops to 0.
4781 			 */
4782 			mutex_exit(&conn->c_lock);
4783 		}
4784 		conn = conn->c_next;
4785 	}
4786 	rw_exit(&connlist->conn_lock);
4787 
4788 	/*
4789 	 * At this point, only connections with c_ref != 0 are on the list
4790 	 */
4791 }
4792 
4793 /*
4794  * Free all the HCA resources and close
4795  * the hca.
4796  */
4797 
4798 static void
rib_free_hca(rib_hca_t * hca)4799 rib_free_hca(rib_hca_t *hca)
4800 {
4801 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4802 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4803 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4804 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4805 
4806 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4807 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4808 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4809 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4810 
4811 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4812 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4813 	rib_destroy_cache(hca);
4814 	if (rib_mod.rdma_count == 0)
4815 		(void) rdma_unregister_mod(&rib_mod);
4816 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4817 	(void) ibt_close_hca(hca->hca_hdl);
4818 	hca->hca_hdl = NULL;
4819 }
4820 
4821 
4822 static void
rib_stop_hca_services(rib_hca_t * hca)4823 rib_stop_hca_services(rib_hca_t *hca)
4824 {
4825 	rib_stop_services(hca);
4826 	rib_close_channels(&hca->cl_conn_list);
4827 	rib_close_channels(&hca->srv_conn_list);
4828 
4829 	rib_purge_connlist(&hca->cl_conn_list);
4830 	rib_purge_connlist(&hca->srv_conn_list);
4831 
4832 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4833 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4834 		    GLOBAL_ZONEID);
4835 		stats_enabled = FALSE;
4836 	}
4837 
4838 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4839 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4840 	if (hca->srv_conn_list.conn_hd == NULL &&
4841 	    hca->cl_conn_list.conn_hd == NULL) {
4842 		/*
4843 		 * conn_lists are NULL, so destroy
4844 		 * buffers, close hca and be done.
4845 		 */
4846 		rib_free_hca(hca);
4847 	}
4848 	rw_exit(&hca->cl_conn_list.conn_lock);
4849 	rw_exit(&hca->srv_conn_list.conn_lock);
4850 
4851 	if (hca->hca_hdl != NULL) {
4852 		mutex_enter(&hca->inuse_lock);
4853 		while (hca->inuse)
4854 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4855 		mutex_exit(&hca->inuse_lock);
4856 
4857 		rib_free_hca(hca);
4858 	}
4859 	rw_destroy(&hca->bound_services_lock);
4860 
4861 	if (hca->cleanup_helper != NULL) {
4862 		ddi_taskq_destroy(hca->cleanup_helper);
4863 		hca->cleanup_helper = NULL;
4864 	}
4865 }
4866 
4867 /*
4868  * Cleans and closes up all uses of the HCA
4869  */
4870 static void
rib_detach_hca(ibt_hca_hdl_t hca_hdl)4871 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4872 {
4873 	rib_hca_t *hca = NULL;
4874 	rib_hca_t **hcap;
4875 
4876 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4877 	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4878 		hca = *hcap;
4879 		rw_enter(&hca->state_lock, RW_WRITER);
4880 		if (hca->hca_hdl == hca_hdl) {
4881 			/*
4882 			 * Mark as detached and remove from
4883 			 * hca list.
4884 			 */
4885 			hca->state = HCA_DETACHED;
4886 			*hcap = hca->next;
4887 			rib_stat->nhca_inited--;
4888 			rib_mod.rdma_count--;
4889 			rw_exit(&hca->state_lock);
4890 			break;
4891 		}
4892 		rw_exit(&hca->state_lock);
4893 	}
4894 	rw_exit(&rib_stat->hcas_list_lock);
4895 
4896 	if (hca == NULL)
4897 		return;
4898 	ASSERT(hca->hca_hdl == hca_hdl);
4899 
4900 	/*
4901 	 * Stop all services on the HCA
4902 	 * Go through cl_conn_list and close all rc_channels
4903 	 * Go through svr_conn_list and close all rc_channels
4904 	 * Free connections whose c_ref has dropped to 0
4905 	 * Destroy all CQs
4906 	 * Deregister and released all buffer pool memory after all
4907 	 * connections are destroyed
4908 	 * Free the protection domain
4909 	 * ibt_close_hca()
4910 	 */
4911 	rib_stop_hca_services(hca);
4912 
4913 	kmem_free(hca, sizeof (*hca));
4914 }
4915 
4916 static void
rib_server_side_cache_reclaim(void * argp)4917 rib_server_side_cache_reclaim(void *argp)
4918 {
4919 	cache_avl_struct_t    *rcas;
4920 	rib_lrc_entry_t		*rb;
4921 	rib_hca_t *hca = (rib_hca_t *)argp;
4922 
4923 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4924 	rcas = avl_first(&hca->avl_tree);
4925 	if (rcas != NULL)
4926 		avl_remove(&hca->avl_tree, rcas);
4927 
4928 	while (rcas != NULL) {
4929 		while (rcas->r.forw != &rcas->r) {
4930 			rcas->elements--;
4931 			rb = rcas->r.forw;
4932 			remque(rb);
4933 			if (rb->registered)
4934 				(void) rib_deregistermem_via_hca(hca,
4935 				    rb->lrc_buf, rb->lrc_mhandle);
4936 
4937 			hca->cache_allocation -= rb->lrc_len;
4938 			kmem_free(rb->lrc_buf, rb->lrc_len);
4939 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4940 		}
4941 		mutex_destroy(&rcas->node_lock);
4942 		kmem_cache_free(hca->server_side_cache, rcas);
4943 		rcas = avl_first(&hca->avl_tree);
4944 		if (rcas != NULL)
4945 			avl_remove(&hca->avl_tree, rcas);
4946 	}
4947 	rw_exit(&hca->avl_rw_lock);
4948 }
4949 
4950 static void
rib_server_side_cache_cleanup(void * argp)4951 rib_server_side_cache_cleanup(void *argp)
4952 {
4953 	cache_avl_struct_t    *rcas;
4954 	rib_lrc_entry_t		*rb;
4955 	rib_hca_t *hca = (rib_hca_t *)argp;
4956 
4957 	mutex_enter(&hca->cache_allocation_lock);
4958 	if (hca->cache_allocation < cache_limit) {
4959 		mutex_exit(&hca->cache_allocation_lock);
4960 		return;
4961 	}
4962 	mutex_exit(&hca->cache_allocation_lock);
4963 
4964 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4965 	rcas = avl_last(&hca->avl_tree);
4966 	if (rcas != NULL)
4967 		avl_remove(&hca->avl_tree, rcas);
4968 
4969 	while (rcas != NULL) {
4970 		while (rcas->r.forw != &rcas->r) {
4971 			rcas->elements--;
4972 			rb = rcas->r.forw;
4973 			remque(rb);
4974 			if (rb->registered)
4975 				(void) rib_deregistermem_via_hca(hca,
4976 				    rb->lrc_buf, rb->lrc_mhandle);
4977 
4978 			hca->cache_allocation -= rb->lrc_len;
4979 
4980 			kmem_free(rb->lrc_buf, rb->lrc_len);
4981 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4982 		}
4983 		mutex_destroy(&rcas->node_lock);
4984 		if (hca->server_side_cache) {
4985 			kmem_cache_free(hca->server_side_cache, rcas);
4986 		}
4987 
4988 		if (hca->cache_allocation < cache_limit) {
4989 			rw_exit(&hca->avl_rw_lock);
4990 			return;
4991 		}
4992 
4993 		rcas = avl_last(&hca->avl_tree);
4994 		if (rcas != NULL)
4995 			avl_remove(&hca->avl_tree, rcas);
4996 	}
4997 	rw_exit(&hca->avl_rw_lock);
4998 }
4999 
5000 static int
avl_compare(const void * t1,const void * t2)5001 avl_compare(const void *t1, const void *t2)
5002 {
5003 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5004 		return (0);
5005 
5006 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5007 		return (-1);
5008 
5009 	return (1);
5010 }
5011 
5012 static void
rib_destroy_cache(rib_hca_t * hca)5013 rib_destroy_cache(rib_hca_t *hca)
5014 {
5015 	if (hca->avl_init) {
5016 		rib_server_side_cache_reclaim((void *)hca);
5017 		if (hca->server_side_cache) {
5018 			kmem_cache_destroy(hca->server_side_cache);
5019 			hca->server_side_cache = NULL;
5020 		}
5021 		avl_destroy(&hca->avl_tree);
5022 		mutex_destroy(&hca->cache_allocation_lock);
5023 		rw_destroy(&hca->avl_rw_lock);
5024 	}
5025 	hca->avl_init = FALSE;
5026 }
5027 
5028 static void
rib_force_cleanup(void * hca)5029 rib_force_cleanup(void *hca)
5030 {
5031 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5032 		(void) ddi_taskq_dispatch(
5033 		    ((rib_hca_t *)hca)->cleanup_helper,
5034 		    rib_server_side_cache_cleanup,
5035 		    (void *)hca, DDI_NOSLEEP);
5036 }
5037 
5038 static rib_lrc_entry_t *
rib_get_cache_buf(CONN * conn,uint32_t len)5039 rib_get_cache_buf(CONN *conn, uint32_t len)
5040 {
5041 	cache_avl_struct_t	cas, *rcas;
5042 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5043 	rib_lrc_entry_t *reply_buf;
5044 	avl_index_t where = (uintptr_t)NULL;
5045 	uint64_t c_alloc = 0;
5046 
5047 	if (!hca->avl_init)
5048 		goto  error_alloc;
5049 
5050 	cas.len = len;
5051 
5052 	rw_enter(&hca->avl_rw_lock, RW_READER);
5053 
5054 	mutex_enter(&hca->cache_allocation_lock);
5055 	c_alloc = hca->cache_allocation;
5056 	mutex_exit(&hca->cache_allocation_lock);
5057 
5058 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5059 	    &where)) == NULL) {
5060 		/* Am I above the cache limit */
5061 		if ((c_alloc + len) >= cache_limit) {
5062 			rib_force_cleanup((void *)hca);
5063 			rw_exit(&hca->avl_rw_lock);
5064 			mutex_enter(&hca->cache_allocation_lock);
5065 			hca->cache_misses_above_the_limit ++;
5066 			mutex_exit(&hca->cache_allocation_lock);
5067 
5068 			/* Allocate and register the buffer directly */
5069 			goto error_alloc;
5070 		}
5071 
5072 		rw_exit(&hca->avl_rw_lock);
5073 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
5074 
5075 		/* Recheck to make sure no other thread added the entry in */
5076 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5077 		    &cas, &where)) == NULL) {
5078 			/* Allocate an avl tree entry */
5079 			rcas = (cache_avl_struct_t *)
5080 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5081 
5082 			bzero(rcas, sizeof (cache_avl_struct_t));
5083 			rcas->elements = 0;
5084 			rcas->r.forw = &rcas->r;
5085 			rcas->r.back = &rcas->r;
5086 			rcas->len = len;
5087 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5088 			avl_insert(&hca->avl_tree, rcas, where);
5089 		}
5090 	}
5091 
5092 	mutex_enter(&rcas->node_lock);
5093 
5094 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5095 		reply_buf = rcas->r.forw;
5096 		remque(reply_buf);
5097 		rcas->elements--;
5098 		mutex_exit(&rcas->node_lock);
5099 		rw_exit(&hca->avl_rw_lock);
5100 
5101 		mutex_enter(&hca->cache_allocation_lock);
5102 		hca->cache_hits++;
5103 		hca->cache_allocation -= len;
5104 		mutex_exit(&hca->cache_allocation_lock);
5105 	} else {
5106 		/* Am I above the cache limit */
5107 		mutex_exit(&rcas->node_lock);
5108 		if ((c_alloc + len) >= cache_limit) {
5109 			rib_force_cleanup((void *)hca);
5110 			rw_exit(&hca->avl_rw_lock);
5111 
5112 			mutex_enter(&hca->cache_allocation_lock);
5113 			hca->cache_misses_above_the_limit++;
5114 			mutex_exit(&hca->cache_allocation_lock);
5115 			/* Allocate and register the buffer directly */
5116 			goto error_alloc;
5117 		}
5118 		rw_exit(&hca->avl_rw_lock);
5119 		mutex_enter(&hca->cache_allocation_lock);
5120 		hca->cache_misses++;
5121 		mutex_exit(&hca->cache_allocation_lock);
5122 		/* Allocate a reply_buf entry */
5123 		reply_buf = (rib_lrc_entry_t *)
5124 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5125 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5126 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5127 		reply_buf->lrc_len  = len;
5128 		reply_buf->registered = FALSE;
5129 		reply_buf->avl_node = (void *)rcas;
5130 	}
5131 
5132 	return (reply_buf);
5133 
5134 error_alloc:
5135 	reply_buf = (rib_lrc_entry_t *)
5136 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5137 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5138 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5139 	reply_buf->lrc_len = len;
5140 	reply_buf->registered = FALSE;
5141 	reply_buf->avl_node = NULL;
5142 
5143 	return (reply_buf);
5144 }
5145 
5146 /*
5147  * Return a pre-registered back to the cache (without
5148  * unregistering the buffer)..
5149  */
5150 
5151 static void
rib_free_cache_buf(CONN * conn,rib_lrc_entry_t * reg_buf)5152 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5153 {
5154 	cache_avl_struct_t    cas, *rcas;
5155 	avl_index_t where = (uintptr_t)NULL;
5156 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5157 
5158 	if (!hca->avl_init)
5159 		goto  error_free;
5160 
5161 	cas.len = reg_buf->lrc_len;
5162 	rw_enter(&hca->avl_rw_lock, RW_READER);
5163 	if ((rcas = (cache_avl_struct_t *)
5164 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5165 		rw_exit(&hca->avl_rw_lock);
5166 		goto error_free;
5167 	} else {
5168 		cas.len = reg_buf->lrc_len;
5169 		mutex_enter(&rcas->node_lock);
5170 		insque(reg_buf, &rcas->r);
5171 		rcas->elements ++;
5172 		mutex_exit(&rcas->node_lock);
5173 		rw_exit(&hca->avl_rw_lock);
5174 		mutex_enter(&hca->cache_allocation_lock);
5175 		hca->cache_allocation += cas.len;
5176 		mutex_exit(&hca->cache_allocation_lock);
5177 	}
5178 
5179 	return;
5180 
5181 error_free:
5182 
5183 	if (reg_buf->registered)
5184 		(void) rib_deregistermem_via_hca(hca,
5185 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5186 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5187 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5188 }
5189 
5190 static rdma_stat
rib_registermem_via_hca(rib_hca_t * hca,caddr_t adsp,caddr_t buf,uint_t buflen,struct mrc * buf_handle)5191 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5192     uint_t buflen, struct mrc *buf_handle)
5193 {
5194 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5195 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5196 	rdma_stat	status;
5197 
5198 
5199 	/*
5200 	 * Note: ALL buffer pools use the same memory type RDMARW.
5201 	 */
5202 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5203 	if (status == RDMA_SUCCESS) {
5204 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5205 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5206 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5207 	} else {
5208 		buf_handle->mrc_linfo = (uintptr_t)NULL;
5209 		buf_handle->mrc_lmr = 0;
5210 		buf_handle->mrc_rmr = 0;
5211 	}
5212 	return (status);
5213 }
5214 
5215 /* ARGSUSED */
5216 static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t * hca,caddr_t buf,struct mrc buf_handle,RIB_SYNCMEM_HANDLE sync_handle)5217 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5218     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5219 {
5220 
5221 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5222 	return (RDMA_SUCCESS);
5223 }
5224 
5225 /* ARGSUSED */
5226 static rdma_stat
rib_deregistermem_via_hca(rib_hca_t * hca,caddr_t buf,struct mrc buf_handle)5227 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5228 {
5229 
5230 	(void) ibt_deregister_mr(hca->hca_hdl,
5231 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5232 	return (RDMA_SUCCESS);
5233 }
5234 
5235 /*
5236  * Check if the IP interface named by `lifrp' is RDMA-capable.
5237  */
5238 static boolean_t
rpcib_rdma_capable_interface(struct lifreq * lifrp)5239 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5240 {
5241 	char ifname[LIFNAMSIZ];
5242 	char *cp;
5243 
5244 	if (lifrp->lifr_type == IFT_IB)
5245 		return (B_TRUE);
5246 
5247 	/*
5248 	 * Strip off the logical interface portion before getting
5249 	 * intimate with the name.
5250 	 */
5251 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5252 	if ((cp = strchr(ifname, ':')) != NULL)
5253 		*cp = '\0';
5254 
5255 	return (strcmp("lo0", ifname) == 0);
5256 }
5257 
5258 static int
rpcib_do_ip_ioctl(int cmd,int len,void * arg)5259 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5260 {
5261 	vnode_t *kkvp, *vp;
5262 	TIUSER  *tiptr;
5263 	struct  strioctl iocb;
5264 	k_sigset_t smask;
5265 	int	err = 0;
5266 
5267 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5268 		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5269 		    &tiptr, CRED()) == 0) {
5270 			vp = tiptr->fp->f_vnode;
5271 		} else {
5272 			VN_RELE(kkvp);
5273 			return (EPROTO);
5274 		}
5275 	} else {
5276 		return (EPROTO);
5277 	}
5278 
5279 	iocb.ic_cmd = cmd;
5280 	iocb.ic_timout = 0;
5281 	iocb.ic_len = len;
5282 	iocb.ic_dp = (caddr_t)arg;
5283 	sigintr(&smask, 0);
5284 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5285 	sigunintr(&smask);
5286 	(void) t_kclose(tiptr, 0);
5287 	VN_RELE(kkvp);
5288 	return (err);
5289 }
5290 
5291 /*
5292  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5293  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5294  */
5295 static int
rpcib_do_lifconf(struct lifconf * lifcp,uint_t * bufsizep)5296 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5297 {
5298 	int err;
5299 	struct lifnum lifn;
5300 
5301 	bzero(&lifn, sizeof (struct lifnum));
5302 	lifn.lifn_family = AF_UNSPEC;
5303 
5304 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5305 	if (err != 0)
5306 		return (err);
5307 
5308 	/*
5309 	 * Pad the interface count to account for additional interfaces that
5310 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5311 	 */
5312 	lifn.lifn_count += 4;
5313 
5314 	bzero(lifcp, sizeof (struct lifconf));
5315 	lifcp->lifc_family = AF_UNSPEC;
5316 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5317 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5318 
5319 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5320 	if (err != 0) {
5321 		kmem_free(lifcp->lifc_buf, *bufsizep);
5322 		return (err);
5323 	}
5324 	return (0);
5325 }
5326 
5327 static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t * addrs4,rpcib_ipaddrs_t * addrs6)5328 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5329 {
5330 	uint_t i, nifs;
5331 	uint_t bufsize;
5332 	struct lifconf lifc;
5333 	struct lifreq *lifrp;
5334 	struct sockaddr_in *sinp;
5335 	struct sockaddr_in6 *sin6p;
5336 
5337 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5338 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5339 
5340 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5341 		return (B_FALSE);
5342 
5343 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5344 		kmem_free(lifc.lifc_buf, bufsize);
5345 		return (B_FALSE);
5346 	}
5347 
5348 	/*
5349 	 * Worst case is that all of the addresses are IB-capable and have
5350 	 * the same address family, so size our buffers accordingly.
5351 	 */
5352 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5353 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5354 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5355 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5356 
5357 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5358 		if (!rpcib_rdma_capable_interface(lifrp))
5359 			continue;
5360 
5361 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5362 			sinp = addrs4->ri_list;
5363 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5364 			    sizeof (struct sockaddr_in));
5365 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5366 			sin6p = addrs6->ri_list;
5367 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5368 			    sizeof (struct sockaddr_in6));
5369 		}
5370 	}
5371 
5372 	kmem_free(lifc.lifc_buf, bufsize);
5373 	return (B_TRUE);
5374 }
5375 
5376 /* ARGSUSED */
5377 static int
rpcib_cache_kstat_update(kstat_t * ksp,int rw)5378 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5379 {
5380 	rib_hca_t *hca;
5381 
5382 	if (KSTAT_WRITE == rw) {
5383 		return (EACCES);
5384 	}
5385 
5386 	rpcib_kstat.cache_limit.value.ui64 =
5387 	    (uint64_t)cache_limit;
5388 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5389 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5390 		rpcib_kstat.cache_allocation.value.ui64 +=
5391 		    (uint64_t)hca->cache_allocation;
5392 		rpcib_kstat.cache_hits.value.ui64 +=
5393 		    (uint64_t)hca->cache_hits;
5394 		rpcib_kstat.cache_misses.value.ui64 +=
5395 		    (uint64_t)hca->cache_misses;
5396 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5397 		    (uint64_t)hca->cache_misses_above_the_limit;
5398 	}
5399 	rw_exit(&rib_stat->hcas_list_lock);
5400 	return (0);
5401 }
5402