xref: /illumos-gate/usr/src/uts/common/io/ib/clients/iser/iser_ib.c (revision 30e7468f8f41aa30ada067b2c1d5d284046514da)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ddi.h>
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <netinet/in.h>
31 #include <sys/sunddi.h>
32 #include <sys/sysmacros.h>
33 #include <sys/iscsi_protocol.h>
34 
35 #include <sys/ib/clients/iser/iser.h>
36 #include <sys/ib/clients/iser/iser_idm.h>
37 
38 /*
39  * iser_ib.c
40  * Routines for InfiniBand transport for iSER
41  *
42  * This file contains the routines to interface with the IBT API to attach and
43  * allocate IB resources, handle async events, and post recv work requests.
44  *
45  */
46 
47 static iser_hca_t *iser_ib_gid2hca(ib_gid_t gid);
48 static iser_hca_t *iser_ib_guid2hca(ib_guid_t guid);
49 
50 static iser_hca_t *iser_ib_alloc_hca(ib_guid_t guid);
51 static int iser_ib_free_hca(iser_hca_t *hca);
52 static int iser_ib_update_hcaports(iser_hca_t *hca);
53 static int iser_ib_init_hcas(void);
54 static int iser_ib_fini_hcas(void);
55 
56 static iser_sbind_t *iser_ib_get_bind(
57     iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid);
58 static int iser_ib_activate_port(
59     idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid);
60 static void iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid);
61 
62 static void iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size);
63 static void iser_ib_fini_qp(iser_qp_t *qp);
64 
65 static int iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size,
66     ibt_cq_hdl_t *cq_hdl);
67 
68 static void iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
69     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
70     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs);
71 
72 static void iser_ib_handle_portup_event(ibt_hca_hdl_t hdl,
73     ibt_async_event_t *event);
74 static void iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl,
75     ibt_async_event_t *event);
76 static void iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl,
77     ibt_async_event_t *event);
78 
/*
 * IBT client registration info, handed to ibt_attach() in iser_ib_init().
 * iser_ib_async_handler() receives the unaffiliated async events (port
 * up/down, HCA attach/detach) dispatched by IBTF.
 */
static struct ibt_clnt_modinfo_s iser_ib_modinfo = {
	IBTI_V_CURR,		/* IBTI interface version */
	IBT_STORAGE_DEV,	/* client class */
	iser_ib_async_handler,	/* async event handler */
	NULL,			/* no client-private modinfo payload */
	"iSER"			/* client name */
};
86 
87 /*
88  * iser_ib_init
89  *
90  * This function registers the HCA drivers with IBTF and registers and binds
91  * iSER as a service with IBTF.
92  */
93 int
94 iser_ib_init(void)
95 {
96 	int		status;
97 
98 	/* Register with IBTF */
99 	status = ibt_attach(&iser_ib_modinfo, iser_state->is_dip, iser_state,
100 	    &iser_state->is_ibhdl);
101 	if (status != DDI_SUCCESS) {
102 		ISER_LOG(CE_NOTE, "iser_ib_init: ibt_attach failed (0x%x)",
103 		    status);
104 		return (DDI_FAILURE);
105 	}
106 
107 	/* Create the global work request kmem_cache */
108 	iser_state->iser_wr_cache = kmem_cache_create("iser_wr_cache",
109 	    sizeof (iser_wr_t), 0, NULL, NULL, NULL,
110 	    iser_state, NULL, KM_SLEEP);
111 
112 	/* Populate our list of HCAs */
113 	status = iser_ib_init_hcas();
114 	if (status != DDI_SUCCESS) {
115 		/* HCAs failed to initialize, tear it down */
116 		kmem_cache_destroy(iser_state->iser_wr_cache);
117 		(void) ibt_detach(iser_state->is_ibhdl);
118 		iser_state->is_ibhdl = NULL;
119 		ISER_LOG(CE_NOTE, "iser_ib_init: failed to initialize HCAs");
120 		return (DDI_FAILURE);
121 	}
122 
123 	/* Target will register iSER as a service with IBTF when required */
124 
125 	/* Target will bind this service when it comes online */
126 
127 	return (DDI_SUCCESS);
128 }
129 
130 /*
131  * iser_ib_fini
132  *
133  * This function unbinds and degisters the iSER service from IBTF
134  */
135 int
136 iser_ib_fini(void)
137 {
138 	/* IDM would have already disabled all the services */
139 
140 	/* Teardown the HCA list and associated resources */
141 	if (iser_ib_fini_hcas() != DDI_SUCCESS)
142 		return (DDI_FAILURE);
143 
144 	/* Teardown the global work request kmem_cache */
145 	kmem_cache_destroy(iser_state->iser_wr_cache);
146 
147 	/* Deregister with IBTF */
148 	if (iser_state->is_ibhdl != NULL) {
149 		(void) ibt_detach(iser_state->is_ibhdl);
150 		iser_state->is_ibhdl = NULL;
151 	}
152 
153 	return (DDI_SUCCESS);
154 }
155 
156 /*
157  * iser_ib_register_service
158  *
159  * This function registers the iSER service using the RDMA-Aware Service ID.
160  */
161 int
162 iser_ib_register_service(idm_svc_t *idm_svc)
163 {
164 	ibt_srv_desc_t	srvdesc;
165 	iser_svc_t	*iser_svc;
166 	int		status;
167 
168 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
169 
170 	/* Set up IBTI client callback handler from the CM */
171 	srvdesc.sd_handler = iser_ib_cm_handler;
172 
173 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
174 
175 	iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
176 
177 	/* Register the service on the specified port */
178 	status = ibt_register_service(
179 	    iser_state->is_ibhdl, &srvdesc,
180 	    iser_svc->is_svcid, 1, &iser_svc->is_srvhdl, NULL);
181 
182 	return (status);
183 }
184 
185 /*
186  * iser_ib_bind_service
187  *
188  * This function binds a given iSER service on all available HCA ports
189  */
190 int
191 iser_ib_bind_service(idm_svc_t *idm_svc)
192 {
193 	iser_hca_t	*hca;
194 	ib_gid_t	gid;
195 	int		num_ports = 0;
196 	int		num_binds = 0;
197 	int		status;
198 	int		i;
199 
200 	ASSERT(idm_svc != NULL);
201 	ASSERT(idm_svc->is_iser_svc != NULL);
202 
203 	/* Register the iSER service on all available ports */
204 	mutex_enter(&iser_state->is_hcalist_lock);
205 
206 	for (hca = list_head(&iser_state->is_hcalist);
207 	    hca != NULL;
208 	    hca = list_next(&iser_state->is_hcalist, hca)) {
209 
210 		for (i = 0; i < hca->hca_num_ports; i++) {
211 			num_ports++;
212 			if (hca->hca_port_info[i].p_linkstate !=
213 			    IBT_PORT_ACTIVE) {
214 				/*
215 				 * Move on. We will attempt to bind service
216 				 * in our async handler if the port comes up
217 				 * at a later time.
218 				 */
219 				continue;
220 			}
221 
222 			gid = hca->hca_port_info[i].p_sgid_tbl[0];
223 
224 			/* If the port is already bound, skip */
225 			if (iser_ib_get_bind(
226 			    idm_svc->is_iser_svc, hca->hca_guid, gid) == NULL) {
227 
228 				status = iser_ib_activate_port(
229 				    idm_svc, hca->hca_guid, gid);
230 				if (status != IBT_SUCCESS) {
231 					ISER_LOG(CE_NOTE,
232 					    "iser_ib_bind_service: "
233 					    "iser_ib_activate_port failure "
234 					    "(0x%x)", status);
235 					continue;
236 				}
237 			}
238 			num_binds++;
239 		}
240 	}
241 	mutex_exit(&iser_state->is_hcalist_lock);
242 
243 	if (num_binds) {
244 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Service available on "
245 		    "(%d) of (%d) ports", num_binds, num_ports);
246 		return (ISER_STATUS_SUCCESS);
247 	} else {
248 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Did not bind service");
249 		return (ISER_STATUS_FAIL);
250 	}
251 }
252 
253 /*
254  * iser_ib_unbind_service
255  *
256  * This function unbinds a given service on a all HCA ports
257  */
258 void
259 iser_ib_unbind_service(idm_svc_t *idm_svc)
260 {
261 	iser_svc_t	*iser_svc;
262 	iser_sbind_t	*is_sbind, *next_sb;
263 
264 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
265 
266 		iser_svc = idm_svc->is_iser_svc;
267 
268 		for (is_sbind = list_head(&iser_svc->is_sbindlist);
269 		    is_sbind != NULL;
270 		    is_sbind = next_sb) {
271 			next_sb = list_next(&iser_svc->is_sbindlist, is_sbind);
272 			ibt_unbind_service(iser_svc->is_srvhdl,
273 			    is_sbind->is_sbindhdl);
274 			list_remove(&iser_svc->is_sbindlist, is_sbind);
275 			kmem_free(is_sbind, sizeof (iser_sbind_t));
276 		}
277 	}
278 }
279 
280 /* ARGSUSED */
281 void
282 iser_ib_deregister_service(idm_svc_t *idm_svc)
283 {
284 	iser_svc_t	*iser_svc;
285 
286 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
287 
288 		iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
289 		ibt_deregister_service(iser_state->is_ibhdl,
290 		    iser_svc->is_srvhdl);
291 		ibt_release_ip_sid(iser_svc->is_svcid);
292 	}
293 }
294 
295 /*
296  * iser_ib_get_paths
297  * This function finds the IB path between the local and the remote address.
298  *
299  */
300 int
301 iser_ib_get_paths(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip,
302     ibt_path_info_t *path, ibt_path_ip_src_t *path_src_ip)
303 {
304 	ibt_ip_path_attr_t	ipattr;
305 	int			status;
306 
307 	(void) bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
308 	ipattr.ipa_dst_ip	= remote_ip;
309 	ipattr.ipa_src_ip	= *local_ip;
310 	ipattr.ipa_max_paths	= 1;
311 	ipattr.ipa_ndst		= 1;
312 
313 	(void) bzero(path, sizeof (ibt_path_info_t));
314 	status = ibt_get_ip_paths(iser_state->is_ibhdl, IBT_PATH_NO_FLAGS,
315 	    &ipattr, path, NULL, path_src_ip);
316 	if (status != IBT_SUCCESS) {
317 		ISER_LOG(CE_NOTE, "ibt_get_ip_paths: ibt_get_ip_paths "
318 		    "failure: status (%d)", status);
319 		return (status);
320 	}
321 
322 	if (local_ip != NULL) {
323 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: IP[%x to %x]",
324 		    local_ip->un.ip4addr, remote_ip->un.ip4addr);
325 	} else {
326 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: "
327 		    "IP[INADDR_ANY to %x]", remote_ip->un.ip4addr);
328 	}
329 
330 	return (ISER_STATUS_SUCCESS);
331 }
332 
333 /*
334  * iser_ib_alloc_rc_channel
335  *
336  * This function allocates a reliable communication channel using the specified
337  * channel attributes.
338  */
339 iser_chan_t *
340 iser_ib_alloc_rc_channel(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip)
341 {
342 
343 	iser_chan_t			*chan;
344 	ib_gid_t			lgid;
345 	uint8_t				hca_port; /* from path */
346 	iser_hca_t			*hca;
347 	ibt_path_ip_src_t		path_src_ip;
348 	ibt_rc_chan_alloc_args_t	chanargs;
349 	uint_t				sq_size, rq_size;
350 	int				status;
351 
352 	chan = kmem_zalloc(sizeof (iser_chan_t), KM_SLEEP);
353 
354 	mutex_init(&chan->ic_lock, NULL, MUTEX_DRIVER, NULL);
355 	mutex_init(&chan->ic_sq_post_lock, NULL, MUTEX_DRIVER, NULL);
356 
357 	/* Lookup a path to the given destination */
358 	status = iser_ib_get_paths(local_ip, remote_ip, &chan->ic_ibt_path,
359 	    &path_src_ip);
360 
361 	if (status != ISER_STATUS_SUCCESS) {
362 		ISER_LOG(CE_NOTE, "iser_ib_get_paths failed: status (%d)",
363 		    status);
364 		mutex_destroy(&chan->ic_lock);
365 		mutex_destroy(&chan->ic_sq_post_lock);
366 		kmem_free(chan, sizeof (iser_chan_t));
367 		return (NULL);
368 	}
369 
370 	/* get the local gid from the path info */
371 	lgid = chan->ic_ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;
372 
373 	/* get the hca port from the path info */
374 	hca_port = chan->ic_ibt_path.pi_prim_cep_path.cep_hca_port_num;
375 
376 	/* Lookup the hca using the gid in the path info */
377 	hca = iser_ib_gid2hca(lgid);
378 	if (hca == NULL) {
379 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
380 		    "to lookup HCA handle");
381 		mutex_destroy(&chan->ic_lock);
382 		mutex_destroy(&chan->ic_sq_post_lock);
383 		kmem_free(chan, sizeof (iser_chan_t));
384 		return (NULL);
385 	}
386 
387 	/* Set up the iSER channel handle with HCA and IP data */
388 	chan->ic_hca		= hca;
389 	chan->ic_localip	= path_src_ip.ip_primary;
390 	chan->ic_remoteip	= *remote_ip;
391 
392 	/*
393 	 * Determine the queue sizes, based upon the HCA query data.
394 	 * For our Work Queues, we will use either our default value,
395 	 * or the HCA's maximum value, whichever is smaller.
396 	 */
397 	sq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_SENDQ_SIZE);
398 	rq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_RECVQ_SIZE);
399 
400 	/*
401 	 * For our Completion Queues, we again check the device maximum.
402 	 * We want to end up with CQs that are the next size up from the
403 	 * WQs they are servicing so that they have some overhead.
404 	 */
405 	if (hca->hca_attr.hca_max_cq_sz >= (sq_size + 1)) {
406 		chan->ic_sendcq_sz = sq_size + 1;
407 	} else {
408 		chan->ic_sendcq_sz = hca->hca_attr.hca_max_cq_sz;
409 		sq_size = chan->ic_sendcq_sz - 1;
410 	}
411 
412 	if (hca->hca_attr.hca_max_cq_sz >= (rq_size + 1)) {
413 		chan->ic_recvcq_sz = rq_size + 1;
414 	} else {
415 		chan->ic_recvcq_sz = hca->hca_attr.hca_max_cq_sz;
416 		rq_size = chan->ic_recvcq_sz - 1;
417 	}
418 
419 	/* Initialize the iSER channel's QP handle */
420 	iser_ib_init_qp(chan, sq_size, rq_size);
421 
422 	/* Set up the Send Completion Queue */
423 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_sendcq_sz,
424 	    &chan->ic_sendcq);
425 	if (status != ISER_STATUS_SUCCESS) {
426 		iser_ib_fini_qp(&chan->ic_qp);
427 		mutex_destroy(&chan->ic_lock);
428 		mutex_destroy(&chan->ic_sq_post_lock);
429 		kmem_free(chan, sizeof (iser_chan_t));
430 		return (NULL);
431 	}
432 	ibt_set_cq_handler(chan->ic_sendcq, iser_ib_sendcq_handler, chan);
433 	ibt_enable_cq_notify(chan->ic_sendcq, IBT_NEXT_COMPLETION);
434 
435 	/* Set up the Receive Completion Queue */
436 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_recvcq_sz,
437 	    &chan->ic_recvcq);
438 	if (status != ISER_STATUS_SUCCESS) {
439 		(void) ibt_free_cq(chan->ic_sendcq);
440 		iser_ib_fini_qp(&chan->ic_qp);
441 		mutex_destroy(&chan->ic_lock);
442 		mutex_destroy(&chan->ic_sq_post_lock);
443 		kmem_free(chan, sizeof (iser_chan_t));
444 		return (NULL);
445 	}
446 	ibt_set_cq_handler(chan->ic_recvcq, iser_ib_recvcq_handler, chan);
447 	ibt_enable_cq_notify(chan->ic_recvcq, IBT_NEXT_COMPLETION);
448 
449 	/* Setup the channel arguments */
450 	iser_ib_setup_chanargs(hca_port, chan->ic_sendcq, chan->ic_recvcq,
451 	    sq_size, rq_size, hca->hca_pdhdl, &chanargs);
452 
453 	status = ibt_alloc_rc_channel(hca->hca_hdl,
454 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chan->ic_chanhdl, NULL);
455 	if (status != IBT_SUCCESS) {
456 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
457 		    "ibt_alloc_rc_channel: status (%d)", status);
458 		(void) ibt_free_cq(chan->ic_sendcq);
459 		(void) ibt_free_cq(chan->ic_recvcq);
460 		iser_ib_fini_qp(&chan->ic_qp);
461 		mutex_destroy(&chan->ic_lock);
462 		mutex_destroy(&chan->ic_sq_post_lock);
463 		kmem_free(chan, sizeof (iser_chan_t));
464 		return (NULL);
465 	}
466 
467 	/* Set the 'channel' as the client private data */
468 	(void) ibt_set_chan_private(chan->ic_chanhdl, chan);
469 
470 	ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel success: "
471 	    "chanhdl (0x%p), IP:[%llx to %llx], lgid (%llx:%llx), HCA(%llx) %d",
472 	    (void *)chan->ic_chanhdl,
473 	    (longlong_t)local_ip->un.ip4addr,
474 	    (longlong_t)remote_ip->un.ip4addr,
475 	    (longlong_t)lgid.gid_prefix, (longlong_t)lgid.gid_guid,
476 	    (longlong_t)hca->hca_guid, hca_port);
477 
478 	return (chan);
479 }
480 
481 /*
482  * iser_ib_open_rc_channel
483  * This function opens a RC connection on the given allocated RC channel
484  */
485 int
486 iser_ib_open_rc_channel(iser_chan_t *chan)
487 {
488 	ibt_ip_cm_info_t	ipcm_info;
489 	iser_private_data_t	iser_priv_data;
490 	ibt_chan_open_args_t	ocargs;
491 	ibt_rc_returns_t	ocreturns;
492 	int			status;
493 
494 	mutex_enter(&chan->ic_lock);
495 
496 	/*
497 	 * For connection establishment, the initiator sends a CM REQ using the
498 	 * iSER RDMA-Aware Service ID. Included are the source and destination
499 	 * IP addresses, and the src port.
500 	 */
501 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
502 	ipcm_info.src_addr = chan->ic_localip;
503 	ipcm_info.dst_addr = chan->ic_remoteip;
504 	ipcm_info.src_port = chan->ic_lport;
505 
506 	/*
507 	 * The CM Private Data field defines the iSER connection parameters
508 	 * such as zero based virtual address exception (ZBVAE) and Send with
509 	 * invalidate Exception (SIE).
510 	 *
511 	 * Solaris IBT does not currently support ZBVAE or SIE.
512 	 */
513 	iser_priv_data.rsvd1	= 0;
514 	iser_priv_data.sie	= 1;
515 	iser_priv_data.zbvae	= 1;
516 
517 	status = ibt_format_ip_private_data(&ipcm_info,
518 	    sizeof (iser_private_data_t), &iser_priv_data);
519 	if (status != IBT_SUCCESS) {
520 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
521 		mutex_exit(&chan->ic_lock);
522 		return (status);
523 	}
524 
525 	/*
526 	 * Set the SID we are attempting to connect to, based upon the
527 	 * remote port number.
528 	 */
529 	chan->ic_ibt_path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, chan->ic_rport);
530 
531 	/* Set up the args for the channel open */
532 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
533 	ocargs.oc_path			= &chan->ic_ibt_path;
534 	ocargs.oc_cm_handler		= iser_ib_cm_handler;
535 	ocargs.oc_cm_clnt_private	= iser_state;
536 	ocargs.oc_rdma_ra_out		= 4;
537 	ocargs.oc_rdma_ra_in		= 4;
538 	ocargs.oc_path_retry_cnt	= 2;
539 	ocargs.oc_path_rnr_retry_cnt	= 2;
540 	ocargs.oc_priv_data_len		= sizeof (iser_private_data_t);
541 	ocargs.oc_priv_data		= &iser_priv_data;
542 
543 	bzero(&ocreturns, sizeof (ibt_rc_returns_t));
544 
545 	status = ibt_open_rc_channel(chan->ic_chanhdl,
546 	    IBT_OCHAN_NO_FLAGS, IBT_BLOCKING, &ocargs, &ocreturns);
547 
548 	if (status != IBT_SUCCESS) {
549 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
550 		mutex_exit(&chan->ic_lock);
551 		return (status);
552 	}
553 
554 	mutex_exit(&chan->ic_lock);
555 	return (IDM_STATUS_SUCCESS);
556 }
557 
558 /*
559  * iser_ib_close_rc_channel
560  * This function closes the RC channel related to this iser_chan handle.
561  * We invoke this in a non-blocking, no callbacks context.
562  */
563 void
564 iser_ib_close_rc_channel(iser_chan_t *chan)
565 {
566 	int			status;
567 
568 	mutex_enter(&chan->ic_lock);
569 	status = ibt_close_rc_channel(chan->ic_chanhdl, IBT_BLOCKING, NULL,
570 	    0, NULL, NULL, 0);
571 	if (status != IBT_SUCCESS) {
572 		ISER_LOG(CE_NOTE, "iser_ib_close_rc_channel: "
573 		    "ibt_close_rc_channel failed: status (%d)", status);
574 	}
575 	mutex_exit(&chan->ic_lock);
576 }
577 
/*
 * iser_ib_free_rc_channel
 *
 * This function tears down an RC channel's QP initialization and frees it.
 * Note that we do not need synchronization here; the channel has been
 * closed already, so we should only have completion polling occuring.  Once
 * complete, we are free to free the IBTF channel, WQ and CQ resources, and
 * our own related resources.
 *
 * NOTE(review): the waits below drop and re-take chan->ic_conn->ic_lock,
 * which implies the caller holds that lock on entry -- confirm at the
 * call sites before changing the locking here.
 */
void
iser_ib_free_rc_channel(iser_chan_t *chan)
{
	iser_qp_t	*iser_qp;

	iser_qp = &chan->ic_qp;

	/*
	 * Ensure the SQ is empty: poll ic_sq_post_count, releasing the
	 * connection lock for half a second each pass so outstanding send
	 * completions can be processed and the count can drain.
	 */
	while (chan->ic_sq_post_count != 0) {
		mutex_exit(&chan->ic_conn->ic_lock);
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		mutex_enter(&chan->ic_conn->ic_lock);
	}
	mutex_destroy(&chan->ic_sq_post_lock);

	/*
	 * Ensure the RQ is empty: flush the channel so pending recv WRs
	 * complete with a flush status, then wait for rq_level to reach
	 * zero, again yielding both locks between polls.
	 */
	(void) ibt_flush_channel(chan->ic_chanhdl);
	mutex_enter(&iser_qp->qp_lock);
	while (iser_qp->rq_level != 0) {
		mutex_exit(&iser_qp->qp_lock);
		mutex_exit(&chan->ic_conn->ic_lock);
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		mutex_enter(&chan->ic_conn->ic_lock);
		mutex_enter(&iser_qp->qp_lock);
	}

	/* Free our QP handle */
	mutex_exit(&iser_qp->qp_lock);
	(void) iser_ib_fini_qp(iser_qp);

	/* Free the IBT channel resources */
	(void) ibt_free_channel(chan->ic_chanhdl);
	chan->ic_chanhdl = NULL;

	/* Free the CQs */
	ibt_free_cq(chan->ic_sendcq);
	ibt_free_cq(chan->ic_recvcq);

	/* Free the chan handle */
	mutex_destroy(&chan->ic_lock);
	kmem_free(chan, sizeof (iser_chan_t));
}
629 
/*
 * iser_ib_post_recv
 *
 * This function handles keeping the RQ full on a given channel.
 * This routine will mostly be run on a taskq, and will check the
 * current fill level of the RQ, and post as many WRs as necessary
 * to fill it again.
 *
 * The rq_taskqpending flag (set B_TRUE by whoever dispatched us) is the
 * single-dispatcher latch: it is cleared here on every path that does
 * not hand off to another taskq invocation.
 */
void
iser_ib_post_recv(void *arg)
{
	ibt_channel_hdl_t chanhdl;
	iser_chan_t	*chan;
	iser_hca_t	*hca;
	iser_msg_t	*msg;
	ibt_recv_wr_t	*wrlist, wr[ISER_IB_RQ_POST_MAX];
	int		rq_space, msg_ret;
	int		total_num, npost;
	uint_t		nposted;
	int		status, i;
	iser_qp_t	*iser_qp;
	ib_gid_t	lgid;

	chanhdl = (ibt_channel_hdl_t)arg;

	/* Pull our iSER channel handle from the private data */
	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);

	/* It is possible to run after the channel has been freed */
	if (chan == NULL) {
		return;
	}
	mutex_enter(&chan->ic_conn->ic_lock);

	/* Bail out if the connection is closed; no need for more recv WRs */
	if ((chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSING) ||
	    (chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSED)) {
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* get the QP handle from the iser_chan */
	iser_qp = &chan->ic_qp;

	/* get the local gid from the path info */
	lgid = chan->ic_ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;

	/* get the hca port from the path info */
	hca = iser_ib_gid2hca(lgid);
	if (hca == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to retrieve "
		    "HCA handle");
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* check for space to post on the RQ */
	mutex_enter(&iser_qp->qp_lock);
	rq_space = iser_qp->rq_depth - iser_qp->rq_level;
	if (rq_space == 0) {
		/* The RQ is full, clear the pending flag and return */
		iser_qp->rq_taskqpending = B_FALSE;
		mutex_exit(&iser_qp->qp_lock);
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* Keep track of the lowest value for rq_min_post_level */
	if (iser_qp->rq_level < iser_qp->rq_min_post_level)
		iser_qp->rq_min_post_level = iser_qp->rq_level;

	mutex_exit(&iser_qp->qp_lock);

	/* we've room to post, so pull from the msg cache */
	msg = iser_msg_get(hca, rq_space, &msg_ret);
	if (msg == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: no message handles "
		    "available in msg cache currently");
		/*
		 * There are no messages on the cache. Wait a half-
		 * second, then try again.
		 */
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		status = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv,
		    (void *)chanhdl, DDI_NOSLEEP);
		if (status != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
			    "redispatch routine");
			/* Failed to dispatch, clear pending flag */
			mutex_enter(&iser_qp->qp_lock);
			iser_qp->rq_taskqpending = B_FALSE;
			mutex_exit(&iser_qp->qp_lock);
		}
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	if (msg_ret != rq_space) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: requested number of "
		    "messages not allocated: requested (%d) allocated (%d)",
		    rq_space, msg_ret);
		/* We got some, but not all, of our requested depth */
		rq_space = msg_ret;
	}

	/*
	 * Now, walk through the allocated WRs and post them,
	 * ISER_IB_RQ_POST_MAX (or less) at a time.
	 */
	wrlist = &wr[0];
	total_num = rq_space;

	while (total_num) {
		/* determine the number to post on this iteration */
		npost = (total_num > ISER_IB_RQ_POST_MAX) ?
		    ISER_IB_RQ_POST_MAX : total_num;

		/* build a list of WRs from the msg list */
		for (i = 0; i < npost; i++) {
			wrlist[i].wr_id		= (ibt_wrid_t)(uintptr_t)msg;
			wrlist[i].wr_nds	= ISER_IB_SGLIST_SIZE;
			wrlist[i].wr_sgl	= &msg->msg_ds;
			msg = msg->nextp;
		}

		/* post the list to the RQ */
		nposted = 0;
		status = ibt_post_recv(chanhdl, wrlist, npost, &nposted);
		if ((status != IBT_SUCCESS) || (nposted != npost)) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: ibt_post_recv "
			    "failed: requested (%d) posted (%d) status (%d)",
			    npost, nposted, status);
			total_num -= nposted;
			break;
		}

		/* decrement total number to post by the number posted */
		total_num -= nposted;
	}

	/* Credit rq_level with only the WRs actually posted */
	mutex_enter(&iser_qp->qp_lock);
	if (total_num != 0) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to fill RQ, "
		    "failed to post (%d) WRs", total_num);
		iser_qp->rq_level += rq_space - total_num;
	} else {
		iser_qp->rq_level += rq_space;
	}

	/*
	 * Now that we've filled the RQ, check that all of the recv WRs
	 * haven't just been immediately consumed. If so, taskqpending is
	 * still B_TRUE, so we need to fire off a taskq thread to post
	 * more WRs.
	 */
	if (iser_qp->rq_level == 0) {
		mutex_exit(&iser_qp->qp_lock);
		status = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv,
		    (void *)chanhdl, DDI_NOSLEEP);
		if (status != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
			    "dispatch followup routine");
			/* Failed to dispatch, clear pending flag */
			mutex_enter(&iser_qp->qp_lock);
			iser_qp->rq_taskqpending = B_FALSE;
			mutex_exit(&iser_qp->qp_lock);
		}
	} else {
		/*
		 * We're done, we've filled the RQ. Clear the taskq
		 * flag so that we can run again.
		 */
		iser_qp->rq_taskqpending = B_FALSE;
		mutex_exit(&iser_qp->qp_lock);
	}

	mutex_exit(&chan->ic_conn->ic_lock);
}
808 
809 /*
810  * iser_ib_handle_portup_event()
811  * This handles the IBT_EVENT_PORT_UP unaffiliated asynchronous event.
812  *
813  * To facilitate a seamless bringover of the port and configure the CM service
814  * for inbound iSER service requests on this newly active port, the existing
815  * IDM services will be checked for iSER support.
816  * If an iSER service was already created, then this service will simply be
817  * bound to the gid of the newly active port. If on the other hand, the CM
818  * service did not exist, i.e. only socket communication, then a new CM
819  * service will be first registered with the saved service parameters and
820  * then bound to the newly active port.
821  *
822  */
823 /* ARGSUSED */
824 static void
825 iser_ib_handle_portup_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
826 {
827 	iser_hca_t		*hca;
828 	ib_gid_t		gid;
829 	idm_svc_t		*idm_svc;
830 	int			status;
831 
832 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: HCA(0x%llx) port(%d)",
833 	    (longlong_t)event->ev_hca_guid, event->ev_port);
834 
835 	/*
836 	 * Query all ports on the HCA and update the port information
837 	 * maintainted in the iser_hca_t structure
838 	 */
839 	hca = iser_ib_guid2hca(event->ev_hca_guid);
840 	if (hca == NULL) {
841 
842 		/* HCA is just made available, first port on that HCA */
843 		hca = iser_ib_alloc_hca(event->ev_hca_guid);
844 
845 		mutex_enter(&iser_state->is_hcalist_lock);
846 		list_insert_tail(&iser_state->is_hcalist, hca);
847 		iser_state->is_num_hcas++;
848 		mutex_exit(&iser_state->is_hcalist_lock);
849 
850 	} else {
851 
852 		status = iser_ib_update_hcaports(hca);
853 
854 		if (status != IBT_SUCCESS) {
855 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
856 			    "status(0x%x): iser_ib_update_hcaports failed: "
857 			    "HCA(0x%llx) port(%d)", status,
858 			    (longlong_t)event->ev_hca_guid, event->ev_port);
859 			return;
860 		}
861 	}
862 
863 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
864 
865 	/*
866 	 * Iterate through the global list of IDM target services
867 	 * and check for existing iSER CM service.
868 	 */
869 	mutex_enter(&idm.idm_global_mutex);
870 	for (idm_svc = list_head(&idm.idm_tgt_svc_list);
871 	    idm_svc != NULL;
872 	    idm_svc = list_next(&idm.idm_tgt_svc_list, idm_svc)) {
873 
874 
875 		if (idm_svc->is_iser_svc == NULL) {
876 
877 			/* Establish a new CM service for iSER requests */
878 			status = iser_tgt_svc_create(
879 			    &idm_svc->is_svc_req, idm_svc);
880 
881 			if (status != IBT_SUCCESS) {
882 				ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
883 				    "status(0x%x): iser_tgt_svc_create failed: "
884 				    "HCA(0x%llx) port(%d)", status,
885 				    (longlong_t)event->ev_hca_guid,
886 				    event->ev_port);
887 
888 				continue;
889 			}
890 		}
891 
892 		status = iser_ib_activate_port(
893 		    idm_svc, event->ev_hca_guid, gid);
894 		if (status != IBT_SUCCESS) {
895 
896 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
897 			    "status(0x%x): Bind service on port "
898 			    "(%llx:%llx) failed",
899 			    status, (longlong_t)gid.gid_prefix,
900 			    (longlong_t)gid.gid_guid);
901 
902 			continue;
903 		}
904 		ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: service bound "
905 		    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
906 		    event->ev_port);
907 	}
908 	mutex_exit(&idm.idm_global_mutex);
909 
910 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event success: "
911 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
912 	    event->ev_port);
913 }
914 
915 /*
916  * iser_ib_handle_portdown_event()
917  * This handles the IBT_EVENT_PORT_DOWN unaffiliated asynchronous error.
918  *
919  * Unconfigure the CM service on the deactivated port and teardown the
920  * connections that are using the CM service.
921  */
922 /* ARGSUSED */
923 static void
924 iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
925 {
926 	iser_hca_t		*hca;
927 	ib_gid_t		gid;
928 	int			status;
929 
930 	/*
931 	 * Query all ports on the HCA and update the port information
932 	 * maintainted in the iser_hca_t structure
933 	 */
934 	hca = iser_ib_guid2hca(event->ev_hca_guid);
935 	ASSERT(hca != NULL);
936 
937 	status = iser_ib_update_hcaports(hca);
938 	if (status != IBT_SUCCESS) {
939 		ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event status(0x%x): "
940 		    "ibt_ib_update_hcaports failed: HCA(0x%llx) port(%d)",
941 		    status, (longlong_t)event->ev_hca_guid, event->ev_port);
942 		return;
943 	}
944 
945 	/* get the gid of the new port */
946 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
947 	iser_ib_deactivate_port(event->ev_hca_guid, gid);
948 
949 	ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event success: "
950 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
951 	    event->ev_port);
952 }
953 
954 /*
955  * iser_ib_handle_hca_detach_event()
956  * Quiesce all activity bound for the port, teardown the connection, unbind
957  * iSER services on all ports and release the HCA handle.
958  */
959 /* ARGSUSED */
960 static void
961 iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
962 {
963 	iser_hca_t	*nexthca, *hca;
964 	int		i, status;
965 
966 	ISER_LOG(CE_NOTE, "iser_ib_handle_hca_detach_event: HCA(0x%llx)",
967 	    (longlong_t)event->ev_hca_guid);
968 
969 	hca = iser_ib_guid2hca(event->ev_hca_guid);
970 	for (i = 0; i < hca->hca_num_ports; i++) {
971 		iser_ib_deactivate_port(hca->hca_guid,
972 		    hca->hca_port_info[i].p_sgid_tbl[0]);
973 	}
974 
975 	/*
976 	 * Update the HCA list maintained in the iser_state. Free the
977 	 * resources allocated to the HCA, i.e. caches, protection domain
978 	 */
979 	mutex_enter(&iser_state->is_hcalist_lock);
980 
981 	for (hca = list_head(&iser_state->is_hcalist);
982 	    hca != NULL;
983 	    hca = nexthca) {
984 
985 		nexthca = list_next(&iser_state->is_hcalist, hca);
986 
987 		if (hca->hca_guid == event->ev_hca_guid) {
988 
989 			list_remove(&iser_state->is_hcalist, hca);
990 			iser_state->is_num_hcas--;
991 
992 			status = iser_ib_free_hca(hca);
993 			if (status != DDI_SUCCESS) {
994 				ISER_LOG(CE_WARN, "iser_ib_handle_hca_detach: "
995 				    "Failed to free hca(%p)", (void *)hca);
996 				list_insert_tail(&iser_state->is_hcalist, hca);
997 				iser_state->is_num_hcas++;
998 			}
999 			/* No way to return status to IBT if this fails */
1000 		}
1001 	}
1002 	mutex_exit(&iser_state->is_hcalist_lock);
1003 
1004 }
1005 
1006 /*
1007  * iser_ib_async_handler
1008  * An IBT Asynchronous Event handler is registered it with the framework and
1009  * passed via the ibt_attach() routine. This function handles the following
1010  * asynchronous events.
1011  * IBT_EVENT_PORT_UP
1012  * IBT_ERROR_PORT_DOWN
1013  * IBT_HCA_ATTACH_EVENT
1014  * IBT_HCA_DETACH_EVENT
1015  */
1016 /* ARGSUSED */
1017 void
1018 iser_ib_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1019     ibt_async_event_t *event)
1020 {
1021 	switch (code) {
1022 	case IBT_EVENT_PORT_UP:
1023 		iser_ib_handle_portup_event(hdl, event);
1024 		break;
1025 
1026 	case IBT_ERROR_PORT_DOWN:
1027 		iser_ib_handle_portdown_event(hdl, event);
1028 		break;
1029 
1030 	case IBT_HCA_ATTACH_EVENT:
1031 		/*
1032 		 * A new HCA device is available for use, ignore this
1033 		 * event because the corresponding IBT_EVENT_PORT_UP
1034 		 * events will get triggered and handled accordingly.
1035 		 */
1036 		break;
1037 
1038 	case IBT_HCA_DETACH_EVENT:
1039 		iser_ib_handle_hca_detach_event(hdl, event);
1040 		break;
1041 
1042 	default:
1043 		break;
1044 	}
1045 }
1046 
1047 /*
1048  * iser_ib_init_hcas
1049  *
1050  * This function opens all the HCA devices, gathers the HCA state information
1051  * and adds the HCA handle for each HCA found in the iser_soft_state.
1052  */
1053 static int
1054 iser_ib_init_hcas(void)
1055 {
1056 	ib_guid_t	*guid;
1057 	int		num_hcas;
1058 	int		i;
1059 	iser_hca_t	*hca;
1060 
1061 	/* Retrieve the HCA list */
1062 	num_hcas = ibt_get_hca_list(&guid);
1063 	if (num_hcas == 0) {
1064 		/*
1065 		 * This shouldn't happen, but might if we have all HCAs
1066 		 * detach prior to initialization.
1067 		 */
1068 		return (DDI_FAILURE);
1069 	}
1070 
1071 	/* Initialize the hcalist lock */
1072 	mutex_init(&iser_state->is_hcalist_lock, NULL, MUTEX_DRIVER, NULL);
1073 
1074 	/* Create the HCA list */
1075 	list_create(&iser_state->is_hcalist, sizeof (iser_hca_t),
1076 	    offsetof(iser_hca_t, hca_node));
1077 
1078 	for (i = 0; i < num_hcas; i++) {
1079 
1080 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: initializing HCA "
1081 		    "(0x%llx)", (longlong_t)guid[i]);
1082 
1083 		hca = iser_ib_alloc_hca(guid[i]);
1084 		if (hca == NULL) {
1085 			/* This shouldn't happen, teardown and fail */
1086 			(void) iser_ib_fini_hcas();
1087 			(void) ibt_free_hca_list(guid, num_hcas);
1088 			return (DDI_FAILURE);
1089 		}
1090 
1091 		mutex_enter(&iser_state->is_hcalist_lock);
1092 		list_insert_tail(&iser_state->is_hcalist, hca);
1093 		iser_state->is_num_hcas++;
1094 		mutex_exit(&iser_state->is_hcalist_lock);
1095 
1096 	}
1097 
1098 	/* Free the IBT HCA list */
1099 	(void) ibt_free_hca_list(guid, num_hcas);
1100 
1101 	/* Check that we've initialized at least one HCA */
1102 	mutex_enter(&iser_state->is_hcalist_lock);
1103 	if (list_is_empty(&iser_state->is_hcalist)) {
1104 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: failed to initialize "
1105 		    "any HCAs");
1106 
1107 		mutex_exit(&iser_state->is_hcalist_lock);
1108 		(void) iser_ib_fini_hcas();
1109 		return (DDI_FAILURE);
1110 	}
1111 	mutex_exit(&iser_state->is_hcalist_lock);
1112 
1113 	return (DDI_SUCCESS);
1114 }
1115 
1116 /*
1117  * iser_ib_fini_hcas
1118  *
1119  * Teardown the iSER HCA list initialized above.
1120  */
1121 static int
1122 iser_ib_fini_hcas(void)
1123 {
1124 	iser_hca_t	*nexthca, *hca;
1125 	int		status;
1126 
1127 	mutex_enter(&iser_state->is_hcalist_lock);
1128 	for (hca = list_head(&iser_state->is_hcalist);
1129 	    hca != NULL;
1130 	    hca = nexthca) {
1131 
1132 		nexthca = list_next(&iser_state->is_hcalist, hca);
1133 
1134 		list_remove(&iser_state->is_hcalist, hca);
1135 
1136 		status = iser_ib_free_hca(hca);
1137 		if (status != IBT_SUCCESS) {
1138 			ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to free "
1139 			    "HCA during fini");
1140 			list_insert_tail(&iser_state->is_hcalist, hca);
1141 			return (DDI_FAILURE);
1142 		}
1143 
1144 		iser_state->is_num_hcas--;
1145 
1146 	}
1147 	mutex_exit(&iser_state->is_hcalist_lock);
1148 	list_destroy(&iser_state->is_hcalist);
1149 	mutex_destroy(&iser_state->is_hcalist_lock);
1150 
1151 	return (DDI_SUCCESS);
1152 }
1153 
1154 /*
1155  * iser_ib_alloc_hca
1156  *
1157  * This function opens the given HCA device, gathers the HCA state information
1158  * and adds the HCA handle
1159  */
1160 static iser_hca_t *
1161 iser_ib_alloc_hca(ib_guid_t guid)
1162 {
1163 	iser_hca_t	*hca;
1164 	int		status;
1165 
1166 	/* Allocate an iser_hca_t HCA handle */
1167 	hca = (iser_hca_t *)kmem_zalloc(sizeof (iser_hca_t), KM_SLEEP);
1168 
1169 	/* Open this HCA */
1170 	status = ibt_open_hca(iser_state->is_ibhdl, guid, &hca->hca_hdl);
1171 	if (status != IBT_SUCCESS) {
1172 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_open_hca failed:"
1173 		    " guid (0x%llx) status (0x%x)", (longlong_t)guid, status);
1174 		kmem_free(hca, sizeof (iser_hca_t));
1175 		return (NULL);
1176 	}
1177 
1178 	hca->hca_guid		= guid;
1179 	hca->hca_clnt_hdl	= iser_state->is_ibhdl;
1180 
1181 	/* Query the HCA */
1182 	status = ibt_query_hca(hca->hca_hdl, &hca->hca_attr);
1183 	if (status != IBT_SUCCESS) {
1184 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_query_hca "
1185 		    "failure: guid (0x%llx) status (0x%x)",
1186 		    (longlong_t)guid, status);
1187 		(void) ibt_close_hca(hca->hca_hdl);
1188 		kmem_free(hca, sizeof (iser_hca_t));
1189 		return (NULL);
1190 	}
1191 
1192 	/* Query all ports on the HCA */
1193 	status = ibt_query_hca_ports(hca->hca_hdl, 0,
1194 	    &hca->hca_port_info, &hca->hca_num_ports,
1195 	    &hca->hca_port_info_sz);
1196 	if (status != IBT_SUCCESS) {
1197 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: "
1198 		    "ibt_query_hca_ports failure: guid (0x%llx) "
1199 		    "status (0x%x)", (longlong_t)guid, status);
1200 		(void) ibt_close_hca(hca->hca_hdl);
1201 		kmem_free(hca, sizeof (iser_hca_t));
1202 		return (NULL);
1203 	}
1204 
1205 	/* Allocate a single PD on this HCA */
1206 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS,
1207 	    &hca->hca_pdhdl);
1208 	if (status != IBT_SUCCESS) {
1209 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_alloc_pd "
1210 		    "failure: guid (0x%llx) status (0x%x)",
1211 		    (longlong_t)guid, status);
1212 		(void) ibt_close_hca(hca->hca_hdl);
1213 		ibt_free_portinfo(hca->hca_port_info, hca->hca_port_info_sz);
1214 		kmem_free(hca, sizeof (iser_hca_t));
1215 		return (NULL);
1216 	}
1217 
1218 	/* Initialize the message and data MR caches for this HCA */
1219 	iser_init_hca_caches(hca);
1220 
1221 	return (hca);
1222 }
1223 
1224 static int
1225 iser_ib_free_hca(iser_hca_t *hca)
1226 {
1227 	int			status;
1228 	ibt_hca_portinfo_t	*hca_port_info;
1229 	uint_t			hca_port_info_sz;
1230 
1231 	ASSERT(hca != NULL);
1232 	if (hca->hca_failed)
1233 		return (DDI_FAILURE);
1234 
1235 	hca_port_info = hca->hca_port_info;
1236 	hca_port_info_sz = hca->hca_port_info_sz;
1237 
1238 	/*
1239 	 * Free the memory regions before freeing
1240 	 * the associated protection domain
1241 	 */
1242 	iser_fini_hca_caches(hca);
1243 
1244 	status = ibt_free_pd(hca->hca_hdl, hca->hca_pdhdl);
1245 	if (status != IBT_SUCCESS) {
1246 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: failed to free PD "
1247 		    "status=0x%x", status);
1248 		goto out_caches;
1249 	}
1250 
1251 	status = ibt_close_hca(hca->hca_hdl);
1252 	if (status != IBT_SUCCESS) {
1253 		ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to close HCA "
1254 		    "status=0x%x", status);
1255 		goto out_pd;
1256 	}
1257 
1258 	ibt_free_portinfo(hca_port_info, hca_port_info_sz);
1259 
1260 	kmem_free(hca, sizeof (iser_hca_t));
1261 	return (DDI_SUCCESS);
1262 
1263 	/*
1264 	 * We only managed to partially tear down the HCA, try to put it back
1265 	 * like it was before returning.
1266 	 */
1267 out_pd:
1268 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS, &hca->hca_pdhdl);
1269 	if (status != IBT_SUCCESS) {
1270 		hca->hca_failed = B_TRUE;
1271 		/* Report error and exit */
1272 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: could not re-alloc PD "
1273 		    "status=0x%x", status);
1274 		return (DDI_FAILURE);
1275 	}
1276 
1277 out_caches:
1278 	iser_init_hca_caches(hca);
1279 
1280 	return (DDI_FAILURE);
1281 }
1282 
1283 static int
1284 iser_ib_update_hcaports(iser_hca_t *hca)
1285 {
1286 	ibt_hca_portinfo_t	*pinfop, *oldpinfop;
1287 	uint_t			size, oldsize, nport;
1288 	int			status;
1289 
1290 	ASSERT(hca != NULL);
1291 
1292 	status = ibt_query_hca_ports(hca->hca_hdl, 0, &pinfop, &nport, &size);
1293 	if (status != IBT_SUCCESS) {
1294 		ISER_LOG(CE_NOTE, "ibt_query_hca_ports failed: %d", status);
1295 		return (status);
1296 	}
1297 
1298 	oldpinfop = hca->hca_port_info;
1299 	oldsize	= hca->hca_port_info_sz;
1300 	hca->hca_port_info = pinfop;
1301 	hca->hca_port_info_sz = size;
1302 
1303 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1304 
1305 	return (IBT_SUCCESS);
1306 }
1307 
1308 /*
1309  * iser_ib_gid2hca
1310  * Given a gid, find the corresponding hca
1311  */
1312 iser_hca_t *
1313 iser_ib_gid2hca(ib_gid_t gid)
1314 {
1315 
1316 	iser_hca_t	*hca;
1317 	int		i;
1318 
1319 	mutex_enter(&iser_state->is_hcalist_lock);
1320 	for (hca = list_head(&iser_state->is_hcalist);
1321 	    hca != NULL;
1322 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1323 
1324 		for (i = 0; i < hca->hca_num_ports; i++) {
1325 			if ((hca->hca_port_info[i].p_sgid_tbl[0].gid_prefix ==
1326 			    gid.gid_prefix) &&
1327 			    (hca->hca_port_info[i].p_sgid_tbl[0].gid_guid ==
1328 			    gid.gid_guid)) {
1329 
1330 				mutex_exit(&iser_state->is_hcalist_lock);
1331 
1332 				return (hca);
1333 			}
1334 		}
1335 	}
1336 	mutex_exit(&iser_state->is_hcalist_lock);
1337 	return (NULL);
1338 }
1339 
1340 /*
1341  * iser_ib_guid2hca
1342  * Given a HCA guid, find the corresponding HCA
1343  */
1344 iser_hca_t *
1345 iser_ib_guid2hca(ib_guid_t guid)
1346 {
1347 
1348 	iser_hca_t	*hca;
1349 
1350 	mutex_enter(&iser_state->is_hcalist_lock);
1351 	for (hca = list_head(&iser_state->is_hcalist);
1352 	    hca != NULL;
1353 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1354 
1355 		if (hca->hca_guid == guid) {
1356 			mutex_exit(&iser_state->is_hcalist_lock);
1357 			return (hca);
1358 		}
1359 	}
1360 	mutex_exit(&iser_state->is_hcalist_lock);
1361 	return (NULL);
1362 }
1363 
1364 /*
1365  * iser_ib_conv_sockaddr2ibtaddr
1366  * This function converts a socket address into the IBT format
1367  */
1368 void iser_ib_conv_sockaddr2ibtaddr(
1369     idm_sockaddr_t *saddr, ibt_ip_addr_t *ibt_addr)
1370 {
1371 	if (saddr == NULL) {
1372 		ibt_addr->family = AF_UNSPEC;
1373 		ibt_addr->un.ip4addr = 0;
1374 	} else {
1375 		switch (saddr->sin.sa_family) {
1376 		case AF_INET:
1377 
1378 			ibt_addr->family	= saddr->sin4.sin_family;
1379 			ibt_addr->un.ip4addr	= saddr->sin4.sin_addr.s_addr;
1380 			break;
1381 
1382 		case AF_INET6:
1383 
1384 			ibt_addr->family	= saddr->sin6.sin6_family;
1385 			ibt_addr->un.ip6addr	= saddr->sin6.sin6_addr;
1386 			break;
1387 
1388 		default:
1389 			ibt_addr->family = AF_UNSPEC;
1390 		}
1391 
1392 	}
1393 }
1394 
1395 /*
1396  * iser_ib_conv_ibtaddr2sockaddr
1397  * This function converts an IBT ip address handle to a sockaddr
1398  */
1399 void iser_ib_conv_ibtaddr2sockaddr(struct sockaddr_storage *ss,
1400     ibt_ip_addr_t *ibt_addr, in_port_t port)
1401 {
1402 	struct sockaddr_in *sin;
1403 	struct sockaddr_in6 *sin6;
1404 
1405 	switch (ibt_addr->family) {
1406 	case AF_INET:
1407 	case AF_UNSPEC:
1408 
1409 		sin = (struct sockaddr_in *)ibt_addr;
1410 		sin->sin_port = ntohs(port);
1411 		bcopy(sin, ss, sizeof (struct sockaddr_in));
1412 		break;
1413 
1414 	case AF_INET6:
1415 
1416 		sin6 = (struct sockaddr_in6 *)ibt_addr;
1417 		sin6->sin6_port = ntohs(port);
1418 		bcopy(sin6, ss, sizeof (struct sockaddr_in6));
1419 		break;
1420 
1421 	default:
1422 		ISER_LOG(CE_NOTE, "iser_ib_conv_ibtaddr2sockaddr: "
1423 		    "unknown family type: 0x%x", ibt_addr->family);
1424 	}
1425 }
1426 
1427 /*
1428  * iser_ib_setup_cq
1429  * This function sets up the Completion Queue size and allocates the specified
1430  * Completion Queue
1431  */
1432 static int
1433 iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size, ibt_cq_hdl_t *cq_hdl)
1434 {
1435 
1436 	ibt_cq_attr_t		cq_attr;
1437 	int			status;
1438 
1439 	cq_attr.cq_size		= cq_size;
1440 	cq_attr.cq_sched	= 0;
1441 	cq_attr.cq_flags	= IBT_CQ_NO_FLAGS;
1442 
1443 	/* Allocate a Completion Queue */
1444 	status = ibt_alloc_cq(hca_hdl, &cq_attr, cq_hdl, NULL);
1445 	if (status != IBT_SUCCESS) {
1446 		ISER_LOG(CE_NOTE, "iser_ib_setup_cq: ibt_alloc_cq failure (%d)",
1447 		    status);
1448 		return (status);
1449 	}
1450 
1451 	return (ISER_STATUS_SUCCESS);
1452 }
1453 
1454 /*
1455  * iser_ib_setup_chanargs
1456  *
1457  */
1458 static void
1459 iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
1460     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
1461     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs)
1462 {
1463 
1464 	bzero(cargs, sizeof (ibt_rc_chan_alloc_args_t));
1465 
1466 	/*
1467 	 * Set up the size of the channels send queue, receive queue and the
1468 	 * maximum number of elements in a scatter gather list of work requests
1469 	 * posted to the send and receive queues.
1470 	 */
1471 	cargs->rc_sizes.cs_sq		= sq_size;
1472 	cargs->rc_sizes.cs_rq		= rq_size;
1473 	cargs->rc_sizes.cs_sq_sgl	= ISER_IB_SGLIST_SIZE;
1474 	cargs->rc_sizes.cs_rq_sgl	= ISER_IB_SGLIST_SIZE;
1475 
1476 	/*
1477 	 * All Work requests signaled on a WR basis will receive a send
1478 	 * request completion.
1479 	 */
1480 	cargs->rc_flags			= IBT_ALL_SIGNALED;
1481 
1482 	/* Enable RDMA read and RDMA write on the channel end points */
1483 	cargs->rc_control		= IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1484 
1485 	/* Set the local hca port on which the channel is allocated */
1486 	cargs->rc_hca_port_num		= hca_port;
1487 
1488 	/* Set the Send and Receive Completion Queue handles */
1489 	cargs->rc_scq			= scq_hdl;
1490 	cargs->rc_rcq			= rcq_hdl;
1491 
1492 	/* Set the protection domain associated with the channel */
1493 	cargs->rc_pd			= hca_pdhdl;
1494 
1495 	/* No SRQ usage */
1496 	cargs->rc_srq			= NULL;
1497 }
1498 
1499 /*
1500  * iser_ib_init_qp
1501  * Initialize the QP handle
1502  */
1503 void
1504 iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size)
1505 {
1506 	/* Initialize the handle lock */
1507 	mutex_init(&chan->ic_qp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1508 
1509 	/* Record queue sizes */
1510 	chan->ic_qp.sq_size = sq_size;
1511 	chan->ic_qp.rq_size = rq_size;
1512 
1513 	/* Initialize the RQ monitoring data */
1514 	chan->ic_qp.rq_depth  = rq_size;
1515 	chan->ic_qp.rq_level  = 0;
1516 	chan->ic_qp.rq_lwm = (chan->ic_recvcq_sz * ISER_IB_RQ_LWM_PCT) / 100;
1517 
1518 	/* Initialize the taskq flag */
1519 	chan->ic_qp.rq_taskqpending = B_FALSE;
1520 }
1521 
1522 /*
1523  * iser_ib_fini_qp
1524  * Teardown the QP handle
1525  */
1526 void
1527 iser_ib_fini_qp(iser_qp_t *qp)
1528 {
1529 	/* Destroy the handle lock */
1530 	mutex_destroy(&qp->qp_lock);
1531 }
1532 
1533 static int
1534 iser_ib_activate_port(idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid)
1535 {
1536 	iser_svc_t	*iser_svc;
1537 	iser_sbind_t	*is_sbind;
1538 	int		status;
1539 
1540 	iser_svc = idm_svc->is_iser_svc;
1541 
1542 	/*
1543 	 * Save the address of the service bind handle in the
1544 	 * iser_svc_t to undo the service binding at a later time
1545 	 */
1546 	is_sbind = kmem_zalloc(sizeof (iser_sbind_t), KM_SLEEP);
1547 	is_sbind->is_gid	= gid;
1548 	is_sbind->is_guid	= guid;
1549 
1550 	status  = ibt_bind_service(iser_svc->is_srvhdl, gid, NULL,
1551 	    idm_svc, &is_sbind->is_sbindhdl);
1552 
1553 	if (status != IBT_SUCCESS) {
1554 		ISER_LOG(CE_NOTE, "iser_ib_activate_port: status(0x%x): "
1555 		    "Bind service(%llx) on port(%llx:%llx) failed",
1556 		    status, (longlong_t)iser_svc->is_svcid,
1557 		    (longlong_t)gid.gid_prefix, (longlong_t)gid.gid_guid);
1558 
1559 		kmem_free(is_sbind, sizeof (iser_sbind_t));
1560 
1561 		return (status);
1562 	}
1563 
1564 	list_insert_tail(&iser_svc->is_sbindlist, is_sbind);
1565 
1566 	return (IBT_SUCCESS);
1567 }
1568 
/*
 * iser_ib_deactivate_port
 * Fail all iSER target connections running over the given HCA and tear
 * down any iSER service still bound to the given port.
 */
static void
iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid)
{
	iser_svc_t	*iser_svc;
	iser_conn_t	*iser_conn;
	iser_sbind_t	*is_sbind;
	idm_conn_t	*idm_conn;

	/*
	 * Iterate through the global list of IDM target connections.
	 * Issue a TRANSPORT_FAIL for any connections on this port, and
	 * if there is a bound service running on the port, tear it down.
	 * NOTE(review): idm_conn_event() is invoked while holding
	 * idm_global_mutex -- confirm the event path tolerates this.
	 */
	mutex_enter(&idm.idm_global_mutex);
	for (idm_conn = list_head(&idm.idm_tgt_conn_list);
	    idm_conn != NULL;
	    idm_conn = list_next(&idm.idm_tgt_conn_list, idm_conn)) {

		if (idm_conn->ic_transport_type != IDM_TRANSPORT_TYPE_ISER) {
			/* this is not an iSER connection, skip it */
			continue;
		}

		/*
		 * Match on HCA GUID only; any port on the detaching or
		 * downed HCA is affected.  NOTE(review): the GID is not
		 * compared here, only in the service-bind lookup below.
		 */
		iser_conn = idm_conn->ic_transport_private;
		if (iser_conn->ic_chan->ic_ibt_path.pi_hca_guid != hca_guid) {
			/* this iSER connection is on a different port */
			continue;
		}

		/* Fail the transport for this connection */
		idm_conn_event(idm_conn, CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);

		if (idm_conn->ic_conn_type == CONN_TYPE_INI) {
			/* initiator connection, nothing else to do */
			continue;
		}

		/* Check for a service binding */
		iser_svc = idm_conn->ic_svc_binding->is_iser_svc;
		is_sbind = iser_ib_get_bind(iser_svc, hca_guid, gid);
		if (is_sbind != NULL) {
			/* This service is still bound, tear it down */
			ibt_unbind_service(iser_svc->is_srvhdl,
			    is_sbind->is_sbindhdl);
			list_remove(&iser_svc->is_sbindlist, is_sbind);
			kmem_free(is_sbind, sizeof (iser_sbind_t));
		}
	}
	mutex_exit(&idm.idm_global_mutex);
}
1619 
1620 static iser_sbind_t *
1621 iser_ib_get_bind(iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid)
1622 {
1623 	iser_sbind_t	*is_sbind;
1624 
1625 	for (is_sbind = list_head(&iser_svc->is_sbindlist);
1626 	    is_sbind != NULL;
1627 	    is_sbind = list_next(&iser_svc->is_sbindlist, is_sbind)) {
1628 
1629 		if ((is_sbind->is_guid == hca_guid) &&
1630 		    (is_sbind->is_gid.gid_prefix == gid.gid_prefix) &&
1631 		    (is_sbind->is_gid.gid_guid == gid.gid_guid)) {
1632 			return (is_sbind);
1633 		}
1634 	}
1635 	return (NULL);
1636 }
1637