xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c (revision d99cb22f7f0de8584336bda08cb86c562ffbab55)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/types.h>
78 #include <sys/ddi.h>
79 #include <sys/sunddi.h>
80 #include <sys/ib/clients/rds/rdsib_cm.h>
81 #include <sys/ib/clients/rds/rdsib_ib.h>
82 #include <sys/ib/clients/rds/rdsib_buf.h>
83 #include <sys/ib/clients/rds/rdsib_ep.h>
84 #include <sys/ib/clients/rds/rds_kstat.h>
85 
86 static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
87     ibt_async_code_t code, ibt_async_event_t *event);
88 
89 static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
90 	IBTI_V2,
91 	IBT_NETWORK,
92 	rds_async_handler,
93 	NULL,
94 	"RDS"
95 };
96 
97 /* performance tunables */
98 uint_t		rds_no_interrupts = 0;
99 uint_t		rds_poll_percent_full = 25;
100 uint_t		rds_wc_signal = IBT_NEXT_SOLICITED;
101 uint_t		rds_waittime_ms = 100; /* ms */
102 
103 extern dev_info_t *rdsib_dev_info;
104 extern void rds_close_sessions();
105 
106 static void
107 rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
108 {
109 	/* The SQ size should not be more than that supported by the HCA */
110 	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
111 	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
112 		RDS_DPRINTF0("RDSIB", "MaxDataSendBuffers + %d is greater "
113 		    "than that supported by the HCA driver "
114 		    "(%d + %d > %d or %d), lowering it to a supported value.",
115 		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
116 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
117 
118 		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
119 		    hattrp->hca_max_cq_sz) ?
120 		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
121 		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
122 	}
123 
124 	/* The RQ size should not be more than that supported by the HCA */
125 	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
126 	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
127 		RDS_DPRINTF0("RDSIB", "MaxDataRecvBuffers is greater than that "
128 		    "supported by the HCA driver (%d > %d or %d), lowering it "
129 		    "to a supported value.", MaxDataRecvBuffers,
130 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
131 
132 		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
133 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
134 		    hattrp->hca_max_chan_sz;
135 	}
136 
137 	/* The SQ size should not be more than that supported by the HCA */
138 	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
139 	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
140 		RDS_DPRINTF0("RDSIB", "MaxCtrlSendBuffers is greater than that "
141 		    "supported by the HCA driver (%d > %d or %d), lowering it "
142 		    "to a supported value.", MaxCtrlSendBuffers,
143 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
144 
145 		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
146 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
147 		    hattrp->hca_max_chan_sz;
148 	}
149 
150 	/* The RQ size should not be more than that supported by the HCA */
151 	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
152 	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
153 		RDS_DPRINTF0("RDSIB", "MaxCtrlRecvBuffers is greater than that "
154 		    "supported by the HCA driver (%d > %d or %d), lowering it "
155 		    "to a supported value.", MaxCtrlRecvBuffers,
156 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
157 
158 		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
159 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
160 		    hattrp->hca_max_chan_sz;
161 	}
162 
163 	/* The MaxRecvMemory should be less than that supported by the HCA */
164 	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
165 		RDS_DPRINTF0("RDSIB", "MaxRecvMemory is greater than that "
166 		    "supported by the HCA driver (%d > %d), lowering it to %d",
167 		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
168 		    hattrp->hca_max_memr_len);
169 
170 		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
171 	}
172 }
173 
174 /*
175  * Called from attach
176  */
177 int
178 rdsib_initialize_ib()
179 {
180 	ib_guid_t	*guidp;
181 	rds_hca_t	*hcap, *hcap1;
182 	uint_t		ix, hcaix, nhcas;
183 	int		ret;
184 
185 	RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);
186 
187 	ASSERT(rdsib_statep != NULL);
188 	if (rdsib_statep == NULL) {
189 		RDS_DPRINTF1("rdsib_initialize_ib",
190 		    "RDS Statep not initialized");
191 		return (-1);
192 	}
193 
194 	/* How many hcas are there? */
195 	nhcas = ibt_get_hca_list(&guidp);
196 	if (nhcas == 0) {
197 		RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
198 		return (-1);
199 	}
200 
201 	RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);
202 
203 	/* Register with IBTF */
204 	ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
205 	    &rdsib_statep->rds_ibhdl);
206 	if (ret != IBT_SUCCESS) {
207 		RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
208 		    ret);
209 		(void) ibt_free_hca_list(guidp, nhcas);
210 		return (-1);
211 	}
212 
213 	/*
214 	 * Open each HCA and gather its information. Don't care about HCAs
215 	 * that cannot be opened. It is OK as long as atleast one HCA can be
216 	 * opened.
217 	 * Initialize a HCA only if all the information is available.
218 	 */
219 	hcap1 = NULL;
220 	for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
221 		RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);
222 
223 		hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
224 
225 		ret = ibt_open_hca(rdsib_statep->rds_ibhdl, guidp[ix],
226 		    &hcap->hca_hdl);
227 		if (ret != IBT_SUCCESS) {
228 			RDS_DPRINTF2("rdsib_initialize_ib",
229 			    "ibt_open_hca: 0x%llx failed: %d", guidp[ix], ret);
230 			kmem_free(hcap, sizeof (rds_hca_t));
231 			continue;
232 		}
233 
234 		hcap->hca_guid = guidp[ix];
235 
236 		ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
237 		if (ret != IBT_SUCCESS) {
238 			RDS_DPRINTF2("rdsib_initialize_ib",
239 			    "Query HCA: 0x%llx failed:  %d", guidp[ix], ret);
240 			ret = ibt_close_hca(hcap->hca_hdl);
241 			ASSERT(ret == IBT_SUCCESS);
242 			kmem_free(hcap, sizeof (rds_hca_t));
243 			continue;
244 		}
245 
246 		ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
247 		    &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
248 		if (ret != IBT_SUCCESS) {
249 			RDS_DPRINTF2("rdsib_initialize_ib",
250 			    "Query HCA 0x%llx ports failed: %d", guidp[ix],
251 			    ret);
252 			ret = ibt_close_hca(hcap->hca_hdl);
253 			ASSERT(ret == IBT_SUCCESS);
254 			kmem_free(hcap, sizeof (rds_hca_t));
255 			continue;
256 		}
257 
258 		/* Only one PD per HCA is allocated, so do it here */
259 		ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
260 		    &hcap->hca_pdhdl);
261 		if (ret != IBT_SUCCESS) {
262 			RDS_DPRINTF2("rdsib_initialize_ib",
263 			    "ibt_alloc_pd 0x%llx failed: %d", guidp[ix], ret);
264 			(void) ibt_free_portinfo(hcap->hca_pinfop,
265 			    hcap->hca_pinfo_sz);
266 			ret = ibt_close_hca(hcap->hca_hdl);
267 			ASSERT(ret == IBT_SUCCESS);
268 			kmem_free(hcap, sizeof (rds_hca_t));
269 			continue;
270 		}
271 
272 		rdsib_validate_chan_sizes(&hcap->hca_attr);
273 
274 		/* this HCA is fully initialized, go to the next one */
275 		hcaix++;
276 		hcap->hca_nextp = hcap1;
277 		hcap1 = hcap;
278 	}
279 
280 	/* free the HCA list, we are done with it */
281 	(void) ibt_free_hca_list(guidp, nhcas);
282 
283 	if (hcaix == 0) {
284 		/* Failed to Initialize even one HCA */
285 		RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
286 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
287 		rdsib_statep->rds_ibhdl = NULL;
288 		return (-1);
289 	}
290 
291 	if (hcaix < nhcas) {
292 		RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
293 		    (nhcas - hcaix), nhcas);
294 	}
295 
296 	rdsib_statep->rds_hcalistp = hcap1;
297 	rdsib_statep->rds_nhcas = hcaix;
298 
299 	RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);
300 
301 	return (0);
302 }
303 
304 /*
305  * Called from detach
306  */
307 void
308 rdsib_deinitialize_ib()
309 {
310 	rds_hca_t	*hcap, *nextp;
311 	int		ret;
312 
313 	RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);
314 
315 	/* close and destroy all the sessions */
316 	rds_close_sessions(NULL);
317 
318 	/* Release all HCA resources */
319 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
320 	hcap = rdsib_statep->rds_hcalistp;
321 	rdsib_statep->rds_hcalistp = NULL;
322 	rdsib_statep->rds_nhcas = 0;
323 	rw_exit(&rdsib_statep->rds_hca_lock);
324 
325 	while (hcap != NULL) {
326 		nextp = hcap->hca_nextp;
327 
328 		ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
329 		ASSERT(ret == IBT_SUCCESS);
330 
331 		(void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);
332 
333 		ret = ibt_close_hca(hcap->hca_hdl);
334 		ASSERT(ret == IBT_SUCCESS);
335 
336 		kmem_free(hcap, sizeof (rds_hca_t));
337 		hcap = nextp;
338 	}
339 
340 	/* Deregister with IBTF */
341 	if (rdsib_statep->rds_ibhdl != NULL) {
342 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
343 		rdsib_statep->rds_ibhdl = NULL;
344 	}
345 
346 	RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
347 	    rdsib_statep);
348 }
349 
350 /*
351  * Called on open of first RDS socket
352  */
353 int
354 rdsib_open_ib()
355 {
356 	int	ret;
357 
358 	RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);
359 
360 	/* Enable incoming connection requests */
361 	if (rdsib_statep->rds_srvhdl == NULL) {
362 		rdsib_statep->rds_srvhdl =
363 		    rds_register_service(rdsib_statep->rds_ibhdl);
364 		if (rdsib_statep->rds_srvhdl == NULL) {
365 			RDS_DPRINTF2("rdsib_open_ib",
366 			    "Service registration failed");
367 			return (-1);
368 		} else {
369 			/* bind the service on all available ports */
370 			ret = rds_bind_service(rdsib_statep);
371 			if (ret != 0) {
372 				RDS_DPRINTF2("rdsib_open_ib",
373 				    "Bind service failed: %d", ret);
374 			}
375 		}
376 	}
377 
378 	RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);
379 
380 	return (0);
381 }
382 
383 /*
384  * Called when all ports are closed.
385  */
386 void
387 rdsib_close_ib()
388 {
389 	int	ret;
390 
391 	RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);
392 
393 	/* Disable incoming connection requests */
394 	if (rdsib_statep->rds_srvhdl != NULL) {
395 		ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
396 		if (ret != 0) {
397 			RDS_DPRINTF2("rdsib_close_ib",
398 			    "ibt_unbind_all_services failed: %d\n", ret);
399 		}
400 		ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
401 		    rdsib_statep->rds_srvhdl);
402 		if (ret != 0) {
403 			RDS_DPRINTF2("rdsib_close_ib",
404 			    "ibt_deregister_service failed: %d\n", ret);
405 		} else {
406 			rdsib_statep->rds_srvhdl = NULL;
407 		}
408 
409 		ret = ibt_unbind_all_services(rdsib_statep->rds_old_srvhdl);
410 		if (ret != 0) {
411 			RDS_DPRINTF2("rdsib_close_ib",
412 			    "ibt_unbind_all_services failed for old service"
413 			    ": %d\n", ret);
414 		}
415 		ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
416 		    rdsib_statep->rds_old_srvhdl);
417 		if (ret != 0) {
418 			RDS_DPRINTF2("rdsib_close_ib",
419 			    "ibt_deregister_service failed for old service:"
420 			    "%d\n", ret);
421 		} else {
422 			rdsib_statep->rds_old_srvhdl = NULL;
423 		}
424 	}
425 
426 	RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
427 }
428 
429 /* Return hcap, given the hca guid */
430 rds_hca_t *
431 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
432 {
433 	rds_hca_t	*hcap;
434 
435 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
436 	    "guid: %llx", statep, hca_guid);
437 
438 	rw_enter(&statep->rds_hca_lock, RW_READER);
439 
440 	hcap = statep->rds_hcalistp;
441 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
442 		hcap = hcap->hca_nextp;
443 	}
444 
445 	rw_exit(&statep->rds_hca_lock);
446 
447 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
448 
449 	return (hcap);
450 }
451 
452 /* Return hcap, given a gid */
453 rds_hca_t *
454 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
455 {
456 	rds_hca_t	*hcap;
457 	uint_t		ix;
458 
459 	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
460 	    statep, gid.gid_prefix, gid.gid_guid);
461 
462 	rw_enter(&statep->rds_hca_lock, RW_READER);
463 
464 	hcap = statep->rds_hcalistp;
465 	while (hcap != NULL) {
466 		for (ix = 0; ix < hcap->hca_nports; ix++) {
467 			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
468 			    gid.gid_prefix) &&
469 			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
470 			    gid.gid_guid)) {
471 				RDS_DPRINTF4("rds_gid_to_hcap",
472 				    "gid found in hcap: 0x%p", hcap);
473 				rw_exit(&statep->rds_hca_lock);
474 				return (hcap);
475 			}
476 		}
477 		hcap = hcap->hca_nextp;
478 	}
479 
480 	rw_exit(&statep->rds_hca_lock);
481 
482 	return (NULL);
483 }
484 
/*
 * This is called from the send CQ handler when an ACK (posted as an
 * RDMA write) completes.  ep_ackds.ds_va holds the buffer id carried
 * by the last ACK sent; ep_rbufid is the id of the most recently
 * received message.  If they differ, messages arrived after the last
 * ACK went out, so a fresh ACK is posted; otherwise the outstanding
 * RDMA count is dropped.
 */
void
rds_send_acknowledgement(rds_ep_t *ep)
{
	int	ret;
	uint_t	ix;

	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);

	mutex_enter(&ep->ep_lock);

	/* an ACK completion implies at least one outstanding RDMA */
	ASSERT(ep->ep_rdmacnt != 0);

	/*
	 * The previous ACK completed successfully, send the next one
	 * if more messages were received after sending the last ACK
	 */
	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
		/* record the buffer id this new ACK covers */
		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
		mutex_exit(&ep->ep_lock);

		/* send acknowledgement */
		RDS_INCR_TXACKS();
		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF1("rds_send_acknowledgement",
			    "EP(%p): ibt_post_send for acknowledgement "
			    "failed: %d, SQ depth: %d",
			    ep, ret, ep->ep_sndpool.pool_nbusy);
			/* post failed: this ACK is no longer outstanding */
			mutex_enter(&ep->ep_lock);
			ep->ep_rdmacnt--;
			mutex_exit(&ep->ep_lock);
		}
	} else {
		/* ACKed all messages, no more to ACK */
		ep->ep_rdmacnt--;
		mutex_exit(&ep->ep_lock);
		return;
	}

	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
}
527 
/*
 * Poll one completion off a control channel's receive CQ.  On success
 * the control packet is passed to rds_handle_control_message() and the
 * buffer is returned to the pool.  When the RQ level drops to the low
 * water mark, a taskq job is dispatched to repost buffers.  Returns
 * the ibt_poll_cq() status; the caller loops until IBT_CQ_EMPTY.
 */
static int
rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t	wc;
	uint_t		npolled;
	rds_buf_t	*bp;
	rds_ctrl_pkt_t	*cpkt;
	rds_qp_t	*recvqp;
	int		ret = IBT_SUCCESS;

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/* the WR id carries the receive buffer pointer */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* Free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			/* flush errors are expected on teardown; log others */
			RDS_DPRINTF2("rds_poll_ctrl_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		/*
		 * NOTE(review): ret is reused here for the dispatch status
		 * and is the function's return value; this relies on the
		 * caller only comparing against IBT_CQ_EMPTY -- confirm.
		 */
		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF1(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* dispatch failed: clear the flag to allow a retry */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	/* hand the control packet up, then recycle the buffer */
	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
	rds_handle_control_message(ep->ep_sp, cpkt);

	bp->buf_state = RDS_RCVBUF_FREE;
	rds_free_recv_buf(bp, 1);

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);

	return (ret);
}
606 
607 #define	RDS_POST_FEW_ATATIME	100
608 /* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
609 void
610 rds_post_recv_buf(void *arg)
611 {
612 	ibt_channel_hdl_t	chanhdl;
613 	rds_ep_t		*ep;
614 	rds_session_t		*sp;
615 	rds_qp_t		*recvqp;
616 	rds_bufpool_t		*gp;
617 	rds_buf_t		*bp, *bp1;
618 	ibt_recv_wr_t		*wrp, wr[RDS_POST_FEW_ATATIME];
619 	rds_hca_t		*hcap;
620 	uint_t			npost, nspace, rcv_len;
621 	uint_t			ix, jx, kx;
622 	int			ret;
623 
624 	chanhdl = (ibt_channel_hdl_t)arg;
625 	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
626 	RDS_INCR_POST_RCV_BUF_CALLS();
627 
628 	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
629 	ASSERT(ep != NULL);
630 	sp = ep->ep_sp;
631 	recvqp = &ep->ep_recvqp;
632 
633 	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);
634 
635 	/* get the hcap for the HCA hosting this channel */
636 	hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid);
637 	if (hcap == NULL) {
638 		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
639 		    ep->ep_hca_guid);
640 		return;
641 	}
642 
643 	/* Make sure the session is still connected */
644 	rw_enter(&sp->session_lock, RW_READER);
645 	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
646 	    (sp->session_state != RDS_SESSION_STATE_CONNECTED)) {
647 		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
648 		    "in active state (%d)", ep, sp->session_state);
649 		rw_exit(&sp->session_lock);
650 		return;
651 	}
652 	rw_exit(&sp->session_lock);
653 
654 	/* how many can be posted */
655 	mutex_enter(&recvqp->qp_lock);
656 	nspace = recvqp->qp_depth - recvqp->qp_level;
657 	if (nspace == 0) {
658 		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
659 		recvqp->qp_taskqpending = B_FALSE;
660 		mutex_exit(&recvqp->qp_lock);
661 		return;
662 	}
663 	mutex_exit(&recvqp->qp_lock);
664 
665 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
666 		gp = &rds_dpool;
667 		rcv_len = RdsPktSize;
668 	} else {
669 		gp = &rds_cpool;
670 		rcv_len = RDS_CTRLPKT_SIZE;
671 	}
672 
673 	bp = rds_get_buf(gp, nspace, &jx);
674 	if (bp == NULL) {
675 		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
676 		/* try again later */
677 		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
678 		    (void *)ep->ep_chanhdl, DDI_NOSLEEP);
679 		if (ret != DDI_SUCCESS) {
680 			RDS_DPRINTF1(LABEL, "ddi_taskq_dispatch failed: %d",
681 			    ret);
682 			mutex_enter(&recvqp->qp_lock);
683 			recvqp->qp_taskqpending = B_FALSE;
684 			mutex_exit(&recvqp->qp_lock);
685 		}
686 		return;
687 	}
688 
689 	if (jx != nspace) {
690 		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
691 		    "needed: %d available: %d", ep, nspace, jx);
692 		nspace = jx;
693 	}
694 
695 	bp1 = bp;
696 	for (ix = 0; ix < nspace; ix++) {
697 		bp1->buf_ep = ep;
698 		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
699 		bp1->buf_state = RDS_RCVBUF_POSTED;
700 		bp1->buf_ds.ds_key = hcap->hca_lkey;
701 		bp1->buf_ds.ds_len = rcv_len;
702 		bp1 = bp1->buf_nextp;
703 	}
704 
705 #if 0
706 	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
707 	    KM_SLEEP);
708 #else
709 	wrp = &wr[0];
710 #endif
711 
712 	npost = nspace;
713 	while (npost) {
714 		jx = (npost > RDS_POST_FEW_ATATIME) ?
715 		    RDS_POST_FEW_ATATIME : npost;
716 		for (ix = 0; ix < jx; ix++) {
717 			wrp[ix].wr_id = (uintptr_t)bp;
718 			wrp[ix].wr_nds = 1;
719 			wrp[ix].wr_sgl = &bp->buf_ds;
720 			bp = bp->buf_nextp;
721 		}
722 
723 		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
724 		if ((ret != IBT_SUCCESS) || (kx != jx)) {
725 			RDS_DPRINTF1(LABEL, "ibt_post_recv for %d WRs failed: "
726 			    "%d", npost, ret);
727 			npost -= kx;
728 			break;
729 		}
730 
731 		npost -= jx;
732 	}
733 
734 	mutex_enter(&recvqp->qp_lock);
735 	if (npost != 0) {
736 		RDS_DPRINTF2("rds_post_recv_buf",
737 		    "EP(%p) Failed to post %d WRs", ep, npost);
738 		recvqp->qp_level += (nspace - npost);
739 	} else {
740 		recvqp->qp_level += nspace;
741 	}
742 
743 	/*
744 	 * sometimes, the recv WRs can get consumed as soon as they are
745 	 * posted. In that case, taskq thread to post more WRs to the RQ will
746 	 * not be scheduled as the taskqpending flag is still set.
747 	 */
748 	if (recvqp->qp_level == 0) {
749 		mutex_exit(&recvqp->qp_lock);
750 		ret = ddi_taskq_dispatch(rds_taskq,
751 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
752 		if (ret != DDI_SUCCESS) {
753 			RDS_DPRINTF1("rds_post_recv_buf",
754 			    "ddi_taskq_dispatch failed: %d", ret);
755 			mutex_enter(&recvqp->qp_lock);
756 			recvqp->qp_taskqpending = B_FALSE;
757 			mutex_exit(&recvqp->qp_lock);
758 		}
759 	} else {
760 		recvqp->qp_taskqpending = B_FALSE;
761 		mutex_exit(&recvqp->qp_lock);
762 	}
763 
764 #if 0
765 	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
766 #endif
767 
768 	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
769 }
770 
/*
 * Poll one completion off a data channel's receive CQ.  A completed
 * packet is either delivered immediately (single-packet message) or
 * chained onto the endpoint's in-progress segmented message using the
 * dh_npkts/dh_psn fields of the RDS data header: dh_psn is the packet
 * sequence number within a message, dh_npkts the number of packets
 * remaining including this one.  Returns the ibt_poll_cq() status;
 * the caller loops until IBT_CQ_EMPTY.
 */
static int
rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t	wc;
	rds_buf_t	*bp;
	rds_data_hdr_t	*pktp;
	rds_qp_t	*recvqp;
	uint_t		npolled;
	int		ret = IBT_SUCCESS;


	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/* the WR id carries the receive buffer pointer */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
	bp->buf_nextp = NULL;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			/* flush errors are expected on teardown; log others */
			RDS_DPRINTF2("rds_poll_data_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
			RDS_INCR_RXERRS();
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF1(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* dispatch failed: clear the flag to allow a retry */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
	ASSERT(pktp->dh_datalen != 0);

	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
	    pktp->dh_npkts, pktp->dh_psn);

	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
	    pktp->dh_npkts, pktp->dh_psn);

	if (pktp->dh_npkts == 1) {
		/* single pkt or last packet */
		if (pktp->dh_psn != 0) {
			/* last packet of a segmented message */
			ASSERT(ep->ep_seglbp != NULL);
			/* chain it, deliver the whole message, reset state */
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
			rds_received_msg(ep, ep->ep_segfbp);
			ep->ep_segfbp = NULL;
			ep->ep_seglbp = NULL;
		} else {
			/* single packet */
			rds_received_msg(ep, bp);
		}
	} else {
		/* multi-pkt msg */
		if (pktp->dh_psn == 0) {
			/* first packet */
			ASSERT(ep->ep_segfbp == NULL);
			/* start a new segment chain (first == last == bp) */
			ep->ep_segfbp = bp;
			ep->ep_seglbp = bp;
		} else {
			/* intermediate packet */
			ASSERT(ep->ep_segfbp != NULL);
			/* append to the in-progress chain */
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
		}
	}

	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);

	return (ret);
}
888 
889 void
890 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
891 {
892 	rds_ep_t	*ep;
893 	int		ret = IBT_SUCCESS;
894 	int		(*func)(ibt_cq_hdl_t, rds_ep_t *);
895 
896 	ep = (rds_ep_t *)arg;
897 
898 	RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
899 
900 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
901 		func = rds_poll_data_completions;
902 	} else {
903 		func = rds_poll_ctrl_completions;
904 	}
905 
906 	do {
907 		ret = func(cq, ep);
908 	} while (ret != IBT_CQ_EMPTY);
909 
910 	/* enable the CQ */
911 	ret = ibt_enable_cq_notify(cq, rds_wc_signal);
912 	if (ret != IBT_SUCCESS) {
913 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
914 		    "failed: %d", ep, cq, ret);
915 		return;
916 	}
917 
918 	do {
919 		ret = func(cq, ep);
920 	} while (ret != IBT_CQ_EMPTY);
921 
922 	RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
923 }
924 
/*
 * Drain the send CQ, polling up to RDS_NUM_DATA_SEND_WCS work
 * completions at a time.  Completed send buffers are collected on a
 * local list and returned to the send pool in a single call; RDMA
 * write (ACK) completions, identified by RDS_RDMAW_WRID, only adjust
 * ep_rdmacnt.  On a non-flush error the session is moved to the ERROR
 * state (and made the active end) so it gets re-established.  'lock'
 * is passed through to rds_free_send_buf().
 */
void
rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
{
	ibt_wc_t	wc[RDS_NUM_DATA_SEND_WCS];
	uint_t		npolled, nret, send_error = 0;
	rds_buf_t	*headp, *tailp, *bp;
	int		ret, ix;

	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);

	headp = NULL;
	tailp = NULL;
	npolled = 0;
	do {
		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
		if (ret != IBT_SUCCESS) {
			if (ret != IBT_CQ_EMPTY) {
				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: %d", ep, cq, ret);
			} else {
				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
				    ep, cq);
			}

			break;
		}

		for (ix = 0; ix < nret; ix++) {
			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
				/* ACK (RDMA write) completion: no buffer */
				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
					rds_send_acknowledgement(ep);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
				/* flush error: channel going down, reclaim */
				RDS_INCR_TXERRS();
				RDS_DPRINTF5("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);

				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else {
				/* hard send error: fail the session */
				RDS_INCR_TXERRS();
				RDS_DPRINTF2("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);
				if (send_error == 0) {
					rds_session_t	*sp = ep->ep_sp;

					/* don't let anyone send anymore */
					rw_enter(&sp->session_lock, RW_WRITER);
					if (sp->session_state !=
					    RDS_SESSION_STATE_ERROR) {
						sp->session_state =
						    RDS_SESSION_STATE_ERROR;
						/* Make this the active end */
						sp->session_type =
						    RDS_SESSION_ACTIVE;
					}
					rw_exit(&sp->session_lock);
				}

				send_error++;

				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			}

			/* append the reclaimed buffer to the local list */
			bp->buf_nextp = NULL;
			if (headp) {
				tailp->buf_nextp = bp;
				tailp = bp;
			} else {
				headp = bp;
				tailp = bp;
			}

			npolled++;
		}

		/*
		 * NOTE(review): when rds_no_interrupts == 1 the second
		 * check below always breaks first, making this npolled
		 * threshold effective only for values > 1 -- confirm
		 * intended tunable semantics.
		 */
		if (rds_no_interrupts && (npolled > 100)) {
			break;
		}

		if (rds_no_interrupts == 1) {
			break;
		}
	} while (ret != IBT_CQ_EMPTY);

	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
	    npolled, send_error);

	/* put the buffers to the pool */
	if (npolled != 0) {
		rds_free_send_buf(ep, headp, tailp, npolled, lock);
	}

	if (send_error != 0) {
		rds_handle_send_error(ep);
	}

	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
}
1049 
1050 void
1051 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
1052 {
1053 	rds_ep_t	*ep;
1054 	int		ret;
1055 
1056 	ep = (rds_ep_t *)arg;
1057 
1058 	RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
1059 
1060 	/* enable the CQ */
1061 	ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
1062 	if (ret != IBT_SUCCESS) {
1063 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1064 		    "failed: %d", ep, cq, ret);
1065 		return;
1066 	}
1067 
1068 	rds_poll_send_completions(cq, ep, B_FALSE);
1069 
1070 	RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
1071 }
1072 
1073 void
1074 rds_ep_free_rc_channel(rds_ep_t *ep)
1075 {
1076 	int ret;
1077 
1078 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
1079 
1080 	ASSERT(mutex_owned(&ep->ep_lock));
1081 
1082 	/* free the QP */
1083 	if (ep->ep_chanhdl != NULL) {
1084 		/* wait until the RQ is empty */
1085 		(void) ibt_flush_channel(ep->ep_chanhdl);
1086 		(void) rds_is_recvq_empty(ep, B_TRUE);
1087 		ret = ibt_free_channel(ep->ep_chanhdl);
1088 		if (ret != IBT_SUCCESS) {
1089 			RDS_DPRINTF1("rds_ep_free_rc_channel", "EP(%p) "
1090 			    "ibt_free_channel returned: %d", ep, ret);
1091 		}
1092 		ep->ep_chanhdl = NULL;
1093 	} else {
1094 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1095 		    "EP(%p) Channel is ALREADY FREE", ep);
1096 	}
1097 
1098 	/* free the Send CQ */
1099 	if (ep->ep_sendcq != NULL) {
1100 		ret = ibt_free_cq(ep->ep_sendcq);
1101 		if (ret != IBT_SUCCESS) {
1102 			RDS_DPRINTF1("rds_ep_free_rc_channel",
1103 			    "EP(%p) - for sendcq, ibt_free_cq returned %d",
1104 			    ep, ret);
1105 		}
1106 		ep->ep_sendcq = NULL;
1107 	} else {
1108 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1109 		    "EP(%p) SendCQ is ALREADY FREE", ep);
1110 	}
1111 
1112 	/* free the Recv CQ */
1113 	if (ep->ep_recvcq != NULL) {
1114 		ret = ibt_free_cq(ep->ep_recvcq);
1115 		if (ret != IBT_SUCCESS) {
1116 			RDS_DPRINTF1("rds_ep_free_rc_channel",
1117 			    "EP(%p) - for recvcq, ibt_free_cq returned %d",
1118 			    ep, ret);
1119 		}
1120 		ep->ep_recvcq = NULL;
1121 	} else {
1122 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1123 		    "EP(%p) RecvCQ is ALREADY FREE", ep);
1124 	}
1125 
1126 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
1127 }
1128 
1129 /* Allocate resources for RC channel */
1130 ibt_channel_hdl_t
1131 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
1132 {
1133 	int				ret = IBT_SUCCESS;
1134 	ibt_cq_attr_t			scqattr, rcqattr;
1135 	ibt_rc_chan_alloc_args_t	chanargs;
1136 	ibt_channel_hdl_t		chanhdl;
1137 	rds_session_t			*sp;
1138 	rds_hca_t			*hcap;
1139 
1140 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1141 	    ep, hca_port);
1142 
1143 	/* Update the EP with the right IP address and HCA guid */
1144 	sp = ep->ep_sp;
1145 	ASSERT(sp != NULL);
1146 	rw_enter(&sp->session_lock, RW_READER);
1147 	mutex_enter(&ep->ep_lock);
1148 	ep->ep_myip = sp->session_myip;
1149 	ep->ep_remip = sp->session_remip;
1150 	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
1151 	ep->ep_hca_guid = hcap->hca_guid;
1152 	mutex_exit(&ep->ep_lock);
1153 	rw_exit(&sp->session_lock);
1154 
1155 	/* reset taskqpending flag here */
1156 	ep->ep_recvqp.qp_taskqpending = B_FALSE;
1157 
1158 	if (ep->ep_type == RDS_EP_TYPE_CTRL) {
1159 		scqattr.cq_size = MaxCtrlSendBuffers;
1160 		scqattr.cq_sched = NULL;
1161 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1162 
1163 		rcqattr.cq_size = MaxCtrlRecvBuffers;
1164 		rcqattr.cq_sched = NULL;
1165 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1166 
1167 		chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
1168 		chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
1169 		chanargs.rc_sizes.cs_sq_sgl = 1;
1170 		chanargs.rc_sizes.cs_rq_sgl = 1;
1171 	} else {
1172 		scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
1173 		scqattr.cq_sched = NULL;
1174 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1175 
1176 		rcqattr.cq_size = MaxDataRecvBuffers;
1177 		rcqattr.cq_sched = NULL;
1178 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1179 
1180 		chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
1181 		chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
1182 		chanargs.rc_sizes.cs_sq_sgl = 1;
1183 		chanargs.rc_sizes.cs_rq_sgl = 1;
1184 	}
1185 
1186 	mutex_enter(&ep->ep_lock);
1187 	if (ep->ep_sendcq == NULL) {
1188 		/* returned size is always greater than the requested size */
1189 		ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
1190 		    &ep->ep_sendcq, NULL);
1191 		if (ret != IBT_SUCCESS) {
1192 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
1193 			    "failed, size = %d: %d", scqattr.cq_size, ret);
1194 			mutex_exit(&ep->ep_lock);
1195 			return (NULL);
1196 		}
1197 
1198 		(void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
1199 		    ep);
1200 
1201 		if (rds_no_interrupts == 0) {
1202 			ret = ibt_enable_cq_notify(ep->ep_sendcq,
1203 			    IBT_NEXT_COMPLETION);
1204 			if (ret != IBT_SUCCESS) {
1205 				RDS_DPRINTF2(LABEL,
1206 				    "ibt_enable_cq_notify failed: %d", ret);
1207 				(void) ibt_free_cq(ep->ep_sendcq);
1208 				ep->ep_sendcq = NULL;
1209 				mutex_exit(&ep->ep_lock);
1210 				return (NULL);
1211 			}
1212 		}
1213 	}
1214 
1215 	if (ep->ep_recvcq == NULL) {
1216 		/* returned size is always greater than the requested size */
1217 		ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
1218 		    &ep->ep_recvcq, NULL);
1219 		if (ret != IBT_SUCCESS) {
1220 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
1221 			    "failed, size = %d: %d", rcqattr.cq_size, ret);
1222 			(void) ibt_free_cq(ep->ep_sendcq);
1223 			ep->ep_sendcq = NULL;
1224 			mutex_exit(&ep->ep_lock);
1225 			return (NULL);
1226 		}
1227 
1228 		(void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
1229 		    ep);
1230 
1231 		ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
1232 		if (ret != IBT_SUCCESS) {
1233 			RDS_DPRINTF2(LABEL,
1234 			    "ibt_enable_cq_notify failed: %d", ret);
1235 			(void) ibt_free_cq(ep->ep_recvcq);
1236 			ep->ep_recvcq = NULL;
1237 			(void) ibt_free_cq(ep->ep_sendcq);
1238 			ep->ep_sendcq = NULL;
1239 			mutex_exit(&ep->ep_lock);
1240 			return (NULL);
1241 		}
1242 	}
1243 
1244 	chanargs.rc_flags = IBT_ALL_SIGNALED;
1245 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1246 	    IBT_CEP_ATOMIC;
1247 	chanargs.rc_hca_port_num = hca_port;
1248 	chanargs.rc_scq = ep->ep_sendcq;
1249 	chanargs.rc_rcq = ep->ep_recvcq;
1250 	chanargs.rc_pd = hcap->hca_pdhdl;
1251 	chanargs.rc_srq = NULL;
1252 
1253 	ret = ibt_alloc_rc_channel(hcap->hca_hdl,
1254 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
1255 	if (ret != IBT_SUCCESS) {
1256 		RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
1257 		    ret);
1258 		(void) ibt_free_cq(ep->ep_recvcq);
1259 		ep->ep_recvcq = NULL;
1260 		(void) ibt_free_cq(ep->ep_sendcq);
1261 		ep->ep_sendcq = NULL;
1262 		mutex_exit(&ep->ep_lock);
1263 		return (NULL);
1264 	}
1265 	mutex_exit(&ep->ep_lock);
1266 
1267 	/* Chan private should contain the ep */
1268 	(void) ibt_set_chan_private(chanhdl, ep);
1269 
1270 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
1271 
1272 	return (chanhdl);
1273 }
1274 
1275 
#if 0	/* disabled: currently unreferenced; kept for future use */

/* Return node guid given a port gid */
ib_guid_t
rds_gid_to_node_guid(ib_gid_t gid)
{
	ibt_node_info_t	nodeinfo;
	int		ret;

	RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
	    gid.gid_prefix, gid.gid_guid);

	/* resolve the port gid to its node via the IBTF SA query */
	ret = ibt_gid_to_node_info(gid, &nodeinfo);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
		    "failed", gid.gid_prefix, gid.gid_guid);
		/* 0 is the "no guid" sentinel for callers */
		return (0LL);
	}

	RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
	    nodeinfo.n_node_guid);

	return (nodeinfo.n_node_guid);
}

#endif
1302 
1303 static void
1304 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
1305     ibt_async_event_t *event)
1306 {
1307 	rds_hca_t		*hcap;
1308 	ibt_hca_portinfo_t	*newpinfop, *oldpinfop;
1309 	uint_t			newsize, oldsize, nport;
1310 	ib_gid_t		gid;
1311 	int			ret;
1312 
1313 	RDS_DPRINTF2("rds_handle_portup_event",
1314 	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
1315 
1316 	/* If RDS service is not registered then no bind is needed */
1317 	if (statep->rds_srvhdl == NULL) {
1318 		RDS_DPRINTF2("rds_handle_portup_event",
1319 		    "RDS Service is not registered, so no action needed");
1320 		return;
1321 	}
1322 
1323 	hcap = rds_get_hcap(statep, event->ev_hca_guid);
1324 	if (hcap == NULL) {
1325 		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
1326 		    "not in our list", event->ev_hca_guid);
1327 		return;
1328 	}
1329 
1330 	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
1331 	if (ret != IBT_SUCCESS) {
1332 		RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
1333 		return;
1334 	}
1335 
1336 	oldpinfop = hcap->hca_pinfop;
1337 	oldsize = hcap->hca_pinfo_sz;
1338 	hcap->hca_pinfop = newpinfop;
1339 	hcap->hca_pinfo_sz = newsize;
1340 
1341 	/* structure copy */
1342 	gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];
1343 
1344 	/* bind RDS service on the port, pass statep as cm_private */
1345 	ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep, NULL);
1346 	if (ret != IBT_SUCCESS) {
1347 		RDS_DPRINTF2(LABEL, "Bind service for HCA: 0x%llx Port: %d "
1348 		    "gid %llx:%llx returned: %d", event->ev_hca_guid,
1349 		    event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
1350 	}
1351 
1352 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1353 
1354 	RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
1355 	    event->ev_hca_guid);
1356 }
1357 
1358 static void
1359 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1360     ibt_async_event_t *event)
1361 {
1362 	rds_state_t		*statep;
1363 
1364 	RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
1365 
1366 	switch (code) {
1367 	case IBT_EVENT_PORT_UP:
1368 		statep = (rds_state_t *)clntp;
1369 		rds_handle_portup_event(statep, hdl, event);
1370 		break;
1371 
1372 	default:
1373 		RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
1374 	}
1375 
1376 	RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
1377 }
1378