xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision cd03c4aeba7c3ecfdadc0536a3bd40987f6c6063)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 
82 /*
83  * This file contains CM related work:
84  *
85  * Service registration/deregistration
86  * Path lookup
87  * CM connection callbacks
88  * CM active and passive connection establishment
89  * Connection failover
90  */
91 
92 #define	SRCIP	src_addr.un.ip4addr
93 #define	DSTIP	dst_addr.un.ip4addr
94 
95 /*
96  * Handle an incoming CM REQ
97  */
98 /* ARGSUSED */
99 static ibt_cm_status_t
100 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
101     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
102 {
103 	ibt_cm_req_rcv_t	*reqp;
104 	ib_gid_t		lgid, rgid;
105 	rds_cm_private_data_t	cmp;
106 	rds_session_t		*sp;
107 	rds_ep_t		*ep;
108 	ibt_channel_hdl_t	chanhdl;
109 	ibt_ip_cm_info_t	ipcm_info;
110 	int			ret;
111 
112 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
113 
114 	reqp = &evp->cm_event.req;
115 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
116 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
117 
118 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
119 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
120 
121 	/* validate service id */
122 	if (reqp->req_service_id == RDS_SERVICE_ID) {
123 		RDS_DPRINTF0(LABEL, "Version Mismatch: Remote system "
124 		    "(GUID: 0x%llx) is running an older version of RDS",
125 		    rgid.gid_guid);
126 		return (IBT_CM_REJECT);
127 	}
128 
129 	/*
130 	 * CM private data brings IP information
131 	 * Private data received is a stream of bytes and may not be properly
132 	 * aligned. So, bcopy the data onto the stack before accessing it.
133 	 */
134 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
135 	    sizeof (rds_cm_private_data_t));
136 
137 	/* extract the CM IP info */
138 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
139 	    &ipcm_info);
140 	if (ret != IBT_SUCCESS) {
141 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
142 		    ret);
143 		return (IBT_CM_REJECT);
144 	}
145 
146 	RDS_DPRINTF2("rds_handle_cm_req",
147 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
148 	    ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
149 
150 	if (cmp.cmp_version != RDS_VERSION) {
151 		RDS_DPRINTF0(LABEL, "Version Mismatch: Local version: %d "
152 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
153 		return (IBT_CM_REJECT);
154 	}
155 
156 	/* RDS supports V4 addresses only */
157 	if ((ipcm_info.src_addr.family != AF_INET) ||
158 	    (ipcm_info.dst_addr.family != AF_INET)) {
159 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
160 		    "src: %d dst: %d", ipcm_info.src_addr.family,
161 		    ipcm_info.dst_addr.family);
162 		return (IBT_CM_REJECT);
163 	}
164 
165 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
166 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
167 		    cmp.cmp_arch, RDS_THIS_ARCH);
168 		return (IBT_CM_REJECT);
169 	}
170 
171 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
172 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
173 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
174 		return (IBT_CM_REJECT);
175 	}
176 
177 	/* user_buffer_size should be same on all nodes */
178 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
179 		RDS_DPRINTF2(LABEL,
180 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
181 		    UserBufferSize, cmp.cmp_user_buffer_size);
182 		return (IBT_CM_REJECT);
183 	}
184 
185 	/*
186 	 * RDS needs more time to process a failover REQ so send an MRA.
187 	 * Otherwise, the remote may retry the REQ and fail the connection.
188 	 */
189 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
190 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
191 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
192 		    10000000 /* 10 sec */, NULL, 0);
193 	}
194 
195 	/* Is there a session to the destination node? */
196 	rw_enter(&statep->rds_sessionlock, RW_READER);
197 	sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
198 	rw_exit(&statep->rds_sessionlock);
199 
200 	if (sp == NULL) {
201 		/*
202 		 * currently there is no session to the destination
203 		 * remote ip in the private data is the local ip and vice
204 		 * versa
205 		 */
206 		sp = rds_session_create(statep, ipcm_info.DSTIP,
207 		    ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
208 		if (sp == NULL) {
209 			/* Check the list anyway. */
210 			rw_enter(&statep->rds_sessionlock, RW_READER);
211 			sp = rds_session_lkup(statep, ipcm_info.SRCIP,
212 			    rgid.gid_guid);
213 			rw_exit(&statep->rds_sessionlock);
214 			if (sp == NULL) {
215 				/*
216 				 * The only way this can fail is due to lack
217 				 * of kernel resources
218 				 */
219 				return (IBT_CM_REJECT);
220 			}
221 		}
222 	}
223 
224 	rw_enter(&sp->session_lock, RW_WRITER);
225 
226 	/* catch peer-to-peer case as soon as possible */
227 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
228 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
229 		/* Check possible peer-to-peer case here */
230 		if (sp->session_type != RDS_SESSION_PASSIVE) {
231 			RDS_DPRINTF2("rds_handle_cm_req",
232 			    "SP(%p) Peer-peer connection handling", sp);
233 			if (lgid.gid_guid > rgid.gid_guid) {
234 				/* this node is active so reject this request */
235 				rw_exit(&sp->session_lock);
236 				return (IBT_CM_REJECT);
237 			} else {
238 				/* this node is passive, change the session */
239 				sp->session_type = RDS_SESSION_PASSIVE;
240 				sp->session_lgid = lgid;
241 				sp->session_rgid = rgid;
242 			}
243 		}
244 	}
245 
246 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
247 
248 	switch (sp->session_state) {
249 	case RDS_SESSION_STATE_CONNECTED:
250 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
251 		sp->session_state = RDS_SESSION_STATE_ERROR;
252 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
253 		    "RDS_SESSION_STATE_ERROR", sp);
254 
255 		/* FALLTHRU */
256 	case RDS_SESSION_STATE_ERROR:
257 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
258 		sp->session_type = RDS_SESSION_PASSIVE;
259 		rw_exit(&sp->session_lock);
260 
261 		rds_session_close(sp, IBT_NOCALLBACKS, 1);
262 
263 		/* move the session to init state */
264 		rw_enter(&sp->session_lock, RW_WRITER);
265 		ret = rds_session_reinit(sp, lgid);
266 		sp->session_myip = ipcm_info.DSTIP;
267 		sp->session_lgid = lgid;
268 		sp->session_rgid = rgid;
269 		if (ret != 0) {
270 			rds_session_fini(sp);
271 			sp->session_state = RDS_SESSION_STATE_FAILED;
272 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
273 			    "RDS_SESSION_STATE_FAILED", sp);
274 			rw_exit(&sp->session_lock);
275 			return (IBT_CM_REJECT);
276 		} else {
277 			sp->session_state = RDS_SESSION_STATE_INIT;
278 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
279 			    "RDS_SESSION_STATE_INIT", sp);
280 		}
281 
282 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
283 			ep = &sp->session_ctrlep;
284 		} else {
285 			ep = &sp->session_dataep;
286 		}
287 		break;
288 	case RDS_SESSION_STATE_CREATED:
289 	case RDS_SESSION_STATE_FAILED:
290 	case RDS_SESSION_STATE_FINI:
291 		/*
292 		 * Initialize both channels, we accept this connection
293 		 * only if both channels are initialized
294 		 */
295 		sp->session_type = RDS_SESSION_PASSIVE;
296 		sp->session_lgid = lgid;
297 		sp->session_rgid = rgid;
298 		sp->session_state = RDS_SESSION_STATE_CREATED;
299 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
300 		    "RDS_SESSION_STATE_CREATED", sp);
301 		ret = rds_session_init(sp);
302 		if (ret != 0) {
303 			/* Seems like there are not enough resources */
304 			sp->session_state = RDS_SESSION_STATE_FAILED;
305 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
306 			    "RDS_SESSION_STATE_FAILED", sp);
307 			rw_exit(&sp->session_lock);
308 			return (IBT_CM_REJECT);
309 		}
310 		sp->session_state = RDS_SESSION_STATE_INIT;
311 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
312 		    "RDS_SESSION_STATE_INIT", sp);
313 
314 		/* FALLTHRU */
315 	case RDS_SESSION_STATE_INIT:
316 		/*
317 		 * When re-using an existing session, make sure the
318 		 * session is still through the same HCA. Otherwise, the
319 		 * memory registrations have to moved to the new HCA.
320 		 */
321 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
322 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
323 				RDS_DPRINTF2("rds_handle_cm_req",
324 				    "Existing Session but different gid "
325 				    "existing: 0x%llx, new: 0x%llx, "
326 				    "sending an MRA",
327 				    sp->session_lgid.gid_guid, lgid.gid_guid);
328 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
329 				    evp->cm_session_id, 10000000 /* 10 sec */,
330 				    NULL, 0);
331 				ret = rds_session_reinit(sp, lgid);
332 				if (ret != 0) {
333 					rds_session_fini(sp);
334 					sp->session_state =
335 					    RDS_SESSION_STATE_FAILED;
336 					sp->session_failover = 0;
337 					RDS_DPRINTF3("rds_failover_session",
338 					    "SP(%p) State "
339 					    "RDS_SESSION_STATE_FAILED", sp);
340 					rw_exit(&sp->session_lock);
341 					return (IBT_CM_REJECT);
342 				}
343 			}
344 			ep = &sp->session_dataep;
345 		} else {
346 			ep = &sp->session_ctrlep;
347 		}
348 
349 		break;
350 	default:
351 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
352 		    "state: %d", sp, sp->session_state);
353 		rw_exit(&sp->session_lock);
354 		return (IBT_CM_REJECT);
355 	}
356 
357 	sp->session_failover = 0; /* reset any previous value */
358 	if (cmp.cmp_failover) {
359 		RDS_DPRINTF2("rds_handle_cm_req",
360 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
361 		sp->session_failover = 1;
362 	}
363 
364 	mutex_enter(&ep->ep_lock);
365 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
366 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
367 		sp->session_type = RDS_SESSION_PASSIVE;
368 		rw_exit(&sp->session_lock);
369 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
370 		rw_exit(&sp->session_lock);
371 		/*
372 		 * Peer to peer connection. There is an active
373 		 * connection pending on this ep. The one with
374 		 * greater port guid becomes active and the
375 		 * other becomes passive.
376 		 */
377 		RDS_DPRINTF2("rds_handle_cm_req",
378 		    "EP(%p) Peer-peer connection handling", ep);
379 		if (lgid.gid_guid > rgid.gid_guid) {
380 			/* this node is active so reject this request */
381 			mutex_exit(&ep->ep_lock);
382 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
383 			    "Rejecting passive in favor of active", sp, ep);
384 			return (IBT_CM_REJECT);
385 		} else {
386 			/*
387 			 * This session is not the active end, change it
388 			 * to passive end.
389 			 */
390 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
391 
392 			rw_enter(&sp->session_lock, RW_WRITER);
393 			sp->session_type = RDS_SESSION_PASSIVE;
394 			sp->session_lgid = lgid;
395 			sp->session_rgid = rgid;
396 			rw_exit(&sp->session_lock);
397 		}
398 	} else {
399 		rw_exit(&sp->session_lock);
400 	}
401 
402 	ep->ep_lbufid = cmp.cmp_last_bufid;
403 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
404 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
405 	cmp.cmp_last_bufid = ep->ep_rbufid;
406 	cmp.cmp_ack_addr = ep->ep_ack_addr;
407 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
408 	mutex_exit(&ep->ep_lock);
409 
410 	/* continue with accepting the connection request for this channel */
411 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
412 	if (chanhdl == NULL) {
413 		mutex_enter(&ep->ep_lock);
414 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
415 		mutex_exit(&ep->ep_lock);
416 		return (IBT_CM_REJECT);
417 	}
418 
419 	/* pre-post recv buffers in the RQ */
420 	rds_post_recv_buf((void *)chanhdl);
421 
422 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
423 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
424 	rargsp->cm_ret.rep.cm_channel = chanhdl;
425 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
426 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
427 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
428 
429 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
430 	    sp, ep, chanhdl);
431 
432 	return (IBT_CM_ACCEPT);
433 }
434 
435 /*
436  * Handle an incoming CM REP
437  * Pre-post recv buffers for the QP
438  */
439 /* ARGSUSED */
440 static ibt_cm_status_t
441 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
442     void *rcmp, ibt_priv_data_len_t rcmp_len)
443 {
444 	rds_ep_t	*ep;
445 	rds_cm_private_data_t	cmp;
446 
447 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
448 
449 	/* pre-post recv buffers in the RQ */
450 	rds_post_recv_buf((void *)evp->cm_channel);
451 
452 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
453 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
454 	    sizeof (rds_cm_private_data_t));
455 	ep->ep_lbufid = cmp.cmp_last_bufid;
456 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
457 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
458 
459 	rargsp->cm_ret_len = 0;
460 
461 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
462 
463 	return (IBT_CM_ACCEPT);
464 }
465 
466 /*
467  * Handle CONN EST
468  */
469 static ibt_cm_status_t
470 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
471 {
472 	rds_session_t	*sp;
473 	rds_ep_t	*ep;
474 
475 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
476 
477 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
478 	    ep->ep_state);
479 
480 	mutex_enter(&ep->ep_lock);
481 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
482 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
483 	ep->ep_state = RDS_EP_STATE_CONNECTED;
484 	ep->ep_chanhdl = evp->cm_channel;
485 	sp = ep->ep_sp;
486 	mutex_exit(&ep->ep_lock);
487 
488 	(void) rds_session_active(sp);
489 
490 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
491 	return (IBT_CM_ACCEPT);
492 }
493 
494 /*
495  * Handle CONN CLOSED
496  */
497 static ibt_cm_status_t
498 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
499 {
500 	rds_ep_t	*ep;
501 	rds_session_t	*sp;
502 
503 	/* Catch DREQs but ignore DREPs */
504 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
505 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
506 		    "Ignoring Event: %d received", evp->cm_event.closed);
507 		return (IBT_CM_ACCEPT);
508 	}
509 
510 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
511 	sp = ep->ep_sp;
512 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Enter", ep);
513 
514 	mutex_enter(&ep->ep_lock);
515 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
516 		/* Ignore this DREQ */
517 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
518 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
519 		mutex_exit(&ep->ep_lock);
520 		return (IBT_CM_ACCEPT);
521 	}
522 	ep->ep_state = RDS_EP_STATE_CLOSING;
523 	mutex_exit(&ep->ep_lock);
524 
525 	rw_enter(&sp->session_lock, RW_WRITER);
526 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
527 	    sp->session_state);
528 
529 	switch (sp->session_state) {
530 	case RDS_SESSION_STATE_CONNECTED:
531 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
532 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
533 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
534 		break;
535 
536 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
537 		sp->session_state = RDS_SESSION_STATE_CLOSED;
538 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
539 		    "RDS_SESSION_STATE_CLOSED", sp);
540 		rds_passive_session_fini(sp);
541 		sp->session_state = RDS_SESSION_STATE_FINI;
542 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
543 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
544 		break;
545 
546 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
547 	case RDS_SESSION_STATE_ERROR:
548 	case RDS_SESSION_STATE_CLOSED:
549 		break;
550 
551 	case RDS_SESSION_STATE_INIT:
552 		sp->session_state = RDS_SESSION_STATE_ERROR;
553 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
554 		    "RDS_SESSION_STATE_ERROR", sp);
555 		rds_passive_session_fini(sp);
556 		sp->session_state = RDS_SESSION_STATE_FAILED;
557 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
558 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
559 		break;
560 
561 	default:
562 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
563 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
564 		rds_passive_session_fini(sp);
565 		sp->session_state = RDS_SESSION_STATE_FAILED;
566 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
567 		    "RDS_SESSION_STATE_FAILED", sp);
568 	}
569 	rw_exit(&sp->session_lock);
570 
571 	mutex_enter(&ep->ep_lock);
572 	ep->ep_state = RDS_EP_STATE_CLOSED;
573 	mutex_exit(&ep->ep_lock);
574 
575 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
576 	return (IBT_CM_ACCEPT);
577 }
578 
579 /*
580  * Handle EVENT FAILURE
581  */
582 static ibt_cm_status_t
583 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
584 {
585 	rds_ep_t	*ep;
586 	rds_session_t	*sp;
587 	int		ret;
588 
589 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
590 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
591 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
592 	    evp->cm_event.failed.cf_reason);
593 
594 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
595 		RDS_DPRINTF0(LABEL,
596 		    "Received REJ with reason IBT_CM_INVALID_SID: "
597 		    "The remote system could be running an older RDS version");
598 	}
599 
600 	if (evp->cm_channel == NULL) {
601 		return (IBT_CM_ACCEPT);
602 	}
603 
604 	if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
605 	    (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
606 		/*
607 		 * This end is active, just ignore, ibt_open_rc_channel()
608 		 * caller will take care of cleanup.
609 		 */
610 		RDS_DPRINTF2("rds_handle_cm_event_failure",
611 		    "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
612 		return (IBT_CM_ACCEPT);
613 	}
614 
615 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
616 	sp = ep->ep_sp;
617 
618 	rw_enter(&sp->session_lock, RW_WRITER);
619 	if (sp->session_type == RDS_SESSION_PASSIVE) {
620 		RDS_DPRINTF2("rds_handle_cm_event_failure",
621 		    "SP(%p) - state: %d", sp, sp->session_state);
622 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
623 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
624 			sp->session_state = RDS_SESSION_STATE_ERROR;
625 			RDS_DPRINTF3("rds_handle_cm_event_failure",
626 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
627 
628 			/*
629 			 * Store the cm_channel for freeing later
630 			 * Active side frees it on ibt_open_rc_channel
631 			 * failure
632 			 */
633 			if (ep->ep_chanhdl == NULL) {
634 				ep->ep_chanhdl = evp->cm_channel;
635 			}
636 			rw_exit(&sp->session_lock);
637 
638 			/*
639 			 * rds_passive_session_fini should not be called
640 			 * directly in the CM handler. It will cause a deadlock.
641 			 */
642 			ret = ddi_taskq_dispatch(rds_taskq,
643 			    rds_cleanup_passive_session, (void *)sp,
644 			    DDI_NOSLEEP);
645 			if (ret != DDI_SUCCESS) {
646 				RDS_DPRINTF1("rds_handle_cm_event_failure",
647 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
648 			}
649 			return (IBT_CM_ACCEPT);
650 		}
651 	}
652 	rw_exit(&sp->session_lock);
653 
654 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
655 	return (IBT_CM_ACCEPT);
656 }
657 
658 /*
659  * CM Handler
660  *
661  * Called by IBCM
662  * The cm_private type differs for active and passive events.
663  */
664 ibt_cm_status_t
665 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
666     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
667     ibt_priv_data_len_t ret_len_max)
668 {
669 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
670 
671 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
672 
673 	switch (eventp->cm_type) {
674 	case IBT_CM_EVENT_REQ_RCV:
675 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
676 		    ret_args, ret_priv_data, ret_len_max);
677 		break;
678 	case IBT_CM_EVENT_REP_RCV:
679 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
680 		    ret_len_max);
681 		break;
682 	case IBT_CM_EVENT_MRA_RCV:
683 		/* Not supported */
684 		break;
685 	case IBT_CM_EVENT_CONN_EST:
686 		ret = rds_handle_cm_conn_est(eventp);
687 		break;
688 	case IBT_CM_EVENT_CONN_CLOSED:
689 		ret = rds_handle_cm_conn_closed(eventp);
690 		break;
691 	case IBT_CM_EVENT_FAILURE:
692 		ret = rds_handle_cm_event_failure(eventp);
693 		break;
694 	case IBT_CM_EVENT_LAP_RCV:
695 		/* Not supported */
696 		RDS_DPRINTF2(LABEL, "LAP message received");
697 		break;
698 	case IBT_CM_EVENT_APR_RCV:
699 		/* Not supported */
700 		RDS_DPRINTF2(LABEL, "APR message received");
701 		break;
702 	default:
703 		break;
704 	}
705 
706 	RDS_DPRINTF2("rds_cm_handler", "Return");
707 
708 	return (ret);
709 }
710 
711 /* This is based on OFED Linux RDS */
712 #define	RDS_PORT_NUM	6556
713 
714 /*
715  * Register the wellknown service with service id: RDS_SERVICE_ID
716  * Incoming connection requests should arrive on this service id.
717  */
718 ibt_srv_hdl_t
719 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
720 {
721 	ibt_srv_hdl_t	srvhdl;
722 	ibt_srv_desc_t	srvdesc;
723 	int		ret;
724 
725 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
726 
727 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
728 	srvdesc.sd_handler = rds_cm_handler;
729 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
730 
731 	/*
732 	 * Register the old service id for backward compatibility
733 	 * REQs received on this service id would be rejected
734 	 */
735 	ret = ibt_register_service(rds_ibhdl, &srvdesc, RDS_SERVICE_ID,
736 	    1, &rdsib_statep->rds_old_srvhdl, NULL);
737 	if (ret != IBT_SUCCESS) {
738 		RDS_DPRINTF2(LABEL,
739 		    "RDS Service (0x%llx) Registration Failed: %d",
740 		    RDS_SERVICE_ID, ret);
741 		return (NULL);
742 	}
743 
744 	/*
745 	 * This is the new service id as per:
746 	 * Annex A11: RDMA IP CM Service
747 	 */
748 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
749 	    RDS_PORT_NUM);
750 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
751 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
752 	if (ret != IBT_SUCCESS) {
753 		RDS_DPRINTF2(LABEL,
754 		    "RDS Service (0x%llx) Registration Failed: %d",
755 		    rdsib_statep->rds_service_id, ret);
756 		return (NULL);
757 	}
758 
759 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
760 	return (srvhdl);
761 }
762 
763 /* Bind the RDS service on all ports */
764 int
765 rds_bind_service(rds_state_t *statep)
766 {
767 	rds_hca_t	*hcap;
768 	ib_gid_t	gid;
769 	uint_t		jx, nbinds = 0, nports = 0;
770 	int		ret;
771 
772 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
773 
774 	hcap = statep->rds_hcalistp;
775 	while (hcap != NULL) {
776 		for (jx = 0; jx < hcap->hca_nports; jx++) {
777 			nports++;
778 			if (hcap->hca_pinfop[jx].p_linkstate !=
779 			    IBT_PORT_ACTIVE) {
780 				/*
781 				 * service bind will be called in the async
782 				 * handler when the port comes up
783 				 */
784 				continue;
785 			}
786 
787 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
788 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
789 			    "gid: %llx:%llx", hcap->hca_guid,
790 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
791 			    gid.gid_guid);
792 
793 			/* pass statep as cm_private */
794 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
795 			    NULL, statep, NULL);
796 			if (ret != IBT_SUCCESS) {
797 				RDS_DPRINTF2(LABEL, "Bind service for "
798 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
799 				    "failed: %d", hcap->hca_guid,
800 				    hcap->hca_pinfop[jx].p_port_num,
801 				    gid.gid_prefix, gid.gid_guid, ret);
802 				continue;
803 			}
804 
805 			nbinds++;
806 
807 			/* bind the old service, ignore if it fails */
808 			ret = ibt_bind_service(statep->rds_old_srvhdl, gid,
809 			    NULL, statep, NULL);
810 			if (ret != IBT_SUCCESS) {
811 				RDS_DPRINTF2(LABEL, "Bind service for "
812 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
813 				    "failed: %d", hcap->hca_guid,
814 				    hcap->hca_pinfop[jx].p_port_num,
815 				    gid.gid_prefix, gid.gid_guid, ret);
816 			}
817 		}
818 		hcap = hcap->hca_nextp;
819 	}
820 
821 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
822 	    nbinds, nports);
823 
824 #if 0
825 	if (nbinds == 0) {
826 		return (-1);
827 	}
828 #endif
829 
830 	RDS_DPRINTF2("rds_bind_service", "Return");
831 
832 	return (0);
833 }
834 
835 /* Open an RC connection */
836 int
837 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
838     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
839 {
840 	rds_session_t		*sp;
841 	ibt_chan_open_args_t	ocargs;
842 	ibt_rc_returns_t	ocrets;
843 	rds_cm_private_data_t	cmp;
844 	uint8_t			hca_port;
845 	ibt_channel_hdl_t	hdl;
846 	ibt_status_t		ret = 0;
847 	ibt_ip_cm_info_t	ipcm_info;
848 
849 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
850 
851 	sp = ep->ep_sp;
852 
853 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
854 	ipcm_info.src_addr.family = AF_INET;
855 	ipcm_info.SRCIP = sp->session_myip;
856 	ipcm_info.dst_addr.family = AF_INET;
857 	ipcm_info.DSTIP = sp->session_remip;
858 	ipcm_info.src_port = 6556; /* based on OFED RDS */
859 	ret = ibt_format_ip_private_data(&ipcm_info,
860 	    sizeof (rds_cm_private_data_t), &cmp);
861 	if (ret != IBT_SUCCESS) {
862 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
863 		    "failed: %d", sp, ep, ret);
864 		return (-1);
865 	}
866 
867 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
868 
869 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
870 	if (hdl == NULL) {
871 		return (-1);
872 	}
873 
874 	cmp.cmp_version = RDS_VERSION;
875 	cmp.cmp_arch = RDS_THIS_ARCH;
876 	cmp.cmp_eptype = ep->ep_type;
877 	cmp.cmp_failover = sp->session_failover;
878 	cmp.cmp_last_bufid = ep->ep_rbufid;
879 	cmp.cmp_user_buffer_size = UserBufferSize;
880 	cmp.cmp_ack_addr = ep->ep_ack_addr;
881 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
882 
883 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
884 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
885 	ocargs.oc_path = pinfo;
886 	ocargs.oc_cm_handler = rds_cm_handler;
887 	ocargs.oc_cm_clnt_private = NULL;
888 	ocargs.oc_rdma_ra_out = 4;
889 	ocargs.oc_rdma_ra_in = 4;
890 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
891 	ocargs.oc_priv_data = &cmp;
892 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
893 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
894 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
895 	    mode, &ocargs, &ocrets);
896 	if (ret != IBT_SUCCESS) {
897 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
898 		    "failed: %d", sp, ep, ret);
899 		(void) ibt_flush_channel(hdl);
900 		(void) ibt_free_channel(hdl);
901 
902 		mutex_enter(&ep->ep_lock);
903 		/* don't cleanup if this failure is due to peer-peer race */
904 		if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
905 			/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
906 			ep->ep_state = RDS_EP_STATE_ERROR;
907 			rds_ep_free_rc_channel(ep);
908 		}
909 		mutex_exit(&ep->ep_lock);
910 
911 		return (-1);
912 	}
913 
914 	*chanhdl = hdl;
915 
916 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
917 	    *chanhdl);
918 
919 	return (0);
920 }
921 
922 int
923 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
924 {
925 	int	ret;
926 
927 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
928 	    chanhdl, mode);
929 
930 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
931 
932 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
933 
934 	return (ret);
935 }
936