xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision d99cb22f7f0de8584336bda08cb86c562ffbab55)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 
82 /*
83  * This file contains CM related work:
84  *
85  * Service registration/deregistration
86  * Path lookup
87  * CM connection callbacks
88  * CM active and passive connection establishment
89  * Connection failover
90  */
91 
/*
 * Shorthand for the IPv4 source and destination address members of an
 * ibt_ip_cm_info_t; used as ipcm_info.SRCIP and ipcm_info.DSTIP below.
 */
#define	SRCIP	src_addr.un.ip4addr
#define	DSTIP	dst_addr.un.ip4addr
94 
95 /*
96  * Handle an incoming CM REQ
97  */
98 /* ARGSUSED */
99 static ibt_cm_status_t
100 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
101     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
102 {
103 	ibt_cm_req_rcv_t	*reqp;
104 	ib_gid_t		lgid, rgid;
105 	rds_cm_private_data_t	cmp;
106 	rds_session_t		*sp;
107 	rds_ep_t		*ep;
108 	ibt_channel_hdl_t	chanhdl;
109 	ibt_ip_cm_info_t	ipcm_info;
110 	int			ret;
111 
112 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
113 
114 	reqp = &evp->cm_event.req;
115 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
116 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
117 
118 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
119 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
120 
121 	/* validate service id */
122 	if (reqp->req_service_id == RDS_SERVICE_ID) {
123 		RDS_DPRINTF0(LABEL, "Version Mismatch: Remote system "
124 		    "(GUID: 0x%llx) is running an older version of RDS",
125 		    rgid.gid_guid);
126 		return (IBT_CM_REJECT);
127 	}
128 
129 	/*
130 	 * CM private data brings IP information
131 	 * Private data received is a stream of bytes and may not be properly
132 	 * aligned. So, bcopy the data onto the stack before accessing it.
133 	 */
134 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
135 	    sizeof (rds_cm_private_data_t));
136 
137 	/* extract the CM IP info */
138 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
139 	    &ipcm_info);
140 	if (ret != IBT_SUCCESS) {
141 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
142 		    ret);
143 		return (IBT_CM_REJECT);
144 	}
145 
146 	RDS_DPRINTF2("rds_handle_cm_req",
147 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
148 	    ntohl(ipcm_info.SRCIP), ntohl(ipcm_info.DSTIP), cmp.cmp_eptype);
149 
150 	if (cmp.cmp_version != RDS_VERSION) {
151 		RDS_DPRINTF0(LABEL, "Version Mismatch: Local version: %d "
152 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
153 		return (IBT_CM_REJECT);
154 	}
155 
156 	/* RDS supports V4 addresses only */
157 	if ((ipcm_info.src_addr.family != AF_INET) ||
158 	    (ipcm_info.dst_addr.family != AF_INET)) {
159 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
160 		    "src: %d dst: %d", ipcm_info.src_addr.family,
161 		    ipcm_info.dst_addr.family);
162 		return (IBT_CM_REJECT);
163 	}
164 
165 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
166 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
167 		    cmp.cmp_arch, RDS_THIS_ARCH);
168 		return (IBT_CM_REJECT);
169 	}
170 
171 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
172 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
173 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
174 		return (IBT_CM_REJECT);
175 	}
176 
177 	/* user_buffer_size should be same on all nodes */
178 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
179 		RDS_DPRINTF2(LABEL,
180 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
181 		    UserBufferSize, cmp.cmp_user_buffer_size);
182 		return (IBT_CM_REJECT);
183 	}
184 
185 	/*
186 	 * RDS needs more time to process a failover REQ so send an MRA.
187 	 * Otherwise, the remote may retry the REQ and fail the connection.
188 	 */
189 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
190 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
191 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
192 		    10000000 /* 10 sec */, NULL, 0);
193 	}
194 
195 	/* Is there a session to the destination node? */
196 	rw_enter(&statep->rds_sessionlock, RW_READER);
197 	sp = rds_session_lkup(statep, ntohl(ipcm_info.SRCIP), rgid.gid_guid);
198 	rw_exit(&statep->rds_sessionlock);
199 
200 	if (sp == NULL) {
201 		/*
202 		 * currently there is no session to the destination
203 		 * remote ip in the private data is the local ip and vice
204 		 * versa
205 		 */
206 		sp = rds_session_create(statep, ntohl(ipcm_info.DSTIP),
207 		    ntohl(ipcm_info.SRCIP), reqp, RDS_SESSION_PASSIVE);
208 		if (sp == NULL) {
209 			/* Check the list anyway. */
210 			rw_enter(&statep->rds_sessionlock, RW_READER);
211 			sp = rds_session_lkup(statep, ntohl(ipcm_info.SRCIP),
212 			    rgid.gid_guid);
213 			rw_exit(&statep->rds_sessionlock);
214 			if (sp == NULL) {
215 				/*
216 				 * The only way this can fail is due to lack
217 				 * of kernel resources
218 				 */
219 				return (IBT_CM_REJECT);
220 			}
221 		}
222 	}
223 
224 	rw_enter(&sp->session_lock, RW_WRITER);
225 
226 	/* catch peer-to-peer case as soon as possible */
227 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
228 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
229 		/* Check possible peer-to-peer case here */
230 		if (sp->session_type != RDS_SESSION_PASSIVE) {
231 			RDS_DPRINTF2("rds_handle_cm_req",
232 			    "SP(%p) Peer-peer connection handling", sp);
233 			if (lgid.gid_guid > rgid.gid_guid) {
234 				/* this node is active so reject this request */
235 				rw_exit(&sp->session_lock);
236 				return (IBT_CM_REJECT);
237 			} else {
238 				/* this node is passive, change the session */
239 				sp->session_type = RDS_SESSION_PASSIVE;
240 				sp->session_lgid = lgid;
241 				sp->session_rgid = rgid;
242 			}
243 		}
244 	}
245 
246 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
247 
248 	switch (sp->session_state) {
249 	case RDS_SESSION_STATE_CONNECTED:
250 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
251 		sp->session_state = RDS_SESSION_STATE_ERROR;
252 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
253 		    "RDS_SESSION_STATE_ERROR", sp);
254 
255 		/* FALLTHRU */
256 	case RDS_SESSION_STATE_ERROR:
257 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
258 		sp->session_type = RDS_SESSION_PASSIVE;
259 		rw_exit(&sp->session_lock);
260 
261 		/* Handling this will take some time, so send an MRA */
262 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
263 		    10000000 /* 10 sec */, NULL, 0);
264 
265 		/*
266 		 * Any pending completions don't get flushed until the channel
267 		 * is closed. So, passing 0 here will not wait for pending
268 		 * completions in rds_session_close before closing the channel
269 		 */
270 		rds_session_close(sp, IBT_NOCALLBACKS, 0);
271 
272 		/* move the session to init state */
273 		rw_enter(&sp->session_lock, RW_WRITER);
274 		ret = rds_session_reinit(sp, lgid);
275 		sp->session_myip = ntohl(ipcm_info.DSTIP);
276 		sp->session_lgid = lgid;
277 		sp->session_rgid = rgid;
278 		if (ret != 0) {
279 			rds_session_fini(sp);
280 			sp->session_state = RDS_SESSION_STATE_FAILED;
281 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
282 			    "RDS_SESSION_STATE_FAILED", sp);
283 			rw_exit(&sp->session_lock);
284 			return (IBT_CM_REJECT);
285 		} else {
286 			sp->session_state = RDS_SESSION_STATE_INIT;
287 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
288 			    "RDS_SESSION_STATE_INIT", sp);
289 		}
290 
291 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
292 			ep = &sp->session_ctrlep;
293 		} else {
294 			ep = &sp->session_dataep;
295 		}
296 		break;
297 	case RDS_SESSION_STATE_CREATED:
298 	case RDS_SESSION_STATE_FAILED:
299 	case RDS_SESSION_STATE_FINI:
300 		/*
301 		 * Initialize both channels, we accept this connection
302 		 * only if both channels are initialized
303 		 */
304 		sp->session_type = RDS_SESSION_PASSIVE;
305 		sp->session_lgid = lgid;
306 		sp->session_rgid = rgid;
307 		sp->session_state = RDS_SESSION_STATE_CREATED;
308 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
309 		    "RDS_SESSION_STATE_CREATED", sp);
310 		ret = rds_session_init(sp);
311 		if (ret != 0) {
312 			/* Seems like there are not enough resources */
313 			sp->session_state = RDS_SESSION_STATE_FAILED;
314 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
315 			    "RDS_SESSION_STATE_FAILED", sp);
316 			rw_exit(&sp->session_lock);
317 			return (IBT_CM_REJECT);
318 		}
319 		sp->session_state = RDS_SESSION_STATE_INIT;
320 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
321 		    "RDS_SESSION_STATE_INIT", sp);
322 
323 		/* FALLTHRU */
324 	case RDS_SESSION_STATE_INIT:
325 		/*
326 		 * When re-using an existing session, make sure the
327 		 * session is still through the same HCA. Otherwise, the
328 		 * memory registrations have to moved to the new HCA.
329 		 */
330 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
331 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
332 				RDS_DPRINTF2("rds_handle_cm_req",
333 				    "Existing Session but different gid "
334 				    "existing: 0x%llx, new: 0x%llx, "
335 				    "sending an MRA",
336 				    sp->session_lgid.gid_guid, lgid.gid_guid);
337 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
338 				    evp->cm_session_id, 10000000 /* 10 sec */,
339 				    NULL, 0);
340 				ret = rds_session_reinit(sp, lgid);
341 				if (ret != 0) {
342 					rds_session_fini(sp);
343 					sp->session_state =
344 					    RDS_SESSION_STATE_FAILED;
345 					sp->session_failover = 0;
346 					RDS_DPRINTF3("rds_failover_session",
347 					    "SP(%p) State "
348 					    "RDS_SESSION_STATE_FAILED", sp);
349 					rw_exit(&sp->session_lock);
350 					return (IBT_CM_REJECT);
351 				}
352 			}
353 			ep = &sp->session_dataep;
354 		} else {
355 			ep = &sp->session_ctrlep;
356 		}
357 
358 		break;
359 	default:
360 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
361 		    "state: %d", sp, sp->session_state);
362 		rw_exit(&sp->session_lock);
363 		return (IBT_CM_REJECT);
364 	}
365 
366 	sp->session_failover = 0; /* reset any previous value */
367 	if (cmp.cmp_failover) {
368 		RDS_DPRINTF2("rds_handle_cm_req",
369 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
370 		sp->session_failover = 1;
371 	}
372 
373 	mutex_enter(&ep->ep_lock);
374 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
375 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
376 		sp->session_type = RDS_SESSION_PASSIVE;
377 		rw_exit(&sp->session_lock);
378 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
379 		rw_exit(&sp->session_lock);
380 		/*
381 		 * Peer to peer connection. There is an active
382 		 * connection pending on this ep. The one with
383 		 * greater port guid becomes active and the
384 		 * other becomes passive.
385 		 */
386 		RDS_DPRINTF2("rds_handle_cm_req",
387 		    "EP(%p) Peer-peer connection handling", ep);
388 		if (lgid.gid_guid > rgid.gid_guid) {
389 			/* this node is active so reject this request */
390 			mutex_exit(&ep->ep_lock);
391 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
392 			    "Rejecting passive in favor of active", sp, ep);
393 			return (IBT_CM_REJECT);
394 		} else {
395 			/*
396 			 * This session is not the active end, change it
397 			 * to passive end.
398 			 */
399 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
400 
401 			rw_enter(&sp->session_lock, RW_WRITER);
402 			sp->session_type = RDS_SESSION_PASSIVE;
403 			sp->session_lgid = lgid;
404 			sp->session_rgid = rgid;
405 			rw_exit(&sp->session_lock);
406 		}
407 	} else {
408 		rw_exit(&sp->session_lock);
409 	}
410 
411 	ep->ep_lbufid = cmp.cmp_last_bufid;
412 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
413 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
414 	cmp.cmp_last_bufid = ep->ep_rbufid;
415 	cmp.cmp_ack_addr = ep->ep_ack_addr;
416 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
417 	mutex_exit(&ep->ep_lock);
418 
419 	/* continue with accepting the connection request for this channel */
420 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
421 	if (chanhdl == NULL) {
422 		mutex_enter(&ep->ep_lock);
423 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
424 		mutex_exit(&ep->ep_lock);
425 		return (IBT_CM_REJECT);
426 	}
427 
428 	/* pre-post recv buffers in the RQ */
429 	rds_post_recv_buf((void *)chanhdl);
430 
431 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
432 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
433 	rargsp->cm_ret.rep.cm_channel = chanhdl;
434 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
435 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
436 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
437 
438 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
439 	    sp, ep, chanhdl);
440 
441 	return (IBT_CM_ACCEPT);
442 }
443 
444 /*
445  * Handle an incoming CM REP
446  * Pre-post recv buffers for the QP
447  */
448 /* ARGSUSED */
449 static ibt_cm_status_t
450 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
451     void *rcmp, ibt_priv_data_len_t rcmp_len)
452 {
453 	rds_ep_t	*ep;
454 	rds_cm_private_data_t	cmp;
455 
456 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
457 
458 	/* pre-post recv buffers in the RQ */
459 	rds_post_recv_buf((void *)evp->cm_channel);
460 
461 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
462 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
463 	    sizeof (rds_cm_private_data_t));
464 	ep->ep_lbufid = cmp.cmp_last_bufid;
465 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
466 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
467 
468 	rargsp->cm_ret_len = 0;
469 
470 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
471 
472 	return (IBT_CM_ACCEPT);
473 }
474 
475 /*
476  * Handle CONN EST
477  */
478 static ibt_cm_status_t
479 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
480 {
481 	rds_session_t	*sp;
482 	rds_ep_t	*ep;
483 
484 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
485 
486 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
487 	    ep->ep_state);
488 
489 	mutex_enter(&ep->ep_lock);
490 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
491 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
492 	ep->ep_state = RDS_EP_STATE_CONNECTED;
493 	ep->ep_chanhdl = evp->cm_channel;
494 	sp = ep->ep_sp;
495 	mutex_exit(&ep->ep_lock);
496 
497 	(void) rds_session_active(sp);
498 
499 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
500 	return (IBT_CM_ACCEPT);
501 }
502 
503 /*
504  * Handle CONN CLOSED
505  */
506 static ibt_cm_status_t
507 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
508 {
509 	rds_ep_t	*ep;
510 	rds_session_t	*sp;
511 
512 	/* Catch DREQs but ignore DREPs */
513 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
514 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
515 		    "Ignoring Event: %d received", evp->cm_event.closed);
516 		return (IBT_CM_ACCEPT);
517 	}
518 
519 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
520 	sp = ep->ep_sp;
521 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
522 	    ep, evp->cm_channel);
523 
524 	mutex_enter(&ep->ep_lock);
525 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
526 		/* Ignore this DREQ */
527 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
528 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
529 		mutex_exit(&ep->ep_lock);
530 		return (IBT_CM_ACCEPT);
531 	}
532 	ep->ep_state = RDS_EP_STATE_CLOSING;
533 	mutex_exit(&ep->ep_lock);
534 
535 	rw_enter(&sp->session_lock, RW_WRITER);
536 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
537 	    sp->session_state);
538 
539 	switch (sp->session_state) {
540 	case RDS_SESSION_STATE_CONNECTED:
541 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
542 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
543 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
544 		break;
545 
546 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
547 		sp->session_state = RDS_SESSION_STATE_CLOSED;
548 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
549 		    "RDS_SESSION_STATE_CLOSED", sp);
550 		rds_passive_session_fini(sp);
551 		sp->session_state = RDS_SESSION_STATE_FINI;
552 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
553 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
554 		break;
555 
556 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
557 	case RDS_SESSION_STATE_ERROR:
558 	case RDS_SESSION_STATE_CLOSED:
559 		break;
560 
561 	case RDS_SESSION_STATE_INIT:
562 		sp->session_state = RDS_SESSION_STATE_ERROR;
563 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
564 		    "RDS_SESSION_STATE_ERROR", sp);
565 		rds_passive_session_fini(sp);
566 		sp->session_state = RDS_SESSION_STATE_FAILED;
567 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
568 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
569 		break;
570 
571 	default:
572 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
573 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
574 		rds_passive_session_fini(sp);
575 		sp->session_state = RDS_SESSION_STATE_FAILED;
576 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
577 		    "RDS_SESSION_STATE_FAILED", sp);
578 	}
579 	rw_exit(&sp->session_lock);
580 
581 	mutex_enter(&ep->ep_lock);
582 	ep->ep_state = RDS_EP_STATE_CLOSED;
583 	mutex_exit(&ep->ep_lock);
584 
585 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
586 	return (IBT_CM_ACCEPT);
587 }
588 
589 /*
590  * Handle EVENT FAILURE
591  */
592 static ibt_cm_status_t
593 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
594 {
595 	rds_ep_t	*ep;
596 	rds_session_t	*sp;
597 	int		ret;
598 
599 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
600 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
601 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
602 	    evp->cm_event.failed.cf_reason);
603 
604 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
605 		RDS_DPRINTF0(LABEL,
606 		    "Received REJ with reason IBT_CM_INVALID_SID: "
607 		    "The remote system could be running an older RDS version");
608 	}
609 
610 	if (evp->cm_channel == NULL) {
611 		return (IBT_CM_ACCEPT);
612 	}
613 
614 	if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
615 	    (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
616 		/*
617 		 * This end is active, just ignore, ibt_open_rc_channel()
618 		 * caller will take care of cleanup.
619 		 */
620 		RDS_DPRINTF2("rds_handle_cm_event_failure",
621 		    "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
622 		return (IBT_CM_ACCEPT);
623 	}
624 
625 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
626 	sp = ep->ep_sp;
627 
628 	rw_enter(&sp->session_lock, RW_WRITER);
629 	if (sp->session_type == RDS_SESSION_PASSIVE) {
630 		RDS_DPRINTF2("rds_handle_cm_event_failure",
631 		    "SP(%p) - state: %d", sp, sp->session_state);
632 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
633 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
634 			sp->session_state = RDS_SESSION_STATE_ERROR;
635 			RDS_DPRINTF3("rds_handle_cm_event_failure",
636 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
637 
638 			/*
639 			 * Store the cm_channel for freeing later
640 			 * Active side frees it on ibt_open_rc_channel
641 			 * failure
642 			 */
643 			if (ep->ep_chanhdl == NULL) {
644 				ep->ep_chanhdl = evp->cm_channel;
645 			}
646 			rw_exit(&sp->session_lock);
647 
648 			/*
649 			 * rds_passive_session_fini should not be called
650 			 * directly in the CM handler. It will cause a deadlock.
651 			 */
652 			ret = ddi_taskq_dispatch(rds_taskq,
653 			    rds_cleanup_passive_session, (void *)sp,
654 			    DDI_NOSLEEP);
655 			if (ret != DDI_SUCCESS) {
656 				RDS_DPRINTF1("rds_handle_cm_event_failure",
657 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
658 			}
659 			return (IBT_CM_ACCEPT);
660 		}
661 	}
662 	rw_exit(&sp->session_lock);
663 
664 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
665 	return (IBT_CM_ACCEPT);
666 }
667 
668 /*
669  * CM Handler
670  *
671  * Called by IBCM
672  * The cm_private type differs for active and passive events.
673  */
674 ibt_cm_status_t
675 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
676     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
677     ibt_priv_data_len_t ret_len_max)
678 {
679 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
680 
681 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
682 
683 	switch (eventp->cm_type) {
684 	case IBT_CM_EVENT_REQ_RCV:
685 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
686 		    ret_args, ret_priv_data, ret_len_max);
687 		break;
688 	case IBT_CM_EVENT_REP_RCV:
689 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
690 		    ret_len_max);
691 		break;
692 	case IBT_CM_EVENT_MRA_RCV:
693 		/* Not supported */
694 		break;
695 	case IBT_CM_EVENT_CONN_EST:
696 		ret = rds_handle_cm_conn_est(eventp);
697 		break;
698 	case IBT_CM_EVENT_CONN_CLOSED:
699 		ret = rds_handle_cm_conn_closed(eventp);
700 		break;
701 	case IBT_CM_EVENT_FAILURE:
702 		ret = rds_handle_cm_event_failure(eventp);
703 		break;
704 	case IBT_CM_EVENT_LAP_RCV:
705 		/* Not supported */
706 		RDS_DPRINTF2(LABEL, "LAP message received");
707 		break;
708 	case IBT_CM_EVENT_APR_RCV:
709 		/* Not supported */
710 		RDS_DPRINTF2(LABEL, "APR message received");
711 		break;
712 	default:
713 		break;
714 	}
715 
716 	RDS_DPRINTF2("rds_cm_handler", "Return");
717 
718 	return (ret);
719 }
720 
/*
 * Port number used for the RDS service. This is based on OFED Linux RDS.
 * It is passed to ibt_get_ip_sid() to derive the service id and is sent as
 * the source port in the CM private data.
 */
#define	RDS_PORT_NUM	6556
723 
724 /*
725  * Register the wellknown service with service id: RDS_SERVICE_ID
726  * Incoming connection requests should arrive on this service id.
727  */
728 ibt_srv_hdl_t
729 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
730 {
731 	ibt_srv_hdl_t	srvhdl;
732 	ibt_srv_desc_t	srvdesc;
733 	int		ret;
734 
735 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
736 
737 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
738 	srvdesc.sd_handler = rds_cm_handler;
739 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
740 
741 	/*
742 	 * Register the old service id for backward compatibility
743 	 * REQs received on this service id would be rejected
744 	 */
745 	ret = ibt_register_service(rds_ibhdl, &srvdesc, RDS_SERVICE_ID,
746 	    1, &rdsib_statep->rds_old_srvhdl, NULL);
747 	if (ret != IBT_SUCCESS) {
748 		RDS_DPRINTF2(LABEL,
749 		    "RDS Service (0x%llx) Registration Failed: %d",
750 		    RDS_SERVICE_ID, ret);
751 		return (NULL);
752 	}
753 
754 	/*
755 	 * This is the new service id as per:
756 	 * Annex A11: RDMA IP CM Service
757 	 */
758 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
759 	    RDS_PORT_NUM);
760 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
761 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
762 	if (ret != IBT_SUCCESS) {
763 		RDS_DPRINTF2(LABEL,
764 		    "RDS Service (0x%llx) Registration Failed: %d",
765 		    rdsib_statep->rds_service_id, ret);
766 		return (NULL);
767 	}
768 
769 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
770 	return (srvhdl);
771 }
772 
773 /* Bind the RDS service on all ports */
774 int
775 rds_bind_service(rds_state_t *statep)
776 {
777 	rds_hca_t	*hcap;
778 	ib_gid_t	gid;
779 	uint_t		jx, nbinds = 0, nports = 0;
780 	int		ret;
781 
782 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
783 
784 	hcap = statep->rds_hcalistp;
785 	while (hcap != NULL) {
786 		for (jx = 0; jx < hcap->hca_nports; jx++) {
787 			nports++;
788 			if (hcap->hca_pinfop[jx].p_linkstate !=
789 			    IBT_PORT_ACTIVE) {
790 				/*
791 				 * service bind will be called in the async
792 				 * handler when the port comes up
793 				 */
794 				continue;
795 			}
796 
797 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
798 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
799 			    "gid: %llx:%llx", hcap->hca_guid,
800 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
801 			    gid.gid_guid);
802 
803 			/* pass statep as cm_private */
804 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
805 			    NULL, statep, NULL);
806 			if (ret != IBT_SUCCESS) {
807 				RDS_DPRINTF2(LABEL, "Bind service for "
808 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
809 				    "failed: %d", hcap->hca_guid,
810 				    hcap->hca_pinfop[jx].p_port_num,
811 				    gid.gid_prefix, gid.gid_guid, ret);
812 				continue;
813 			}
814 
815 			nbinds++;
816 
817 			/* bind the old service, ignore if it fails */
818 			ret = ibt_bind_service(statep->rds_old_srvhdl, gid,
819 			    NULL, statep, NULL);
820 			if (ret != IBT_SUCCESS) {
821 				RDS_DPRINTF2(LABEL, "Bind service for "
822 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
823 				    "failed: %d", hcap->hca_guid,
824 				    hcap->hca_pinfop[jx].p_port_num,
825 				    gid.gid_prefix, gid.gid_guid, ret);
826 			}
827 		}
828 		hcap = hcap->hca_nextp;
829 	}
830 
831 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
832 	    nbinds, nports);
833 
834 #if 0
835 	if (nbinds == 0) {
836 		return (-1);
837 	}
838 #endif
839 
840 	RDS_DPRINTF2("rds_bind_service", "Return");
841 
842 	return (0);
843 }
844 
845 /* Open an RC connection */
846 int
847 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
848     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
849 {
850 	rds_session_t		*sp;
851 	ibt_chan_open_args_t	ocargs;
852 	ibt_rc_returns_t	ocrets;
853 	rds_cm_private_data_t	cmp;
854 	uint8_t			hca_port;
855 	ibt_channel_hdl_t	hdl;
856 	ibt_status_t		ret = 0;
857 	ibt_ip_cm_info_t	ipcm_info;
858 
859 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
860 
861 	sp = ep->ep_sp;
862 
863 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
864 	ipcm_info.src_addr.family = AF_INET;
865 	ipcm_info.SRCIP = htonl(sp->session_myip);
866 	ipcm_info.dst_addr.family = AF_INET;
867 	ipcm_info.DSTIP = htonl(sp->session_remip);
868 	ipcm_info.src_port = htons(RDS_PORT_NUM);
869 	ret = ibt_format_ip_private_data(&ipcm_info,
870 	    sizeof (rds_cm_private_data_t), &cmp);
871 	if (ret != IBT_SUCCESS) {
872 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
873 		    "failed: %d", sp, ep, ret);
874 		return (-1);
875 	}
876 
877 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
878 
879 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
880 	if (hdl == NULL) {
881 		return (-1);
882 	}
883 
884 	cmp.cmp_version = RDS_VERSION;
885 	cmp.cmp_arch = RDS_THIS_ARCH;
886 	cmp.cmp_eptype = ep->ep_type;
887 	cmp.cmp_failover = sp->session_failover;
888 	cmp.cmp_last_bufid = ep->ep_rbufid;
889 	cmp.cmp_user_buffer_size = UserBufferSize;
890 	cmp.cmp_ack_addr = ep->ep_ack_addr;
891 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
892 
893 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
894 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
895 	ocargs.oc_path = pinfo;
896 	ocargs.oc_cm_handler = rds_cm_handler;
897 	ocargs.oc_cm_clnt_private = NULL;
898 	ocargs.oc_rdma_ra_out = 4;
899 	ocargs.oc_rdma_ra_in = 4;
900 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
901 	ocargs.oc_priv_data = &cmp;
902 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
903 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
904 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
905 	    mode, &ocargs, &ocrets);
906 	if (ret != IBT_SUCCESS) {
907 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
908 		    "failed: %d", sp, ep, ret);
909 		(void) ibt_flush_channel(hdl);
910 		(void) ibt_free_channel(hdl);
911 
912 		mutex_enter(&ep->ep_lock);
913 		/* don't cleanup if this failure is due to peer-peer race */
914 		if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
915 			/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
916 			ep->ep_state = RDS_EP_STATE_ERROR;
917 			rds_ep_free_rc_channel(ep);
918 		}
919 		mutex_exit(&ep->ep_lock);
920 
921 		return (-1);
922 	}
923 
924 	*chanhdl = hdl;
925 
926 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
927 	    *chanhdl);
928 
929 	return (0);
930 }
931 
932 int
933 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
934 {
935 	int	ret;
936 
937 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
938 	    chanhdl, mode);
939 
940 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
941 
942 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
943 
944 	return (ret);
945 }
946