xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision 5763ba1e357fad1d57b5875c499307b7ea6e2cd4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 
82 /*
83  * This file contains CM related work:
84  *
85  * Service registration/deregistration
86  * Path lookup
87  * CM connection callbacks
88  * CM active and passive connection establishment
89  * Connection failover
90  */
91 
/*
 * Shorthand for the IPv4 source and destination address members of an
 * ibt_ip_cm_info_t; used below as ipcm_info.SRCIP and ipcm_info.DSTIP.
 */
92 #define	SRCIP	src_addr.un.ip4addr
93 #define	DSTIP	dst_addr.un.ip4addr
94 
95 /*
96  * Handle an incoming CM REQ
97  */
98 /* ARGSUSED */
99 static ibt_cm_status_t
100 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
101     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
102 {
103 	ibt_cm_req_rcv_t	*reqp;
104 	ib_gid_t		lgid, rgid;
105 	rds_cm_private_data_t	cmp;
106 	rds_session_t		*sp;
107 	rds_ep_t		*ep;
108 	ibt_channel_hdl_t	chanhdl;
109 	ibt_ip_cm_info_t	ipcm_info;
110 	int			ret;
111 
112 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
113 
114 	reqp = &evp->cm_event.req;
115 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
116 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
117 
118 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
119 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
120 
121 	/* validate service id */
122 	if (reqp->req_service_id == RDS_SERVICE_ID) {
123 		RDS_DPRINTF0(LABEL, "Version Mismatch: Remote system "
124 		    "(GUID: 0x%llx) is running an older version of RDS",
125 		    rgid.gid_guid);
126 		return (IBT_CM_REJECT);
127 	}
128 
129 	/*
130 	 * CM private data brings IP information
131 	 * Private data received is a stream of bytes and may not be properly
132 	 * aligned. So, bcopy the data onto the stack before accessing it.
133 	 */
134 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
135 	    sizeof (rds_cm_private_data_t));
136 
137 	/* extract the CM IP info */
138 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
139 	    &ipcm_info);
140 	if (ret != IBT_SUCCESS) {
141 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
142 		    ret);
143 		return (IBT_CM_REJECT);
144 	}
145 
146 	RDS_DPRINTF2("rds_handle_cm_req",
147 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
148 	    ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
149 
150 	if (cmp.cmp_version != RDS_VERSION) {
151 		RDS_DPRINTF0(LABEL, "Version Mismatch: Local version: %d "
152 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
153 		return (IBT_CM_REJECT);
154 	}
155 
156 	/* RDS supports V4 addresses only */
157 	if ((ipcm_info.src_addr.family != AF_INET) ||
158 	    (ipcm_info.dst_addr.family != AF_INET)) {
159 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
160 		    "src: %d dst: %d", ipcm_info.src_addr.family,
161 		    ipcm_info.dst_addr.family);
162 		return (IBT_CM_REJECT);
163 	}
164 
165 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
166 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
167 		    cmp.cmp_arch, RDS_THIS_ARCH);
168 		return (IBT_CM_REJECT);
169 	}
170 
171 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
172 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
173 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
174 		return (IBT_CM_REJECT);
175 	}
176 
177 	/* user_buffer_size should be same on all nodes */
178 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
179 		RDS_DPRINTF2(LABEL,
180 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
181 		    UserBufferSize, cmp.cmp_user_buffer_size);
182 		return (IBT_CM_REJECT);
183 	}
184 
185 	/*
186 	 * RDS needs more time to process a failover REQ so send an MRA.
187 	 * Otherwise, the remote may retry the REQ and fail the connection.
188 	 */
189 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
190 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
191 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
192 		    10000000 /* 10 sec */, NULL, 0);
193 	}
194 
195 	/* Is there a session to the destination node? */
196 	rw_enter(&statep->rds_sessionlock, RW_READER);
197 	sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
198 	rw_exit(&statep->rds_sessionlock);
199 
200 	if (sp == NULL) {
201 		/*
202 		 * currently there is no session to the destination
203 		 * remote ip in the private data is the local ip and vice
204 		 * versa
205 		 */
206 		sp = rds_session_create(statep, ipcm_info.DSTIP,
207 		    ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
208 		if (sp == NULL) {
209 			/* Check the list anyway. */
210 			rw_enter(&statep->rds_sessionlock, RW_READER);
211 			sp = rds_session_lkup(statep, ipcm_info.SRCIP,
212 			    rgid.gid_guid);
213 			rw_exit(&statep->rds_sessionlock);
214 			if (sp == NULL) {
215 				/*
216 				 * The only way this can fail is due to lack
217 				 * of kernel resources
218 				 */
219 				return (IBT_CM_REJECT);
220 			}
221 		}
222 	}
223 
224 	rw_enter(&sp->session_lock, RW_WRITER);
225 
226 	/* catch peer-to-peer case as soon as possible */
227 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
228 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
229 		/* Check possible peer-to-peer case here */
230 		if (sp->session_type != RDS_SESSION_PASSIVE) {
231 			RDS_DPRINTF2("rds_handle_cm_req",
232 			    "SP(%p) Peer-peer connection handling", sp);
233 			if (lgid.gid_guid > rgid.gid_guid) {
234 				/* this node is active so reject this request */
235 				rw_exit(&sp->session_lock);
236 				return (IBT_CM_REJECT);
237 			} else {
238 				/* this node is passive, change the session */
239 				sp->session_type = RDS_SESSION_PASSIVE;
240 				sp->session_lgid = lgid;
241 				sp->session_rgid = rgid;
242 			}
243 		}
244 	}
245 
246 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
247 
248 	switch (sp->session_state) {
249 	case RDS_SESSION_STATE_CONNECTED:
250 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
251 		sp->session_state = RDS_SESSION_STATE_ERROR;
252 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
253 		    "RDS_SESSION_STATE_ERROR", sp);
254 
255 		/* FALLTHRU */
256 	case RDS_SESSION_STATE_ERROR:
257 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
258 		sp->session_type = RDS_SESSION_PASSIVE;
259 		rw_exit(&sp->session_lock);
260 
261 		rds_session_close(sp, IBT_NOCALLBACKS, 1);
262 
263 		/* move the session to init state */
264 		rw_enter(&sp->session_lock, RW_WRITER);
265 		ret = rds_session_reinit(sp, lgid);
266 		sp->session_myip = ipcm_info.DSTIP;
267 		sp->session_lgid = lgid;
268 		sp->session_rgid = rgid;
269 		if (ret != 0) {
270 			rds_session_fini(sp);
271 			sp->session_state = RDS_SESSION_STATE_FAILED;
272 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
273 			    "RDS_SESSION_STATE_FAILED", sp);
274 			rw_exit(&sp->session_lock);
275 			return (IBT_CM_REJECT);
276 		} else {
277 			sp->session_state = RDS_SESSION_STATE_INIT;
278 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
279 			    "RDS_SESSION_STATE_INIT", sp);
280 		}
281 
282 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
283 			ep = &sp->session_ctrlep;
284 		} else {
285 			ep = &sp->session_dataep;
286 		}
287 		break;
288 	case RDS_SESSION_STATE_CREATED:
289 	case RDS_SESSION_STATE_FAILED:
290 	case RDS_SESSION_STATE_FINI:
291 		/*
292 		 * Initialize both channels, we accept this connection
293 		 * only if both channels are initialized
294 		 */
295 		sp->session_type = RDS_SESSION_PASSIVE;
296 		sp->session_lgid = lgid;
297 		sp->session_rgid = rgid;
298 		sp->session_state = RDS_SESSION_STATE_CREATED;
299 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
300 		    "RDS_SESSION_STATE_CREATED", sp);
301 		ret = rds_session_init(sp);
302 		if (ret != 0) {
303 			/* Seems like there are not enough resources */
304 			sp->session_state = RDS_SESSION_STATE_FAILED;
305 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
306 			    "RDS_SESSION_STATE_FAILED", sp);
307 			rw_exit(&sp->session_lock);
308 			return (IBT_CM_REJECT);
309 		}
310 		sp->session_state = RDS_SESSION_STATE_INIT;
311 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
312 		    "RDS_SESSION_STATE_INIT", sp);
313 
314 		/* FALLTHRU */
315 	case RDS_SESSION_STATE_INIT:
316 		/*
317 		 * When re-using an existing session, make sure the
318 		 * session is still through the same HCA. Otherwise, the
319 		 * memory registrations have to moved to the new HCA.
320 		 */
321 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
322 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
323 				RDS_DPRINTF2("rds_handle_cm_req",
324 				    "Existing Session but different gid "
325 				    "existing: 0x%llx, new: 0x%llx, "
326 				    "sending an MRA",
327 				    sp->session_lgid.gid_guid, lgid.gid_guid);
328 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
329 				    evp->cm_session_id, 10000000 /* 10 sec */,
330 				    NULL, 0);
331 				ret = rds_session_reinit(sp, lgid);
332 				if (ret != 0) {
333 					rds_session_fini(sp);
334 					sp->session_state =
335 					    RDS_SESSION_STATE_FAILED;
336 					sp->session_failover = 0;
337 					RDS_DPRINTF3("rds_failover_session",
338 					    "SP(%p) State "
339 					    "RDS_SESSION_STATE_FAILED", sp);
340 					rw_exit(&sp->session_lock);
341 					return (IBT_CM_REJECT);
342 				}
343 			}
344 			ep = &sp->session_dataep;
345 		} else {
346 			ep = &sp->session_ctrlep;
347 		}
348 
349 		break;
350 	default:
351 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
352 		    "state: %d", sp, sp->session_state);
353 		rw_exit(&sp->session_lock);
354 		return (IBT_CM_REJECT);
355 	}
356 
357 	sp->session_failover = 0; /* reset any previous value */
358 	if (cmp.cmp_failover) {
359 		RDS_DPRINTF2("rds_handle_cm_req",
360 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
361 		sp->session_failover = 1;
362 	}
363 
364 	mutex_enter(&ep->ep_lock);
365 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
366 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
367 		sp->session_type = RDS_SESSION_PASSIVE;
368 		rw_exit(&sp->session_lock);
369 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
370 		rw_exit(&sp->session_lock);
371 		/*
372 		 * Peer to peer connection. There is an active
373 		 * connection pending on this ep. The one with
374 		 * greater port guid becomes active and the
375 		 * other becomes passive.
376 		 */
377 		RDS_DPRINTF2("rds_handle_cm_req",
378 		    "EP(%p) Peer-peer connection handling", ep);
379 		if (lgid.gid_guid > rgid.gid_guid) {
380 			/* this node is active so reject this request */
381 			mutex_exit(&ep->ep_lock);
382 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
383 			    "Rejecting passive in favor of active", sp, ep);
384 			return (IBT_CM_REJECT);
385 		} else {
386 			/*
387 			 * This session is not the active end, change it
388 			 * to passive end.
389 			 */
390 			ASSERT(sp->session_type == RDS_SESSION_ACTIVE);
391 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
392 
393 			rw_enter(&sp->session_lock, RW_WRITER);
394 			sp->session_type = RDS_SESSION_PASSIVE;
395 			sp->session_lgid = lgid;
396 			sp->session_rgid = rgid;
397 			rw_exit(&sp->session_lock);
398 		}
399 	} else {
400 		rw_exit(&sp->session_lock);
401 	}
402 
403 	ep->ep_lbufid = cmp.cmp_last_bufid;
404 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
405 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
406 	cmp.cmp_last_bufid = ep->ep_rbufid;
407 	cmp.cmp_ack_addr = ep->ep_ack_addr;
408 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
409 	mutex_exit(&ep->ep_lock);
410 
411 	/* continue with accepting the connection request for this channel */
412 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
413 	if (chanhdl == NULL) {
414 		mutex_enter(&ep->ep_lock);
415 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
416 		mutex_exit(&ep->ep_lock);
417 		return (IBT_CM_REJECT);
418 	}
419 
420 	/* pre-post recv buffers in the RQ */
421 	rds_post_recv_buf((void *)chanhdl);
422 
423 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
424 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
425 	rargsp->cm_ret.rep.cm_channel = chanhdl;
426 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
427 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
428 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
429 
430 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
431 	    sp, ep, chanhdl);
432 
433 	return (IBT_CM_ACCEPT);
434 }
435 
436 /*
437  * Handle an incoming CM REP
438  * Pre-post recv buffers for the QP
439  */
440 /* ARGSUSED */
441 static ibt_cm_status_t
442 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
443     void *rcmp, ibt_priv_data_len_t rcmp_len)
444 {
445 	rds_ep_t	*ep;
446 	rds_cm_private_data_t	cmp;
447 
448 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
449 
450 	/* pre-post recv buffers in the RQ */
451 	rds_post_recv_buf((void *)evp->cm_channel);
452 
453 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
454 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
455 	    sizeof (rds_cm_private_data_t));
456 	ep->ep_lbufid = cmp.cmp_last_bufid;
457 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
458 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
459 
460 	rargsp->cm_ret_len = 0;
461 
462 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
463 
464 	return (IBT_CM_ACCEPT);
465 }
466 
467 /*
468  * Handle CONN EST
469  */
470 static ibt_cm_status_t
471 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
472 {
473 	rds_session_t	*sp;
474 	rds_ep_t	*ep;
475 
476 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
477 
478 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
479 	    ep->ep_state);
480 
481 	mutex_enter(&ep->ep_lock);
482 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
483 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
484 	ep->ep_state = RDS_EP_STATE_CONNECTED;
485 	ep->ep_chanhdl = evp->cm_channel;
486 	sp = ep->ep_sp;
487 	mutex_exit(&ep->ep_lock);
488 
489 	(void) rds_session_active(sp);
490 
491 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
492 	return (IBT_CM_ACCEPT);
493 }
494 
495 /*
496  * Handle CONN CLOSED
497  */
498 static ibt_cm_status_t
499 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
500 {
501 	rds_ep_t	*ep;
502 	rds_session_t	*sp;
503 
504 	/* Catch DREQs but ignore DREPs */
505 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
506 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
507 		    "Ignoring Event: %d received", evp->cm_event.closed);
508 		return (IBT_CM_ACCEPT);
509 	}
510 
511 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
512 	sp = ep->ep_sp;
513 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Enter", ep);
514 
515 	mutex_enter(&ep->ep_lock);
516 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
517 		/* Ignore this DREQ */
518 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
519 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
520 		mutex_exit(&ep->ep_lock);
521 		return (IBT_CM_ACCEPT);
522 	}
523 	ep->ep_state = RDS_EP_STATE_CLOSING;
524 	mutex_exit(&ep->ep_lock);
525 
526 	rw_enter(&sp->session_lock, RW_WRITER);
527 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
528 	    sp->session_state);
529 
530 	switch (sp->session_state) {
531 	case RDS_SESSION_STATE_CONNECTED:
532 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
533 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
534 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
535 		break;
536 
537 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
538 		sp->session_state = RDS_SESSION_STATE_CLOSED;
539 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
540 		    "RDS_SESSION_STATE_CLOSED", sp);
541 		rds_passive_session_fini(sp);
542 		sp->session_state = RDS_SESSION_STATE_FINI;
543 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
544 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
545 		break;
546 
547 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
548 	case RDS_SESSION_STATE_ERROR:
549 	case RDS_SESSION_STATE_CLOSED:
550 		break;
551 
552 	case RDS_SESSION_STATE_INIT:
553 		sp->session_state = RDS_SESSION_STATE_ERROR;
554 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
555 		    "RDS_SESSION_STATE_ERROR", sp);
556 		rds_passive_session_fini(sp);
557 		sp->session_state = RDS_SESSION_STATE_FAILED;
558 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
559 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
560 		break;
561 
562 	default:
563 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
564 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
565 		rds_passive_session_fini(sp);
566 		sp->session_state = RDS_SESSION_STATE_FAILED;
567 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
568 		    "RDS_SESSION_STATE_FAILED", sp);
569 	}
570 	rw_exit(&sp->session_lock);
571 
572 	mutex_enter(&ep->ep_lock);
573 	ep->ep_state = RDS_EP_STATE_CLOSED;
574 	mutex_exit(&ep->ep_lock);
575 
576 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
577 	return (IBT_CM_ACCEPT);
578 }
579 
580 /*
581  * Handle EVENT FAILURE
582  */
583 static ibt_cm_status_t
584 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
585 {
586 	rds_ep_t	*ep;
587 	rds_session_t	*sp;
588 	int		ret;
589 
590 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
591 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
592 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
593 	    evp->cm_event.failed.cf_reason);
594 
595 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
596 		RDS_DPRINTF0(LABEL,
597 		    "Received REJ with reason IBT_CM_INVALID_SID: "
598 		    "The remote system could be running an older RDS version");
599 	}
600 
601 	if (evp->cm_channel == NULL) {
602 		return (IBT_CM_ACCEPT);
603 	}
604 
605 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
606 	sp = ep->ep_sp;
607 
608 	mutex_enter(&ep->ep_lock);
609 	ep->ep_state = RDS_EP_STATE_ERROR;
610 	mutex_exit(&ep->ep_lock);
611 
612 	rw_enter(&sp->session_lock, RW_WRITER);
613 	if (sp->session_type == RDS_SESSION_PASSIVE) {
614 		RDS_DPRINTF2("rds_handle_cm_event_failure",
615 		    "SP(%p) - state: %d", sp, sp->session_state);
616 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
617 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
618 			sp->session_state = RDS_SESSION_STATE_ERROR;
619 			RDS_DPRINTF3("rds_handle_cm_event_failure",
620 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
621 
622 			/*
623 			 * Store the cm_channel for freeing later
624 			 * Active side frees it on ibt_open_rc_channel
625 			 * failure
626 			 */
627 			if (ep->ep_chanhdl == NULL) {
628 				ep->ep_chanhdl = evp->cm_channel;
629 			}
630 			rw_exit(&sp->session_lock);
631 
632 			/*
633 			 * rds_passive_session_fini should not be called
634 			 * directly in the CM handler. It will cause a deadlock.
635 			 */
636 			ret = ddi_taskq_dispatch(rds_taskq,
637 			    rds_cleanup_passive_session, (void *)sp,
638 			    DDI_NOSLEEP);
639 			if (ret != DDI_SUCCESS) {
640 				RDS_DPRINTF1("rds_handle_cm_event_failure",
641 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
642 			}
643 			return (IBT_CM_ACCEPT);
644 		}
645 	}
646 	rw_exit(&sp->session_lock);
647 
648 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
649 	return (IBT_CM_ACCEPT);
650 }
651 
652 /*
653  * CM Handler
654  *
655  * Called by IBCM
656  * The cm_private type differs for active and passive events.
657  */
658 ibt_cm_status_t
659 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
660     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
661     ibt_priv_data_len_t ret_len_max)
662 {
663 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
664 
665 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
666 
667 	switch (eventp->cm_type) {
668 	case IBT_CM_EVENT_REQ_RCV:
669 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
670 		    ret_args, ret_priv_data, ret_len_max);
671 		break;
672 	case IBT_CM_EVENT_REP_RCV:
673 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
674 		    ret_len_max);
675 		break;
676 	case IBT_CM_EVENT_MRA_RCV:
677 		/* Not supported */
678 		break;
679 	case IBT_CM_EVENT_CONN_EST:
680 		ret = rds_handle_cm_conn_est(eventp);
681 		break;
682 	case IBT_CM_EVENT_CONN_CLOSED:
683 		ret = rds_handle_cm_conn_closed(eventp);
684 		break;
685 	case IBT_CM_EVENT_FAILURE:
686 		ret = rds_handle_cm_event_failure(eventp);
687 		break;
688 	case IBT_CM_EVENT_LAP_RCV:
689 		/* Not supported */
690 		RDS_DPRINTF2(LABEL, "LAP message received");
691 		break;
692 	case IBT_CM_EVENT_APR_RCV:
693 		/* Not supported */
694 		RDS_DPRINTF2(LABEL, "APR message received");
695 		break;
696 	default:
697 		break;
698 	}
699 
700 	RDS_DPRINTF2("rds_cm_handler", "Return");
701 
702 	return (ret);
703 }
704 
/*
 * Port number passed to ibt_get_ip_sid() below to derive the service id on
 * which RDS accepts connection requests.
 */
705 /* This is based on OFED Linux RDS */
706 #define	RDS_PORT_NUM	6556
707 
708 /*
709  * Register the wellknown service with service id: RDS_SERVICE_ID
710  * Incoming connection requests should arrive on this service id.
711  */
712 ibt_srv_hdl_t
713 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
714 {
715 	ibt_srv_hdl_t	srvhdl;
716 	ibt_srv_desc_t	srvdesc;
717 	int		ret;
718 
719 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
720 
721 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
722 	srvdesc.sd_handler = rds_cm_handler;
723 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
724 
725 	/*
726 	 * Register the old service id for backward compatibility
727 	 * REQs received on this service id would be rejected
728 	 */
729 	ret = ibt_register_service(rds_ibhdl, &srvdesc, RDS_SERVICE_ID,
730 	    1, &rdsib_statep->rds_old_srvhdl, NULL);
731 	if (ret != IBT_SUCCESS) {
732 		RDS_DPRINTF2(LABEL,
733 		    "RDS Service (0x%llx) Registration Failed: %d",
734 		    RDS_SERVICE_ID, ret);
735 		return (NULL);
736 	}
737 
738 	/*
739 	 * This is the new service id as per:
740 	 * Annex A11: RDMA IP CM Service
741 	 */
742 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
743 	    RDS_PORT_NUM);
744 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
745 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
746 	if (ret != IBT_SUCCESS) {
747 		RDS_DPRINTF2(LABEL,
748 		    "RDS Service (0x%llx) Registration Failed: %d",
749 		    rdsib_statep->rds_service_id, ret);
750 		return (NULL);
751 	}
752 
753 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
754 	return (srvhdl);
755 }
756 
757 /* Bind the RDS service on all ports */
758 int
759 rds_bind_service(rds_state_t *statep)
760 {
761 	rds_hca_t	*hcap;
762 	ib_gid_t	gid;
763 	uint_t		jx, nbinds = 0, nports = 0;
764 	int		ret;
765 
766 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
767 
768 	hcap = statep->rds_hcalistp;
769 	while (hcap != NULL) {
770 		for (jx = 0; jx < hcap->hca_nports; jx++) {
771 			nports++;
772 			if (hcap->hca_pinfop[jx].p_linkstate !=
773 			    IBT_PORT_ACTIVE) {
774 				/*
775 				 * service bind will be called in the async
776 				 * handler when the port comes up
777 				 */
778 				continue;
779 			}
780 
781 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
782 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
783 			    "gid: %llx:%llx", hcap->hca_guid,
784 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
785 			    gid.gid_guid);
786 
787 			/* pass statep as cm_private */
788 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
789 			    NULL, statep, NULL);
790 			if (ret != IBT_SUCCESS) {
791 				RDS_DPRINTF2(LABEL, "Bind service for "
792 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
793 				    "failed: %d", hcap->hca_guid,
794 				    hcap->hca_pinfop[jx].p_port_num,
795 				    gid.gid_prefix, gid.gid_guid, ret);
796 				continue;
797 			}
798 
799 			nbinds++;
800 
801 			/* bind the old service, ignore if it fails */
802 			ret = ibt_bind_service(statep->rds_old_srvhdl, gid,
803 			    NULL, statep, NULL);
804 			if (ret != IBT_SUCCESS) {
805 				RDS_DPRINTF2(LABEL, "Bind service for "
806 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
807 				    "failed: %d", hcap->hca_guid,
808 				    hcap->hca_pinfop[jx].p_port_num,
809 				    gid.gid_prefix, gid.gid_guid, ret);
810 			}
811 		}
812 		hcap = hcap->hca_nextp;
813 	}
814 
815 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
816 	    nbinds, nports);
817 
818 #if 0
819 	if (nbinds == 0) {
820 		return (-1);
821 	}
822 #endif
823 
824 	RDS_DPRINTF2("rds_bind_service", "Return");
825 
826 	return (0);
827 }
828 
829 /* Open an RC connection */
830 int
831 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
832     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
833 {
834 	rds_session_t		*sp;
835 	ibt_chan_open_args_t	ocargs;
836 	ibt_rc_returns_t	ocrets;
837 	rds_cm_private_data_t	cmp;
838 	uint8_t			hca_port;
839 	ibt_channel_hdl_t	hdl;
840 	ibt_status_t		ret = 0;
841 	ibt_ip_cm_info_t	ipcm_info;
842 
843 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
844 
845 	sp = ep->ep_sp;
846 
847 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
848 	ipcm_info.src_addr.family = AF_INET;
849 	ipcm_info.SRCIP = sp->session_myip;
850 	ipcm_info.dst_addr.family = AF_INET;
851 	ipcm_info.DSTIP = sp->session_remip;
852 	ipcm_info.src_port = 6556; /* based on OFED RDS */
853 	ret = ibt_format_ip_private_data(&ipcm_info,
854 	    sizeof (rds_cm_private_data_t), &cmp);
855 	if (ret != IBT_SUCCESS) {
856 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
857 		    "failed: %d", sp, ep, ret);
858 		return (-1);
859 	}
860 
861 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
862 
863 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
864 	if (hdl == NULL) {
865 		return (-1);
866 	}
867 
868 	cmp.cmp_version = RDS_VERSION;
869 	cmp.cmp_arch = RDS_THIS_ARCH;
870 	cmp.cmp_eptype = ep->ep_type;
871 	cmp.cmp_failover = sp->session_failover;
872 	cmp.cmp_last_bufid = ep->ep_rbufid;
873 	cmp.cmp_user_buffer_size = UserBufferSize;
874 	cmp.cmp_ack_addr = ep->ep_ack_addr;
875 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
876 
877 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
878 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
879 	ocargs.oc_path = pinfo;
880 	ocargs.oc_cm_handler = rds_cm_handler;
881 	ocargs.oc_cm_clnt_private = NULL;
882 	ocargs.oc_rdma_ra_out = 4;
883 	ocargs.oc_rdma_ra_in = 4;
884 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
885 	ocargs.oc_priv_data = &cmp;
886 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
887 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
888 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
889 	    mode, &ocargs, &ocrets);
890 	if (ret != IBT_SUCCESS) {
891 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
892 		    "failed: %d", sp, ep, ret);
893 		(void) ibt_flush_channel(hdl);
894 		(void) ibt_free_channel(hdl);
895 
896 		/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
897 		mutex_enter(&ep->ep_lock);
898 		rds_ep_free_rc_channel(ep);
899 		mutex_exit(&ep->ep_lock);
900 
901 		return (-1);
902 	}
903 
904 	*chanhdl = hdl;
905 
906 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
907 	    *chanhdl);
908 
909 	return (0);
910 }
911 
912 int
913 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
914 {
915 	int	ret;
916 
917 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
918 	    chanhdl, mode);
919 
920 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
921 
922 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
923 
924 	return (ret);
925 }
926