xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision 015f8fff605f2fbd5fd0072e555576297804d57b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 
82 /*
83  * This file contains CM related work:
84  *
85  * Service registration/deregistration
86  * Path lookup
87  * CM connection callbacks
88  * CM active and passive connection establishment
89  * Connection failover
90  */
91 
92 /*
93  * Handle an incoming CM REQ
94  */
95 /* ARGSUSED */
96 static ibt_cm_status_t
97 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
98     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
99 {
100 	ibt_cm_req_rcv_t	*reqp;
101 	ib_gid_t		lgid, rgid;
102 	rds_cm_private_data_t	cmp;
103 	rds_session_t		*sp;
104 	rds_ep_t		*ep;
105 	ibt_channel_hdl_t	chanhdl;
106 	int			ret;
107 
108 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
109 
110 	reqp = &evp->cm_event.req;
111 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
112 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
113 
114 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
115 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
116 
117 	/* validate service id */
118 	if (reqp->req_service_id == RDS_SERVICE_ID) {
119 		RDS_DPRINTF0(LABEL, "Version Mismatch: Remote system "
120 		    "(GUID: 0x%llx) is running an older version of RDS",
121 		    rgid.gid_guid);
122 		return (IBT_CM_REJECT);
123 	}
124 
125 	/*
126 	 * CM private data brings IP information
127 	 * Private data received is a stream of bytes and may not be properly
128 	 * aligned. So, bcopy the data onto the stack before accessing it.
129 	 */
130 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
131 	    sizeof (rds_cm_private_data_t));
132 
133 	RDS_DPRINTF2(LABEL, "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
134 	    cmp.cmp_localip, cmp.cmp_remip, cmp.cmp_eptype);
135 
136 	if (cmp.cmp_version != RDS_VERSION) {
137 		RDS_DPRINTF0(LABEL, "Version Mismatch: Local version: %d "
138 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
139 		return (IBT_CM_REJECT);
140 	}
141 
142 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
143 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
144 		    cmp.cmp_arch, RDS_THIS_ARCH);
145 		return (IBT_CM_REJECT);
146 	}
147 
148 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
149 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
150 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
151 		return (IBT_CM_REJECT);
152 	}
153 
154 	/* user_buffer_size should be same on all nodes */
155 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
156 		RDS_DPRINTF2(LABEL,
157 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
158 		    UserBufferSize, cmp.cmp_user_buffer_size);
159 		return (IBT_CM_REJECT);
160 	}
161 
162 	/*
163 	 * RDS needs more time to process a failover REQ so send an MRA.
164 	 * Otherwise, the remote may retry the REQ and fail the connection.
165 	 */
166 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
167 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
168 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
169 		    10000000 /* 10 sec */, NULL, 0);
170 	}
171 
172 	/* Is there a session to the destination node? */
173 	rw_enter(&statep->rds_sessionlock, RW_READER);
174 	sp = rds_session_lkup(statep, cmp.cmp_localip, rgid.gid_guid);
175 	rw_exit(&statep->rds_sessionlock);
176 
177 	if (sp == NULL) {
178 		/*
179 		 * currently there is no session to the destination
180 		 * remote ip in the private data is the local ip and vice
181 		 * versa
182 		 */
183 		sp = rds_session_create(statep, cmp.cmp_remip, cmp.cmp_localip,
184 		    reqp, RDS_SESSION_PASSIVE);
185 		if (sp == NULL) {
186 			/* Check the list anyway. */
187 			rw_enter(&statep->rds_sessionlock, RW_READER);
188 			sp = rds_session_lkup(statep, cmp.cmp_localip,
189 			    rgid.gid_guid);
190 			rw_exit(&statep->rds_sessionlock);
191 			if (sp == NULL) {
192 				/*
193 				 * The only way this can fail is due to lack
194 				 * of kernel resources
195 				 */
196 				return (IBT_CM_REJECT);
197 			}
198 		}
199 	}
200 
201 	rw_enter(&sp->session_lock, RW_WRITER);
202 
203 	/* catch peer-to-peer case as soon as possible */
204 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
205 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
206 		/* Check possible peer-to-peer case here */
207 		if (sp->session_type != RDS_SESSION_PASSIVE) {
208 			RDS_DPRINTF2("rds_handle_cm_req",
209 			    "SP(%p) Peer-peer connection handling", sp);
210 			if (lgid.gid_guid > rgid.gid_guid) {
211 				/* this node is active so reject this request */
212 				rw_exit(&sp->session_lock);
213 				return (IBT_CM_REJECT);
214 			} else {
215 				/* this node is passive, change the session */
216 				sp->session_type = RDS_SESSION_PASSIVE;
217 				sp->session_lgid = lgid;
218 				sp->session_rgid = rgid;
219 			}
220 		}
221 	}
222 
223 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
224 
225 	switch (sp->session_state) {
226 	case RDS_SESSION_STATE_CONNECTED:
227 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
228 		sp->session_state = RDS_SESSION_STATE_ERROR;
229 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
230 		    "RDS_SESSION_STATE_ERROR", sp);
231 
232 		/* FALLTHRU */
233 	case RDS_SESSION_STATE_ERROR:
234 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
235 		sp->session_type = RDS_SESSION_PASSIVE;
236 		rw_exit(&sp->session_lock);
237 
238 		rds_session_close(sp, IBT_NOCALLBACKS, 1);
239 
240 		/* move the session to init state */
241 		rw_enter(&sp->session_lock, RW_WRITER);
242 		ret = rds_session_reinit(sp, lgid);
243 		sp->session_myip = cmp.cmp_remip;
244 		sp->session_lgid = lgid;
245 		sp->session_rgid = rgid;
246 		if (ret != 0) {
247 			rds_session_fini(sp);
248 			sp->session_state = RDS_SESSION_STATE_FAILED;
249 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
250 			    "RDS_SESSION_STATE_FAILED", sp);
251 			rw_exit(&sp->session_lock);
252 			return (IBT_CM_REJECT);
253 		} else {
254 			sp->session_state = RDS_SESSION_STATE_INIT;
255 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
256 			    "RDS_SESSION_STATE_INIT", sp);
257 		}
258 
259 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
260 			ep = &sp->session_ctrlep;
261 		} else {
262 			ep = &sp->session_dataep;
263 		}
264 		break;
265 	case RDS_SESSION_STATE_CREATED:
266 	case RDS_SESSION_STATE_FAILED:
267 	case RDS_SESSION_STATE_FINI:
268 		/*
269 		 * Initialize both channels, we accept this connection
270 		 * only if both channels are initialized
271 		 */
272 		sp->session_type = RDS_SESSION_PASSIVE;
273 		sp->session_lgid = lgid;
274 		sp->session_rgid = rgid;
275 		sp->session_state = RDS_SESSION_STATE_CREATED;
276 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
277 		    "RDS_SESSION_STATE_CREATED", sp);
278 		ret = rds_session_init(sp);
279 		if (ret != 0) {
280 			/* Seems like there are not enough resources */
281 			sp->session_state = RDS_SESSION_STATE_FAILED;
282 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
283 			    "RDS_SESSION_STATE_FAILED", sp);
284 			rw_exit(&sp->session_lock);
285 			return (IBT_CM_REJECT);
286 		}
287 		sp->session_state = RDS_SESSION_STATE_INIT;
288 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
289 		    "RDS_SESSION_STATE_INIT", sp);
290 
291 		/* FALLTHRU */
292 	case RDS_SESSION_STATE_INIT:
293 		/*
294 		 * When re-using an existing session, make sure the
295 		 * session is still through the same HCA. Otherwise, the
296 		 * memory registrations have to moved to the new HCA.
297 		 */
298 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
299 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
300 				RDS_DPRINTF2("rds_handle_cm_req",
301 				    "Existing Session but different gid "
302 				    "existing: 0x%llx, new: 0x%llx, "
303 				    "sending an MRA",
304 				    sp->session_lgid.gid_guid, lgid.gid_guid);
305 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
306 				    evp->cm_session_id, 10000000 /* 10 sec */,
307 				    NULL, 0);
308 				ret = rds_session_reinit(sp, lgid);
309 				if (ret != 0) {
310 					rds_session_fini(sp);
311 					sp->session_state =
312 					    RDS_SESSION_STATE_FAILED;
313 					sp->session_failover = 0;
314 					RDS_DPRINTF3("rds_failover_session",
315 					    "SP(%p) State "
316 					    "RDS_SESSION_STATE_FAILED", sp);
317 					rw_exit(&sp->session_lock);
318 					return (IBT_CM_REJECT);
319 				}
320 			}
321 			ep = &sp->session_dataep;
322 		} else {
323 			ep = &sp->session_ctrlep;
324 		}
325 
326 		break;
327 	default:
328 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
329 		    "state: %d", sp, sp->session_state);
330 		rw_exit(&sp->session_lock);
331 		return (IBT_CM_REJECT);
332 	}
333 
334 	sp->session_failover = 0; /* reset any previous value */
335 	if (cmp.cmp_failover) {
336 		RDS_DPRINTF2("rds_handle_cm_req",
337 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
338 		sp->session_failover = 1;
339 	}
340 
341 	mutex_enter(&ep->ep_lock);
342 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
343 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
344 		sp->session_type = RDS_SESSION_PASSIVE;
345 		rw_exit(&sp->session_lock);
346 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
347 		rw_exit(&sp->session_lock);
348 		/*
349 		 * Peer to peer connection. There is an active
350 		 * connection pending on this ep. The one with
351 		 * greater port guid becomes active and the
352 		 * other becomes passive.
353 		 */
354 		RDS_DPRINTF2("rds_handle_cm_req",
355 		    "EP(%p) Peer-peer connection handling", ep);
356 		if (lgid.gid_guid > rgid.gid_guid) {
357 			/* this node is active so reject this request */
358 			mutex_exit(&ep->ep_lock);
359 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
360 			    "Rejecting passive in favor of active", sp, ep);
361 			return (IBT_CM_REJECT);
362 		} else {
363 			/*
364 			 * This session is not the active end, change it
365 			 * to passive end.
366 			 */
367 			ASSERT(sp->session_type == RDS_SESSION_ACTIVE);
368 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
369 
370 			rw_enter(&sp->session_lock, RW_WRITER);
371 			sp->session_type = RDS_SESSION_PASSIVE;
372 			sp->session_lgid = lgid;
373 			sp->session_rgid = rgid;
374 			rw_exit(&sp->session_lock);
375 		}
376 	} else {
377 		rw_exit(&sp->session_lock);
378 	}
379 
380 	ep->ep_lbufid = cmp.cmp_last_bufid;
381 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
382 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
383 	cmp.cmp_last_bufid = ep->ep_rbufid;
384 	cmp.cmp_ack_addr = ep->ep_ack_addr;
385 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
386 	mutex_exit(&ep->ep_lock);
387 
388 	/* continue with accepting the connection request for this channel */
389 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
390 	if (chanhdl == NULL) {
391 		mutex_enter(&ep->ep_lock);
392 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
393 		mutex_exit(&ep->ep_lock);
394 		return (IBT_CM_REJECT);
395 	}
396 
397 	/* pre-post recv buffers in the RQ */
398 	rds_post_recv_buf((void *)chanhdl);
399 
400 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
401 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
402 	rargsp->cm_ret.rep.cm_channel = chanhdl;
403 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
404 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
405 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
406 
407 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
408 	    sp, ep, chanhdl);
409 
410 	return (IBT_CM_ACCEPT);
411 }
412 
413 /*
414  * Handle an incoming CM REP
415  * Pre-post recv buffers for the QP
416  */
417 /* ARGSUSED */
418 static ibt_cm_status_t
419 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
420     void *rcmp, ibt_priv_data_len_t rcmp_len)
421 {
422 	rds_ep_t	*ep;
423 	rds_cm_private_data_t	cmp;
424 
425 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
426 
427 	/* pre-post recv buffers in the RQ */
428 	rds_post_recv_buf((void *)evp->cm_channel);
429 
430 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
431 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
432 	    sizeof (rds_cm_private_data_t));
433 	ep->ep_lbufid = cmp.cmp_last_bufid;
434 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
435 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
436 
437 	rargsp->cm_ret_len = 0;
438 
439 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
440 
441 	return (IBT_CM_ACCEPT);
442 }
443 
444 /*
445  * Handle CONN EST
446  */
447 static ibt_cm_status_t
448 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
449 {
450 	rds_session_t	*sp;
451 	rds_ep_t	*ep;
452 
453 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
454 
455 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
456 	    ep->ep_state);
457 
458 	mutex_enter(&ep->ep_lock);
459 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
460 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
461 	ep->ep_state = RDS_EP_STATE_CONNECTED;
462 	ep->ep_chanhdl = evp->cm_channel;
463 	sp = ep->ep_sp;
464 	mutex_exit(&ep->ep_lock);
465 
466 	(void) rds_session_active(sp);
467 
468 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
469 	return (IBT_CM_ACCEPT);
470 }
471 
472 /*
473  * Handle CONN CLOSED
474  */
475 static ibt_cm_status_t
476 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
477 {
478 	rds_ep_t	*ep;
479 	rds_session_t	*sp;
480 
481 	/* Catch DREQs but ignore DREPs */
482 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
483 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
484 		    "Ignoring Event: %d received", evp->cm_event.closed);
485 		return (IBT_CM_ACCEPT);
486 	}
487 
488 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
489 	sp = ep->ep_sp;
490 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Enter", ep);
491 
492 	mutex_enter(&ep->ep_lock);
493 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
494 		/* Ignore this DREQ */
495 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
496 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
497 		mutex_exit(&ep->ep_lock);
498 		return (IBT_CM_ACCEPT);
499 	}
500 	ep->ep_state = RDS_EP_STATE_CLOSING;
501 	mutex_exit(&ep->ep_lock);
502 
503 	rw_enter(&sp->session_lock, RW_WRITER);
504 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
505 	    sp->session_state);
506 
507 	switch (sp->session_state) {
508 	case RDS_SESSION_STATE_CONNECTED:
509 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
510 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
511 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
512 		break;
513 
514 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
515 		sp->session_state = RDS_SESSION_STATE_CLOSED;
516 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
517 		    "RDS_SESSION_STATE_CLOSED", sp);
518 		rds_passive_session_fini(sp);
519 		sp->session_state = RDS_SESSION_STATE_FINI;
520 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
521 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
522 		break;
523 
524 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
525 	case RDS_SESSION_STATE_ERROR:
526 	case RDS_SESSION_STATE_CLOSED:
527 		break;
528 
529 	case RDS_SESSION_STATE_INIT:
530 		sp->session_state = RDS_SESSION_STATE_ERROR;
531 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
532 		    "RDS_SESSION_STATE_ERROR", sp);
533 		rds_passive_session_fini(sp);
534 		sp->session_state = RDS_SESSION_STATE_FAILED;
535 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
536 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
537 		break;
538 
539 	default:
540 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
541 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
542 		rds_passive_session_fini(sp);
543 		sp->session_state = RDS_SESSION_STATE_FAILED;
544 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
545 		    "RDS_SESSION_STATE_FAILED", sp);
546 	}
547 	rw_exit(&sp->session_lock);
548 
549 	mutex_enter(&ep->ep_lock);
550 	ep->ep_state = RDS_EP_STATE_CLOSED;
551 	mutex_exit(&ep->ep_lock);
552 
553 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
554 	return (IBT_CM_ACCEPT);
555 }
556 
557 /*
558  * Handle EVENT FAILURE
559  */
560 static ibt_cm_status_t
561 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
562 {
563 	rds_ep_t	*ep;
564 	rds_session_t	*sp;
565 	int		ret;
566 
567 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
568 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
569 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
570 	    evp->cm_event.failed.cf_reason);
571 
572 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
573 		RDS_DPRINTF0(LABEL,
574 		    "Received REJ with reason IBT_CM_INVALID_SID: "
575 		    "The remote system could be running an older RDS version");
576 	}
577 
578 	if (evp->cm_channel == NULL) {
579 		return (IBT_CM_ACCEPT);
580 	}
581 
582 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
583 	sp = ep->ep_sp;
584 
585 	mutex_enter(&ep->ep_lock);
586 	ep->ep_state = RDS_EP_STATE_ERROR;
587 	mutex_exit(&ep->ep_lock);
588 
589 	rw_enter(&sp->session_lock, RW_WRITER);
590 	if (sp->session_type == RDS_SESSION_PASSIVE) {
591 		RDS_DPRINTF2("rds_handle_cm_event_failure",
592 		    "SP(%p) - state: %d", sp, sp->session_state);
593 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
594 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
595 			sp->session_state = RDS_SESSION_STATE_ERROR;
596 			RDS_DPRINTF3("rds_handle_cm_event_failure",
597 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
598 
599 			/*
600 			 * Store the cm_channel for freeing later
601 			 * Active side frees it on ibt_open_rc_channel
602 			 * failure
603 			 */
604 			if (ep->ep_chanhdl == NULL) {
605 				ep->ep_chanhdl = evp->cm_channel;
606 			}
607 			rw_exit(&sp->session_lock);
608 
609 			/*
610 			 * rds_passive_session_fini should not be called
611 			 * directly in the CM handler. It will cause a deadlock.
612 			 */
613 			ret = ddi_taskq_dispatch(rds_taskq,
614 			    rds_cleanup_passive_session, (void *)sp,
615 			    DDI_NOSLEEP);
616 			if (ret != DDI_SUCCESS) {
617 				RDS_DPRINTF1("rds_handle_cm_event_failure",
618 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
619 			}
620 			return (IBT_CM_ACCEPT);
621 		}
622 	}
623 	rw_exit(&sp->session_lock);
624 
625 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
626 	return (IBT_CM_ACCEPT);
627 }
628 
629 /*
630  * CM Handler
631  *
632  * Called by IBCM
633  * The cm_private type differs for active and passive events.
634  */
635 ibt_cm_status_t
636 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
637     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
638     ibt_priv_data_len_t ret_len_max)
639 {
640 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
641 
642 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
643 
644 	switch (eventp->cm_type) {
645 	case IBT_CM_EVENT_REQ_RCV:
646 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
647 		    ret_args, ret_priv_data, ret_len_max);
648 		break;
649 	case IBT_CM_EVENT_REP_RCV:
650 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
651 		    ret_len_max);
652 		break;
653 	case IBT_CM_EVENT_MRA_RCV:
654 		/* Not supported */
655 		break;
656 	case IBT_CM_EVENT_CONN_EST:
657 		ret = rds_handle_cm_conn_est(eventp);
658 		break;
659 	case IBT_CM_EVENT_CONN_CLOSED:
660 		ret = rds_handle_cm_conn_closed(eventp);
661 		break;
662 	case IBT_CM_EVENT_FAILURE:
663 		ret = rds_handle_cm_event_failure(eventp);
664 		break;
665 	case IBT_CM_EVENT_LAP_RCV:
666 		/* Not supported */
667 		RDS_DPRINTF2(LABEL, "LAP message received");
668 		break;
669 	case IBT_CM_EVENT_APR_RCV:
670 		/* Not supported */
671 		RDS_DPRINTF2(LABEL, "APR message received");
672 		break;
673 	default:
674 		break;
675 	}
676 
677 	RDS_DPRINTF2("rds_cm_handler", "Return");
678 
679 	return (ret);
680 }
681 
/*
 * Port number used to build the RDS IP CM service id (see
 * rds_register_service()). The value is the one used by OFED Linux RDS.
 */
683 #define	RDS_PORT_NUM	6556
684 
685 /*
686  * Register the wellknown service with service id: RDS_SERVICE_ID
687  * Incoming connection requests should arrive on this service id.
688  */
689 ibt_srv_hdl_t
690 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
691 {
692 	ibt_srv_hdl_t	srvhdl;
693 	ibt_srv_desc_t	srvdesc;
694 	int		ret;
695 
696 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
697 
698 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
699 	srvdesc.sd_handler = rds_cm_handler;
700 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
701 
702 	/*
703 	 * Register the old service id for backward compatibility
704 	 * REQs received on this service id would be rejected
705 	 */
706 	ret = ibt_register_service(rds_ibhdl, &srvdesc, RDS_SERVICE_ID,
707 	    1, &rdsib_statep->rds_old_srvhdl, NULL);
708 	if (ret != IBT_SUCCESS) {
709 		RDS_DPRINTF2(LABEL,
710 		    "RDS Service (0x%llx) Registration Failed: %d",
711 		    RDS_SERVICE_ID, ret);
712 		return (NULL);
713 	}
714 
715 	/*
716 	 * This is the new service id as per:
717 	 * Annex A11: RDMA IP CM Service
718 	 */
719 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
720 	    RDS_PORT_NUM);
721 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
722 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
723 	if (ret != IBT_SUCCESS) {
724 		RDS_DPRINTF2(LABEL,
725 		    "RDS Service (0x%llx) Registration Failed: %d",
726 		    rdsib_statep->rds_service_id, ret);
727 		return (NULL);
728 	}
729 
730 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
731 	return (srvhdl);
732 }
733 
734 /* Bind the RDS service on all ports */
735 int
736 rds_bind_service(rds_state_t *statep)
737 {
738 	rds_hca_t	*hcap;
739 	ib_gid_t	gid;
740 	uint_t		jx, nbinds = 0, nports = 0;
741 	int		ret;
742 
743 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
744 
745 	hcap = statep->rds_hcalistp;
746 	while (hcap != NULL) {
747 		for (jx = 0; jx < hcap->hca_nports; jx++) {
748 			nports++;
749 			if (hcap->hca_pinfop[jx].p_linkstate !=
750 			    IBT_PORT_ACTIVE) {
751 				/*
752 				 * service bind will be called in the async
753 				 * handler when the port comes up
754 				 */
755 				continue;
756 			}
757 
758 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
759 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
760 			    "gid: %llx:%llx", hcap->hca_guid,
761 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
762 			    gid.gid_guid);
763 
764 			/* pass statep as cm_private */
765 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
766 			    NULL, statep, NULL);
767 			if (ret != IBT_SUCCESS) {
768 				RDS_DPRINTF2(LABEL, "Bind service for "
769 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
770 				    "failed: %d", hcap->hca_guid,
771 				    hcap->hca_pinfop[jx].p_port_num,
772 				    gid.gid_prefix, gid.gid_guid, ret);
773 				continue;
774 			}
775 
776 			nbinds++;
777 
778 			/* bind the old service, ignore if it fails */
779 			ret = ibt_bind_service(statep->rds_old_srvhdl, gid,
780 			    NULL, statep, NULL);
781 			if (ret != IBT_SUCCESS) {
782 				RDS_DPRINTF2(LABEL, "Bind service for "
783 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
784 				    "failed: %d", hcap->hca_guid,
785 				    hcap->hca_pinfop[jx].p_port_num,
786 				    gid.gid_prefix, gid.gid_guid, ret);
787 			}
788 		}
789 		hcap = hcap->hca_nextp;
790 	}
791 
792 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
793 	    nbinds, nports);
794 
795 #if 0
796 	if (nbinds == 0) {
797 		return (-1);
798 	}
799 #endif
800 
801 	RDS_DPRINTF2("rds_bind_service", "Return");
802 
803 	return (0);
804 }
805 
806 /* Open an RC connection */
807 int
808 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
809     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
810 {
811 	rds_session_t		*sp;
812 	ibt_chan_open_args_t	ocargs;
813 	ibt_rc_returns_t	ocrets;
814 	rds_cm_private_data_t	cmp;
815 	uint8_t			hca_port;
816 	ibt_channel_hdl_t	hdl;
817 	ibt_status_t		ret = 0;
818 	ibt_ip_cm_info_t	ipcm_info;
819 
820 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
821 
822 	sp = ep->ep_sp;
823 
824 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
825 	ipcm_info.src_addr.family = AF_INET;
826 	ipcm_info.src_addr.un.ip4addr = sp->session_myip;
827 	ipcm_info.dst_addr.family = AF_INET;
828 	ipcm_info.dst_addr.un.ip4addr = sp->session_remip;
829 	ipcm_info.src_port = 6556; /* based on OFED RDS */
830 	ret = ibt_format_ip_private_data(&ipcm_info,
831 	    sizeof (rds_cm_private_data_t), &cmp);
832 	if (ret != IBT_SUCCESS) {
833 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
834 		    "failed: %d", sp, ep, ret);
835 		return (-1);
836 	}
837 
838 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
839 
840 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
841 	if (hdl == NULL) {
842 		return (-1);
843 	}
844 
845 	cmp.cmp_version = RDS_VERSION;
846 	cmp.cmp_arch = RDS_THIS_ARCH;
847 	cmp.cmp_remip = sp->session_remip;
848 	cmp.cmp_localip = sp->session_myip;
849 	cmp.cmp_eptype = ep->ep_type;
850 	cmp.cmp_failover = sp->session_failover;
851 	cmp.cmp_last_bufid = ep->ep_rbufid;
852 	cmp.cmp_user_buffer_size = UserBufferSize;
853 	cmp.cmp_ack_addr = ep->ep_ack_addr;
854 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
855 
856 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
857 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
858 	ocargs.oc_path = pinfo;
859 	ocargs.oc_cm_handler = rds_cm_handler;
860 	ocargs.oc_cm_clnt_private = NULL;
861 	ocargs.oc_rdma_ra_out = 4;
862 	ocargs.oc_rdma_ra_in = 4;
863 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
864 	ocargs.oc_priv_data = &cmp;
865 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
866 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
867 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
868 	    mode, &ocargs, &ocrets);
869 	if (ret != IBT_SUCCESS) {
870 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
871 		    "failed: %d", sp, ep, ret);
872 		(void) ibt_flush_channel(hdl);
873 		(void) ibt_free_channel(hdl);
874 		/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
875 		(void) ibt_free_cq(ep->ep_recvcq);
876 		ep->ep_recvcq = NULL;
877 		(void) ibt_free_cq(ep->ep_sendcq);
878 		ep->ep_sendcq = NULL;
879 		return (-1);
880 	}
881 
882 	*chanhdl = hdl;
883 
884 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
885 	    *chanhdl);
886 
887 	return (0);
888 }
889 
890 int
891 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
892 {
893 	int	ret;
894 
895 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
896 	    chanhdl, mode);
897 
898 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
899 
900 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
901 
902 	return (ret);
903 }
904