xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision 00a3eaf3896a33935e11fd5c5fb5c1714225c067)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #include <sys/ib/clients/rds/rdsib_cm.h>
76 #include <sys/ib/clients/rds/rdsib_ib.h>
77 #include <sys/ib/clients/rds/rdsib_buf.h>
78 #include <sys/ib/clients/rds/rdsib_ep.h>
79 
80 /*
81  * This file contains CM related work:
82  *
83  * Service registration/deregistration
84  * Path lookup
85  * CM connection callbacks
86  * CM active and passive connection establishment
87  * Connection failover
88  */
89 
/*
 * Shorthand for the IPv4 source and destination address members of an
 * ibt_ip_cm_info_t, used as ipcm_info.SRCIP and ipcm_info.DSTIP below.
 */
#define	SRCIP	src_addr.un.ip4addr
#define	DSTIP	dst_addr.un.ip4addr
92 
93 /*
94  * Handle an incoming CM REQ
95  */
96 /* ARGSUSED */
97 static ibt_cm_status_t
98 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
99     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
100 {
101 	ibt_cm_req_rcv_t	*reqp;
102 	ib_gid_t		lgid, rgid;
103 	rds_cm_private_data_t	cmp;
104 	rds_session_t		*sp;
105 	rds_ep_t		*ep;
106 	ibt_channel_hdl_t	chanhdl;
107 	ibt_ip_cm_info_t	ipcm_info;
108 	int			ret;
109 
110 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
111 
112 	reqp = &evp->cm_event.req;
113 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
114 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
115 
116 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
117 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
118 
119 	/*
120 	 * CM private data brings IP information
121 	 * Private data received is a stream of bytes and may not be properly
122 	 * aligned. So, bcopy the data onto the stack before accessing it.
123 	 */
124 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
125 	    sizeof (rds_cm_private_data_t));
126 
127 	/* extract the CM IP info */
128 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
129 	    &ipcm_info);
130 	if (ret != IBT_SUCCESS) {
131 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
132 		    ret);
133 		return (IBT_CM_REJECT);
134 	}
135 
136 	RDS_DPRINTF2("rds_handle_cm_req",
137 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
138 	    ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
139 
140 	if (cmp.cmp_version != RDS_VERSION) {
141 		RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d "
142 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
143 		return (IBT_CM_REJECT);
144 	}
145 
146 	/* RDS supports V4 addresses only */
147 	if ((ipcm_info.src_addr.family != AF_INET) ||
148 	    (ipcm_info.dst_addr.family != AF_INET)) {
149 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
150 		    "src: %d dst: %d", ipcm_info.src_addr.family,
151 		    ipcm_info.dst_addr.family);
152 		return (IBT_CM_REJECT);
153 	}
154 
155 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
156 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
157 		    cmp.cmp_arch, RDS_THIS_ARCH);
158 		return (IBT_CM_REJECT);
159 	}
160 
161 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
162 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
163 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
164 		return (IBT_CM_REJECT);
165 	}
166 
167 	/* user_buffer_size should be same on all nodes */
168 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
169 		RDS_DPRINTF2(LABEL,
170 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
171 		    UserBufferSize, cmp.cmp_user_buffer_size);
172 		return (IBT_CM_REJECT);
173 	}
174 
175 	/*
176 	 * RDS needs more time to process a failover REQ so send an MRA.
177 	 * Otherwise, the remote may retry the REQ and fail the connection.
178 	 */
179 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
180 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
181 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
182 		    10000000 /* 10 sec */, NULL, 0);
183 	}
184 
185 	/* Is there a session to the destination node? */
186 	rw_enter(&statep->rds_sessionlock, RW_READER);
187 	sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
188 	rw_exit(&statep->rds_sessionlock);
189 
190 	if (sp == NULL) {
191 		/*
192 		 * currently there is no session to the destination
193 		 * remote ip in the private data is the local ip and vice
194 		 * versa
195 		 */
196 		sp = rds_session_create(statep, ipcm_info.DSTIP,
197 		    ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
198 		if (sp == NULL) {
199 			/* Check the list anyway. */
200 			rw_enter(&statep->rds_sessionlock, RW_READER);
201 			sp = rds_session_lkup(statep, ipcm_info.SRCIP,
202 			    rgid.gid_guid);
203 			rw_exit(&statep->rds_sessionlock);
204 			if (sp == NULL) {
205 				/*
206 				 * The only way this can fail is due to lack
207 				 * of kernel resources
208 				 */
209 				return (IBT_CM_REJECT);
210 			}
211 		}
212 	}
213 
214 	rw_enter(&sp->session_lock, RW_WRITER);
215 
216 	/* catch peer-to-peer case as soon as possible */
217 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
218 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
219 		/* Check possible peer-to-peer case here */
220 		if (sp->session_type != RDS_SESSION_PASSIVE) {
221 			RDS_DPRINTF2("rds_handle_cm_req",
222 			    "SP(%p) Peer-peer connection handling", sp);
223 			if (lgid.gid_guid > rgid.gid_guid) {
224 				/* this node is active so reject this request */
225 				rw_exit(&sp->session_lock);
226 				return (IBT_CM_REJECT);
227 			} else {
228 				/* this node is passive, change the session */
229 				sp->session_type = RDS_SESSION_PASSIVE;
230 				sp->session_lgid = lgid;
231 				sp->session_rgid = rgid;
232 			}
233 		}
234 	}
235 
236 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
237 
238 	switch (sp->session_state) {
239 	case RDS_SESSION_STATE_CONNECTED:
240 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
241 		sp->session_state = RDS_SESSION_STATE_ERROR;
242 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
243 		    "RDS_SESSION_STATE_ERROR", sp);
244 
245 		/* FALLTHRU */
246 	case RDS_SESSION_STATE_ERROR:
247 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
248 		sp->session_type = RDS_SESSION_PASSIVE;
249 		rw_exit(&sp->session_lock);
250 
251 		/* Handling this will take some time, so send an MRA */
252 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
253 		    10000000 /* 10 sec */, NULL, 0);
254 
255 		/*
256 		 * Any pending completions don't get flushed until the channel
257 		 * is closed. So, passing 0 here will not wait for pending
258 		 * completions in rds_session_close before closing the channel
259 		 */
260 		rds_session_close(sp, IBT_NOCALLBACKS, 0);
261 
262 		/* move the session to init state */
263 		rw_enter(&sp->session_lock, RW_WRITER);
264 		ret = rds_session_reinit(sp, lgid);
265 		sp->session_myip = ipcm_info.DSTIP;
266 		sp->session_lgid = lgid;
267 		sp->session_rgid = rgid;
268 		if (ret != 0) {
269 			rds_session_fini(sp);
270 			sp->session_state = RDS_SESSION_STATE_FAILED;
271 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
272 			    "RDS_SESSION_STATE_FAILED", sp);
273 			rw_exit(&sp->session_lock);
274 			return (IBT_CM_REJECT);
275 		} else {
276 			sp->session_state = RDS_SESSION_STATE_INIT;
277 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
278 			    "RDS_SESSION_STATE_INIT", sp);
279 		}
280 
281 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
282 			ep = &sp->session_ctrlep;
283 		} else {
284 			ep = &sp->session_dataep;
285 		}
286 		break;
287 	case RDS_SESSION_STATE_CREATED:
288 	case RDS_SESSION_STATE_FAILED:
289 	case RDS_SESSION_STATE_FINI:
290 		/*
291 		 * Initialize both channels, we accept this connection
292 		 * only if both channels are initialized
293 		 */
294 		sp->session_type = RDS_SESSION_PASSIVE;
295 		sp->session_lgid = lgid;
296 		sp->session_rgid = rgid;
297 		sp->session_state = RDS_SESSION_STATE_CREATED;
298 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
299 		    "RDS_SESSION_STATE_CREATED", sp);
300 		ret = rds_session_init(sp);
301 		if (ret != 0) {
302 			/* Seems like there are not enough resources */
303 			sp->session_state = RDS_SESSION_STATE_FAILED;
304 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
305 			    "RDS_SESSION_STATE_FAILED", sp);
306 			rw_exit(&sp->session_lock);
307 			return (IBT_CM_REJECT);
308 		}
309 		sp->session_state = RDS_SESSION_STATE_INIT;
310 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
311 		    "RDS_SESSION_STATE_INIT", sp);
312 
313 		/* FALLTHRU */
314 	case RDS_SESSION_STATE_INIT:
315 		/*
316 		 * When re-using an existing session, make sure the
317 		 * session is still through the same HCA. Otherwise, the
318 		 * memory registrations have to moved to the new HCA.
319 		 */
320 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
321 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
322 				RDS_DPRINTF2("rds_handle_cm_req",
323 				    "Existing Session but different gid "
324 				    "existing: 0x%llx, new: 0x%llx, "
325 				    "sending an MRA",
326 				    sp->session_lgid.gid_guid, lgid.gid_guid);
327 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
328 				    evp->cm_session_id, 10000000 /* 10 sec */,
329 				    NULL, 0);
330 				ret = rds_session_reinit(sp, lgid);
331 				if (ret != 0) {
332 					rds_session_fini(sp);
333 					sp->session_state =
334 					    RDS_SESSION_STATE_FAILED;
335 					sp->session_failover = 0;
336 					RDS_DPRINTF3("rds_failover_session",
337 					    "SP(%p) State "
338 					    "RDS_SESSION_STATE_FAILED", sp);
339 					rw_exit(&sp->session_lock);
340 					return (IBT_CM_REJECT);
341 				}
342 			}
343 			ep = &sp->session_dataep;
344 		} else {
345 			ep = &sp->session_ctrlep;
346 		}
347 
348 		break;
349 	default:
350 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
351 		    "state: %d", sp, sp->session_state);
352 		rw_exit(&sp->session_lock);
353 		return (IBT_CM_REJECT);
354 	}
355 
356 	sp->session_failover = 0; /* reset any previous value */
357 	if (cmp.cmp_failover) {
358 		RDS_DPRINTF2("rds_handle_cm_req",
359 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
360 		sp->session_failover = 1;
361 	}
362 
363 	mutex_enter(&ep->ep_lock);
364 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
365 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
366 		sp->session_type = RDS_SESSION_PASSIVE;
367 		rw_exit(&sp->session_lock);
368 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
369 		rw_exit(&sp->session_lock);
370 		/*
371 		 * Peer to peer connection. There is an active
372 		 * connection pending on this ep. The one with
373 		 * greater port guid becomes active and the
374 		 * other becomes passive.
375 		 */
376 		RDS_DPRINTF2("rds_handle_cm_req",
377 		    "EP(%p) Peer-peer connection handling", ep);
378 		if (lgid.gid_guid > rgid.gid_guid) {
379 			/* this node is active so reject this request */
380 			mutex_exit(&ep->ep_lock);
381 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
382 			    "Rejecting passive in favor of active", sp, ep);
383 			return (IBT_CM_REJECT);
384 		} else {
385 			/*
386 			 * This session is not the active end, change it
387 			 * to passive end.
388 			 */
389 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
390 
391 			rw_enter(&sp->session_lock, RW_WRITER);
392 			sp->session_type = RDS_SESSION_PASSIVE;
393 			sp->session_lgid = lgid;
394 			sp->session_rgid = rgid;
395 			rw_exit(&sp->session_lock);
396 		}
397 	} else {
398 		rw_exit(&sp->session_lock);
399 	}
400 
401 	ep->ep_lbufid = cmp.cmp_last_bufid;
402 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
403 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
404 	cmp.cmp_last_bufid = ep->ep_rbufid;
405 	cmp.cmp_ack_addr = ep->ep_ack_addr;
406 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
407 	mutex_exit(&ep->ep_lock);
408 
409 	/* continue with accepting the connection request for this channel */
410 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
411 	if (chanhdl == NULL) {
412 		mutex_enter(&ep->ep_lock);
413 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
414 		mutex_exit(&ep->ep_lock);
415 		return (IBT_CM_REJECT);
416 	}
417 
418 	/* pre-post recv buffers in the RQ */
419 	rds_post_recv_buf((void *)chanhdl);
420 
421 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
422 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
423 	rargsp->cm_ret.rep.cm_channel = chanhdl;
424 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
425 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
426 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
427 
428 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
429 	    sp, ep, chanhdl);
430 
431 	return (IBT_CM_ACCEPT);
432 }
433 
434 /*
435  * Handle an incoming CM REP
436  * Pre-post recv buffers for the QP
437  */
438 /* ARGSUSED */
439 static ibt_cm_status_t
440 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
441     void *rcmp, ibt_priv_data_len_t rcmp_len)
442 {
443 	rds_ep_t	*ep;
444 	rds_cm_private_data_t	cmp;
445 
446 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
447 
448 	/* pre-post recv buffers in the RQ */
449 	rds_post_recv_buf((void *)evp->cm_channel);
450 
451 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
452 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
453 	    sizeof (rds_cm_private_data_t));
454 	ep->ep_lbufid = cmp.cmp_last_bufid;
455 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
456 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
457 
458 	rargsp->cm_ret_len = 0;
459 
460 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
461 
462 	return (IBT_CM_ACCEPT);
463 }
464 
465 /*
466  * Handle CONN EST
467  */
468 static ibt_cm_status_t
469 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
470 {
471 	rds_session_t	*sp;
472 	rds_ep_t	*ep;
473 
474 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
475 
476 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
477 	    ep->ep_state);
478 
479 	mutex_enter(&ep->ep_lock);
480 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
481 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
482 	ep->ep_state = RDS_EP_STATE_CONNECTED;
483 	ep->ep_chanhdl = evp->cm_channel;
484 	sp = ep->ep_sp;
485 	mutex_exit(&ep->ep_lock);
486 
487 	(void) rds_session_active(sp);
488 
489 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
490 	return (IBT_CM_ACCEPT);
491 }
492 
493 /*
494  * Handle CONN CLOSED
495  */
496 static ibt_cm_status_t
497 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
498 {
499 	rds_ep_t	*ep;
500 	rds_session_t	*sp;
501 
502 	/* Catch DREQs but ignore DREPs */
503 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
504 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
505 		    "Ignoring Event: %d received", evp->cm_event.closed);
506 		return (IBT_CM_ACCEPT);
507 	}
508 
509 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
510 	sp = ep->ep_sp;
511 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
512 	    ep, evp->cm_channel);
513 
514 	mutex_enter(&ep->ep_lock);
515 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
516 		/* Ignore this DREQ */
517 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
518 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
519 		mutex_exit(&ep->ep_lock);
520 		return (IBT_CM_ACCEPT);
521 	}
522 	ep->ep_state = RDS_EP_STATE_CLOSING;
523 	mutex_exit(&ep->ep_lock);
524 
525 	rw_enter(&sp->session_lock, RW_WRITER);
526 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
527 	    sp->session_state);
528 
529 	switch (sp->session_state) {
530 	case RDS_SESSION_STATE_CONNECTED:
531 	case RDS_SESSION_STATE_HCA_CLOSING:
532 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
533 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
534 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
535 		break;
536 
537 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
538 		sp->session_state = RDS_SESSION_STATE_CLOSED;
539 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
540 		    "RDS_SESSION_STATE_CLOSED", sp);
541 		rds_passive_session_fini(sp);
542 		sp->session_state = RDS_SESSION_STATE_FINI;
543 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
544 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
545 		break;
546 
547 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
548 	case RDS_SESSION_STATE_ERROR:
549 	case RDS_SESSION_STATE_CLOSED:
550 		break;
551 
552 	case RDS_SESSION_STATE_INIT:
553 		sp->session_state = RDS_SESSION_STATE_ERROR;
554 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
555 		    "RDS_SESSION_STATE_ERROR", sp);
556 		rds_passive_session_fini(sp);
557 		sp->session_state = RDS_SESSION_STATE_FAILED;
558 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
559 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
560 		break;
561 
562 	default:
563 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
564 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
565 		rds_passive_session_fini(sp);
566 		sp->session_state = RDS_SESSION_STATE_FAILED;
567 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
568 		    "RDS_SESSION_STATE_FAILED", sp);
569 	}
570 	rw_exit(&sp->session_lock);
571 
572 	mutex_enter(&ep->ep_lock);
573 	ep->ep_state = RDS_EP_STATE_CLOSED;
574 	mutex_exit(&ep->ep_lock);
575 
576 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
577 	return (IBT_CM_ACCEPT);
578 }
579 
580 /*
581  * Handle EVENT FAILURE
582  */
583 static ibt_cm_status_t
584 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
585 {
586 	rds_ep_t	*ep;
587 	rds_session_t	*sp;
588 	int		ret;
589 
590 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
591 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
592 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
593 	    evp->cm_event.failed.cf_reason);
594 
595 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
596 		RDS_DPRINTF2(LABEL,
597 		    "Received REJ with reason IBT_CM_INVALID_SID: "
598 		    "RDS may not be loaded on the remote system");
599 	}
600 
601 	if (evp->cm_channel == NULL) {
602 		return (IBT_CM_ACCEPT);
603 	}
604 
605 	if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
606 	    (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
607 		/*
608 		 * This end is active, just ignore, ibt_open_rc_channel()
609 		 * caller will take care of cleanup.
610 		 */
611 		RDS_DPRINTF2("rds_handle_cm_event_failure",
612 		    "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
613 		return (IBT_CM_ACCEPT);
614 	}
615 
616 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
617 	sp = ep->ep_sp;
618 
619 	rw_enter(&sp->session_lock, RW_WRITER);
620 	if (sp->session_type == RDS_SESSION_PASSIVE) {
621 		RDS_DPRINTF2("rds_handle_cm_event_failure",
622 		    "SP(%p) - state: %d", sp, sp->session_state);
623 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
624 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
625 			sp->session_state = RDS_SESSION_STATE_ERROR;
626 			RDS_DPRINTF3("rds_handle_cm_event_failure",
627 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
628 
629 			/*
630 			 * Store the cm_channel for freeing later
631 			 * Active side frees it on ibt_open_rc_channel
632 			 * failure
633 			 */
634 			if (ep->ep_chanhdl == NULL) {
635 				ep->ep_chanhdl = evp->cm_channel;
636 			}
637 			rw_exit(&sp->session_lock);
638 
639 			/*
640 			 * rds_passive_session_fini should not be called
641 			 * directly in the CM handler. It will cause a deadlock.
642 			 */
643 			ret = ddi_taskq_dispatch(rds_taskq,
644 			    rds_cleanup_passive_session, (void *)sp,
645 			    DDI_NOSLEEP);
646 			if (ret != DDI_SUCCESS) {
647 				RDS_DPRINTF2("rds_handle_cm_event_failure",
648 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
649 			}
650 			return (IBT_CM_ACCEPT);
651 		}
652 	}
653 	rw_exit(&sp->session_lock);
654 
655 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
656 	return (IBT_CM_ACCEPT);
657 }
658 
659 /*
660  * CM Handler
661  *
662  * Called by IBCM
663  * The cm_private type differs for active and passive events.
664  */
665 ibt_cm_status_t
666 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
667     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
668     ibt_priv_data_len_t ret_len_max)
669 {
670 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
671 
672 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
673 
674 	switch (eventp->cm_type) {
675 	case IBT_CM_EVENT_REQ_RCV:
676 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
677 		    ret_args, ret_priv_data, ret_len_max);
678 		break;
679 	case IBT_CM_EVENT_REP_RCV:
680 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
681 		    ret_len_max);
682 		break;
683 	case IBT_CM_EVENT_MRA_RCV:
684 		/* Not supported */
685 		break;
686 	case IBT_CM_EVENT_CONN_EST:
687 		ret = rds_handle_cm_conn_est(eventp);
688 		break;
689 	case IBT_CM_EVENT_CONN_CLOSED:
690 		ret = rds_handle_cm_conn_closed(eventp);
691 		break;
692 	case IBT_CM_EVENT_FAILURE:
693 		ret = rds_handle_cm_event_failure(eventp);
694 		break;
695 	case IBT_CM_EVENT_LAP_RCV:
696 		/* Not supported */
697 		RDS_DPRINTF2(LABEL, "LAP message received");
698 		break;
699 	case IBT_CM_EVENT_APR_RCV:
700 		/* Not supported */
701 		RDS_DPRINTF2(LABEL, "APR message received");
702 		break;
703 	default:
704 		break;
705 	}
706 
707 	RDS_DPRINTF2("rds_cm_handler", "Return");
708 
709 	return (ret);
710 }
711 
/*
 * Port number passed to ibt_get_ip_sid() to derive the RDS service id and
 * placed in the IP CM private data as the source port.
 * This is based on OFED Linux RDS.
 */
#define	RDS_PORT_NUM	6556
714 
715 /*
716  * Register the wellknown service with service id: RDS_SERVICE_ID
717  * Incoming connection requests should arrive on this service id.
718  */
719 ibt_srv_hdl_t
720 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
721 {
722 	ibt_srv_hdl_t	srvhdl;
723 	ibt_srv_desc_t	srvdesc;
724 	int		ret;
725 
726 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
727 
728 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
729 	srvdesc.sd_handler = rds_cm_handler;
730 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
731 
732 	/*
733 	 * This is the new service id as per:
734 	 * Annex A11: RDMA IP CM Service
735 	 */
736 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
737 	    RDS_PORT_NUM);
738 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
739 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
740 	if (ret != IBT_SUCCESS) {
741 		RDS_DPRINTF2(LABEL,
742 		    "RDS Service (0x%llx) Registration Failed: %d",
743 		    rdsib_statep->rds_service_id, ret);
744 		return (NULL);
745 	}
746 
747 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
748 	return (srvhdl);
749 }
750 
751 /* Bind the RDS service on all ports */
752 int
753 rds_bind_service(rds_state_t *statep)
754 {
755 	rds_hca_t	*hcap;
756 	ib_gid_t	gid;
757 	uint_t		jx, nbinds = 0, nports = 0;
758 	int		ret;
759 
760 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
761 
762 	rw_enter(&statep->rds_hca_lock, RW_READER);
763 
764 	hcap = statep->rds_hcalistp;
765 	while (hcap != NULL) {
766 
767 		/* skip the HCAs that are not fully online */
768 		if ((hcap->hca_state != RDS_HCA_STATE_OPEN) &&
769 		    (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED)) {
770 			RDS_DPRINTF2("rds_bind_service",
771 			    "Skipping HCA: 0x%llx, state: %d",
772 			    hcap->hca_guid, hcap->hca_state);
773 			hcap = hcap->hca_nextp;
774 			continue;
775 		}
776 
777 		/* currently, we have space for only 4 bindhdls */
778 		ASSERT(hcap->hca_nports < 4);
779 		for (jx = 0; jx < hcap->hca_nports; jx++) {
780 			nports++;
781 			if (hcap->hca_pinfop[jx].p_linkstate !=
782 			    IBT_PORT_ACTIVE) {
783 				/*
784 				 * service bind will be called in the async
785 				 * handler when the port comes up. Clear any
786 				 * stale bind handle.
787 				 */
788 				hcap->hca_bindhdl[jx] = NULL;
789 				continue;
790 			}
791 
792 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
793 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
794 			    "gid: %llx:%llx", hcap->hca_guid,
795 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
796 			    gid.gid_guid);
797 
798 			/* pass statep as cm_private */
799 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
800 			    NULL, statep, &hcap->hca_bindhdl[jx]);
801 			if (ret != IBT_SUCCESS) {
802 				RDS_DPRINTF2(LABEL, "Bind service for "
803 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
804 				    "failed: %d", hcap->hca_guid,
805 				    hcap->hca_pinfop[jx].p_port_num,
806 				    gid.gid_prefix, gid.gid_guid, ret);
807 				continue;
808 			}
809 
810 			nbinds++;
811 		}
812 		hcap = hcap->hca_nextp;
813 	}
814 
815 	rw_exit(&statep->rds_hca_lock);
816 
817 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
818 	    nbinds, nports);
819 
820 #if 0
821 	if (nbinds == 0) {
822 		return (-1);
823 	}
824 #endif
825 
826 	RDS_DPRINTF2("rds_bind_service", "Return");
827 
828 	return (0);
829 }
830 
831 /* Open an RC connection */
832 int
833 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
834     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
835 {
836 	rds_session_t		*sp;
837 	ibt_chan_open_args_t	ocargs;
838 	ibt_rc_returns_t	ocrets;
839 	rds_cm_private_data_t	cmp;
840 	uint8_t			hca_port;
841 	ibt_channel_hdl_t	hdl;
842 	ibt_status_t		ret = 0;
843 	ibt_ip_cm_info_t	ipcm_info;
844 
845 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
846 
847 	sp = ep->ep_sp;
848 
849 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
850 	ipcm_info.src_addr.family = AF_INET;
851 	ipcm_info.SRCIP = sp->session_myip;
852 	ipcm_info.dst_addr.family = AF_INET;
853 	ipcm_info.DSTIP = sp->session_remip;
854 	ipcm_info.src_port = RDS_PORT_NUM;
855 	ret = ibt_format_ip_private_data(&ipcm_info,
856 	    sizeof (rds_cm_private_data_t), &cmp);
857 	if (ret != IBT_SUCCESS) {
858 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
859 		    "failed: %d", sp, ep, ret);
860 		return (-1);
861 	}
862 
863 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
864 
865 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
866 	if (hdl == NULL) {
867 		return (-1);
868 	}
869 
870 	cmp.cmp_version = RDS_VERSION;
871 	cmp.cmp_arch = RDS_THIS_ARCH;
872 	cmp.cmp_eptype = ep->ep_type;
873 	cmp.cmp_failover = sp->session_failover;
874 	cmp.cmp_last_bufid = ep->ep_rbufid;
875 	cmp.cmp_user_buffer_size = UserBufferSize;
876 	cmp.cmp_ack_addr = ep->ep_ack_addr;
877 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
878 
879 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
880 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
881 	ocargs.oc_path = pinfo;
882 	ocargs.oc_cm_handler = rds_cm_handler;
883 	ocargs.oc_cm_clnt_private = NULL;
884 	ocargs.oc_rdma_ra_out = 4;
885 	ocargs.oc_rdma_ra_in = 4;
886 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
887 	ocargs.oc_priv_data = &cmp;
888 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
889 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
890 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
891 	    mode, &ocargs, &ocrets);
892 	if (ret != IBT_SUCCESS) {
893 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
894 		    "failed: %d", sp, ep, ret);
895 		(void) ibt_flush_channel(hdl);
896 		(void) ibt_free_channel(hdl);
897 
898 		mutex_enter(&ep->ep_lock);
899 		/* don't cleanup if this failure is due to peer-peer race */
900 		if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
901 			/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
902 			ep->ep_state = RDS_EP_STATE_ERROR;
903 			rds_ep_free_rc_channel(ep);
904 		}
905 		mutex_exit(&ep->ep_lock);
906 
907 		return (-1);
908 	}
909 
910 	*chanhdl = hdl;
911 
912 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
913 	    *chanhdl);
914 
915 	return (0);
916 }
917 
918 int
919 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
920 {
921 	int	ret;
922 
923 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
924 	    chanhdl, mode);
925 
926 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
927 
928 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
929 
930 	return (ret);
931 }
932