/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* * Sun elects to include this software in Sun product * under the OpenIB BSD license. * * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include /* * This file contains CM related work: * * Service registration/deregistration * Path lookup * CM connection callbacks * CM active and passive connection establishment * Connection failover */ #define SRCIP src_addr.un.ip4addr #define DSTIP dst_addr.un.ip4addr /* * Handle an incoming CM REQ */ /* ARGSUSED */ static ibt_cm_status_t rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len) { ibt_cm_req_rcv_t *reqp; ib_gid_t lgid, rgid; rds_cm_private_data_t cmp; rds_session_t *sp; rds_ep_t *ep; ibt_channel_hdl_t chanhdl; ibt_ip_cm_info_t ipcm_info; uint8_t save_state, save_type; int ret; RDS_DPRINTF2("rds_handle_cm_req", "Enter"); reqp = &evp->cm_event.req; rgid = reqp->req_prim_addr.av_dgid; /* requester gid */ lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */ RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx", rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid); /* * CM private data brings IP information * Private data received is a stream of bytes and may not be properly * aligned. So, bcopy the data onto the stack before accessing it. */ bcopy((uint8_t *)evp->cm_priv_data, &cmp, sizeof (rds_cm_private_data_t)); /* extract the CM IP info */ ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data, &ipcm_info); if (ret != IBT_SUCCESS) { RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d", ret); return (IBT_CM_REJECT); } RDS_DPRINTF2("rds_handle_cm_req", "REQ Received: From IP: 0x%x To IP: 0x%x type: %d", ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype); if (cmp.cmp_version != RDS_VERSION) { RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d " "Remote version: %d", RDS_VERSION, cmp.cmp_version); return (IBT_CM_REJECT); } /* RDS supports V4 addresses only */ if ((ipcm_info.src_addr.family != AF_INET) || (ipcm_info.dst_addr.family != AF_INET)) { RDS_DPRINTF2(LABEL, "Unsupported Address Family: " "src: %d dst: %d", ipcm_info.src_addr.family, ipcm_info.dst_addr.family); return (IBT_CM_REJECT); } if (cmp.cmp_arch != RDS_THIS_ARCH) { RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)", cmp.cmp_arch, RDS_THIS_ARCH); return (IBT_CM_REJECT); } if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) && (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) { RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype); return (IBT_CM_REJECT); } /* user_buffer_size should be same on all nodes */ if (cmp.cmp_user_buffer_size != UserBufferSize) { RDS_DPRINTF2(LABEL, "UserBufferSize Mismatch, this node: %d remote node: %d", UserBufferSize, cmp.cmp_user_buffer_size); return (IBT_CM_REJECT); } /* * RDS needs more time to process a failover REQ so send an MRA. * Otherwise, the remote may retry the REQ and fail the connection. */ if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) { RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA"); (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id, 10000000 /* 10 sec */, NULL, 0); } /* Is there a session to the destination node? */ rw_enter(&statep->rds_sessionlock, RW_READER); sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid); rw_exit(&statep->rds_sessionlock); if (sp == NULL) { /* * currently there is no session to the destination * remote ip in the private data is the local ip and vice * versa */ sp = rds_session_create(statep, ipcm_info.DSTIP, ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE); if (sp == NULL) { /* Check the list anyway. */ rw_enter(&statep->rds_sessionlock, RW_READER); sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid); rw_exit(&statep->rds_sessionlock); if (sp == NULL) { /* * The only way this can fail is due to lack * of kernel resources */ return (IBT_CM_REJECT); } } } rw_enter(&sp->session_lock, RW_WRITER); /* catch peer-to-peer case as soon as possible */ if ((sp->session_state == RDS_SESSION_STATE_CREATED) || (sp->session_state == RDS_SESSION_STATE_INIT)) { /* Check possible peer-to-peer case here */ if (sp->session_type != RDS_SESSION_PASSIVE) { RDS_DPRINTF2("rds_handle_cm_req", "SP(%p) Peer-peer connection handling", sp); if (lgid.gid_guid > rgid.gid_guid) { /* this node is active so reject this request */ rw_exit(&sp->session_lock); return (IBT_CM_REJECT); } else { /* this node is passive, change the session */ sp->session_type = RDS_SESSION_PASSIVE; sp->session_lgid = lgid; sp->session_rgid = rgid; } } } RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state); save_state = sp->session_state; save_type = sp->session_type; switch (sp->session_state) { case RDS_SESSION_STATE_CONNECTED: RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp); sp->session_state = RDS_SESSION_STATE_ERROR; RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State " "RDS_SESSION_STATE_ERROR", sp); /* FALLTHRU */ case RDS_SESSION_STATE_ERROR: case RDS_SESSION_STATE_PASSIVE_CLOSING: /* * Some other thread must be processing this session, * this thread must wait until the other thread finishes. */ sp->session_type = RDS_SESSION_PASSIVE; rw_exit(&sp->session_lock); /* Handling this will take some time, so send an MRA */ (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id, 10000000 /* 10 sec */, NULL, 0); /* * Any pending completions don't get flushed until the channel * is closed. So, passing 0 here will not wait for pending * completions in rds_session_close before closing the channel */ rds_session_close(sp, IBT_NOCALLBACKS, 0); rw_enter(&sp->session_lock, RW_WRITER); /* * If the session was in ERROR, then either a failover thread * or event_failure thread would be processing this session. * This thread should wait for event_failure thread to * complete. This need not wait for failover thread. */ if ((save_state != RDS_SESSION_STATE_CONNECTED) && (save_type == RDS_SESSION_PASSIVE)) { /* * The other thread is event_failure thread, * wait until it finishes. */ while (!((sp->session_state == RDS_SESSION_STATE_FAILED) || (sp->session_state == RDS_SESSION_STATE_FINI))) { rw_exit(&sp->session_lock); delay(drv_usectohz(1000000)); rw_enter(&sp->session_lock, RW_WRITER); } } /* move the session to init state */ if ((sp->session_state == RDS_SESSION_STATE_ERROR) || (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING)) { ret = rds_session_reinit(sp, lgid); sp->session_myip = ipcm_info.DSTIP; sp->session_lgid = lgid; sp->session_rgid = rgid; if (ret != 0) { rds_session_fini(sp); sp->session_state = RDS_SESSION_STATE_FAILED; RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State RDS_SESSION_STATE_FAILED", sp); rw_exit(&sp->session_lock); return (IBT_CM_REJECT); } else { sp->session_state = RDS_SESSION_STATE_INIT; RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State RDS_SESSION_STATE_INIT", sp); } if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) { ep = &sp->session_ctrlep; } else { ep = &sp->session_dataep; } break; } /* FALLTHRU */ case RDS_SESSION_STATE_CREATED: case RDS_SESSION_STATE_FAILED: case RDS_SESSION_STATE_FINI: /* * Initialize both channels, we accept this connection * only if both channels are initialized */ sp->session_type = RDS_SESSION_PASSIVE; sp->session_lgid = lgid; sp->session_rgid = rgid; sp->session_state = RDS_SESSION_STATE_CREATED; RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State " "RDS_SESSION_STATE_CREATED", sp); ret = rds_session_init(sp); if (ret != 0) { /* Seems like there are not enough resources */ sp->session_state = RDS_SESSION_STATE_FAILED; RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State " "RDS_SESSION_STATE_FAILED", sp); rw_exit(&sp->session_lock); return (IBT_CM_REJECT); } sp->session_state = RDS_SESSION_STATE_INIT; RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State " "RDS_SESSION_STATE_INIT", sp); /* FALLTHRU */ case RDS_SESSION_STATE_INIT: /* * When re-using an existing session, make sure the * session is still through the same HCA. Otherwise, the * memory registrations have to moved to the new HCA. */ if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) { if (sp->session_lgid.gid_guid != lgid.gid_guid) { RDS_DPRINTF2("rds_handle_cm_req", "Existing Session but different gid " "existing: 0x%llx, new: 0x%llx, " "sending an MRA", sp->session_lgid.gid_guid, lgid.gid_guid); (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id, 10000000 /* 10 sec */, NULL, 0); ret = rds_session_reinit(sp, lgid); if (ret != 0) { rds_session_fini(sp); sp->session_state = RDS_SESSION_STATE_FAILED; sp->session_failover = 0; RDS_DPRINTF3("rds_failover_session", "SP(%p) State " "RDS_SESSION_STATE_FAILED", sp); rw_exit(&sp->session_lock); return (IBT_CM_REJECT); } } ep = &sp->session_dataep; } else { ep = &sp->session_ctrlep; } break; default: RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected " "state: %d", sp, sp->session_state); rw_exit(&sp->session_lock); return (IBT_CM_REJECT); } sp->session_failover = 0; /* reset any previous value */ if (cmp.cmp_failover) { RDS_DPRINTF2("rds_handle_cm_req", "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid); sp->session_failover = 1; } mutex_enter(&ep->ep_lock); if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) { ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING; sp->session_type = RDS_SESSION_PASSIVE; rw_exit(&sp->session_lock); } else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) { rw_exit(&sp->session_lock); /* * Peer to peer connection. There is an active * connection pending on this ep. The one with * greater port guid becomes active and the * other becomes passive. */ RDS_DPRINTF2("rds_handle_cm_req", "EP(%p) Peer-peer connection handling", ep); if (lgid.gid_guid > rgid.gid_guid) { /* this node is active so reject this request */ mutex_exit(&ep->ep_lock); RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): " "Rejecting passive in favor of active", sp, ep); return (IBT_CM_REJECT); } else { /* * This session is not the active end, change it * to passive end. */ ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING; rw_enter(&sp->session_lock, RW_WRITER); sp->session_type = RDS_SESSION_PASSIVE; sp->session_lgid = lgid; sp->session_rgid = rgid; rw_exit(&sp->session_lock); } } else { rw_exit(&sp->session_lock); } ep->ep_lbufid = cmp.cmp_last_bufid; ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr; ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey; cmp.cmp_last_bufid = ep->ep_rbufid; cmp.cmp_ack_addr = ep->ep_ack_addr; cmp.cmp_ack_rkey = ep->ep_ack_rkey; mutex_exit(&ep->ep_lock); /* continue with accepting the connection request for this channel */ chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port); if (chanhdl == NULL) { mutex_enter(&ep->ep_lock); ep->ep_state = RDS_EP_STATE_UNCONNECTED; mutex_exit(&ep->ep_lock); return (IBT_CM_REJECT); } /* pre-post recv buffers in the RQ */ rds_post_recv_buf((void *)chanhdl); rargsp->cm_ret_len = sizeof (rds_cm_private_data_t); bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t)); rargsp->cm_ret.rep.cm_channel = chanhdl; rargsp->cm_ret.rep.cm_rdma_ra_out = 4; rargsp->cm_ret.rep.cm_rdma_ra_in = 4; rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry; RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)", sp, ep, chanhdl); return (IBT_CM_ACCEPT); } /* * Handle an incoming CM REP * Pre-post recv buffers for the QP */ /* ARGSUSED */ static ibt_cm_status_t rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len) { rds_ep_t *ep; rds_cm_private_data_t cmp; RDS_DPRINTF2("rds_handle_cm_rep", "Enter"); /* pre-post recv buffers in the RQ */ rds_post_recv_buf((void *)evp->cm_channel); ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel); bcopy((uint8_t *)evp->cm_priv_data, &cmp, sizeof (rds_cm_private_data_t)); ep->ep_lbufid = cmp.cmp_last_bufid; ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr; ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey; rargsp->cm_ret_len = 0; RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid); return (IBT_CM_ACCEPT); } /* * Handle CONN EST */ static ibt_cm_status_t rds_handle_cm_conn_est(ibt_cm_event_t *evp) { rds_session_t *sp; rds_ep_t *ep; ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel); RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep, ep->ep_state); mutex_enter(&ep->ep_lock); ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) || (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING)); ep->ep_state = RDS_EP_STATE_CONNECTED; ep->ep_chanhdl = evp->cm_channel; sp = ep->ep_sp; mutex_exit(&ep->ep_lock); (void) rds_session_active(sp); RDS_DPRINTF2("rds_handle_cm_conn_est", "Return"); return (IBT_CM_ACCEPT); } /* * Handle CONN CLOSED */ static ibt_cm_status_t rds_handle_cm_conn_closed(ibt_cm_event_t *evp) { rds_ep_t *ep; rds_session_t *sp; /* Catch DREQs but ignore DREPs */ if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) { RDS_DPRINTF2("rds_handle_cm_conn_closed", "Ignoring Event: %d received", evp->cm_event.closed); return (IBT_CM_ACCEPT); } ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel); sp = ep->ep_sp; RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter", ep, evp->cm_channel); mutex_enter(&ep->ep_lock); if (ep->ep_state != RDS_EP_STATE_CONNECTED) { /* Ignore this DREQ */ RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) not connected, state: %d", ep, ep->ep_state); mutex_exit(&ep->ep_lock); return (IBT_CM_ACCEPT); } ep->ep_state = RDS_EP_STATE_CLOSING; mutex_exit(&ep->ep_lock); rw_enter(&sp->session_lock, RW_WRITER); RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp, sp->session_state); switch (sp->session_state) { case RDS_SESSION_STATE_CONNECTED: case RDS_SESSION_STATE_HCA_CLOSING: sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING; RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State " "RDS_SESSION_STATE_PASSIVE_CLOSING", sp); break; case RDS_SESSION_STATE_PASSIVE_CLOSING: sp->session_state = RDS_SESSION_STATE_CLOSED; RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State " "RDS_SESSION_STATE_CLOSED", sp); rds_passive_session_fini(sp); sp->session_state = RDS_SESSION_STATE_FINI; RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State RDS_SESSION_STATE_FINI", sp); break; case RDS_SESSION_STATE_ACTIVE_CLOSING: case RDS_SESSION_STATE_ERROR: case RDS_SESSION_STATE_CLOSED: break; case RDS_SESSION_STATE_INIT: sp->session_state = RDS_SESSION_STATE_ERROR; RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State " "RDS_SESSION_STATE_ERROR", sp); rds_passive_session_fini(sp); sp->session_state = RDS_SESSION_STATE_FAILED; RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State RDS_SESSION_STATE_FAILED", sp); break; default: RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - Unexpected state: %d", sp, sp->session_state); rds_passive_session_fini(sp); sp->session_state = RDS_SESSION_STATE_FAILED; RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State " "RDS_SESSION_STATE_FAILED", sp); } rw_exit(&sp->session_lock); mutex_enter(&ep->ep_lock); ep->ep_state = RDS_EP_STATE_CLOSED; mutex_exit(&ep->ep_lock); RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp); return (IBT_CM_ACCEPT); } /* * Handle EVENT FAILURE */ static ibt_cm_status_t rds_handle_cm_event_failure(ibt_cm_event_t *evp) { rds_ep_t *ep; rds_session_t *sp; int ret; RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p " "Code: %d msg: %d reason: %d", evp->cm_channel, evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg, evp->cm_event.failed.cf_reason); if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) { RDS_DPRINTF2(LABEL, "Received REJ with reason IBT_CM_INVALID_SID: " "RDS may not be loaded on the remote system"); } if (evp->cm_channel == NULL) { return (IBT_CM_ACCEPT); } if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) && (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) { /* * This end is active, just ignore, ibt_open_rc_channel() * caller will take care of cleanup. */ RDS_DPRINTF2("rds_handle_cm_event_failure", "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel); return (IBT_CM_ACCEPT); } ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel); sp = ep->ep_sp; rw_enter(&sp->session_lock, RW_WRITER); if (sp->session_type == RDS_SESSION_PASSIVE) { RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) - state: %d", sp, sp->session_state); if ((sp->session_state == RDS_SESSION_STATE_INIT) || (sp->session_state == RDS_SESSION_STATE_CONNECTED)) { sp->session_state = RDS_SESSION_STATE_ERROR; RDS_DPRINTF3("rds_handle_cm_event_failure", "SP(%p) State RDS_SESSION_STATE_ERROR", sp); /* * Store the cm_channel for freeing later * Active side frees it on ibt_open_rc_channel * failure */ if (ep->ep_chanhdl == NULL) { ep->ep_chanhdl = evp->cm_channel; } rw_exit(&sp->session_lock); /* * rds_passive_session_fini should not be called * directly in the CM handler. It will cause a deadlock. */ ret = ddi_taskq_dispatch(rds_taskq, rds_cleanup_passive_session, (void *)sp, DDI_NOSLEEP); if (ret != DDI_SUCCESS) { RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) TaskQ dispatch FAILED:%d", sp, ret); } return (IBT_CM_ACCEPT); } } rw_exit(&sp->session_lock); RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp); return (IBT_CM_ACCEPT); } /* * CM Handler * * Called by IBCM * The cm_private type differs for active and passive events. */ ibt_cm_status_t rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp, ibt_cm_return_args_t *ret_args, void *ret_priv_data, ibt_priv_data_len_t ret_len_max) { ibt_cm_status_t ret = IBT_CM_ACCEPT; RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type); switch (eventp->cm_type) { case IBT_CM_EVENT_REQ_RCV: ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp, ret_args, ret_priv_data, ret_len_max); break; case IBT_CM_EVENT_REP_RCV: ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data, ret_len_max); break; case IBT_CM_EVENT_MRA_RCV: /* Not supported */ break; case IBT_CM_EVENT_CONN_EST: ret = rds_handle_cm_conn_est(eventp); break; case IBT_CM_EVENT_CONN_CLOSED: ret = rds_handle_cm_conn_closed(eventp); break; case IBT_CM_EVENT_FAILURE: ret = rds_handle_cm_event_failure(eventp); break; case IBT_CM_EVENT_LAP_RCV: /* Not supported */ RDS_DPRINTF2(LABEL, "LAP message received"); break; case IBT_CM_EVENT_APR_RCV: /* Not supported */ RDS_DPRINTF2(LABEL, "APR message received"); break; default: break; } RDS_DPRINTF2("rds_cm_handler", "Return"); return (ret); } /* This is based on OFED Linux RDS */ #define RDS_PORT_NUM 6556 /* * Register the wellknown service with service id: RDS_SERVICE_ID * Incoming connection requests should arrive on this service id. */ ibt_srv_hdl_t rds_register_service(ibt_clnt_hdl_t rds_ibhdl) { ibt_srv_hdl_t srvhdl; ibt_srv_desc_t srvdesc; int ret; RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl); bzero(&srvdesc, sizeof (ibt_srv_desc_t)); srvdesc.sd_handler = rds_cm_handler; srvdesc.sd_flags = IBT_SRV_NO_FLAGS; /* * This is the new service id as per: * Annex A11: RDMA IP CM Service */ rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP, RDS_PORT_NUM); ret = ibt_register_service(rds_ibhdl, &srvdesc, rdsib_statep->rds_service_id, 1, &srvhdl, NULL); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "RDS Service (0x%llx) Registration Failed: %d", rdsib_statep->rds_service_id, ret); return (NULL); } RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl); return (srvhdl); } /* Bind the RDS service on all ports */ int rds_bind_service(rds_state_t *statep) { rds_hca_t *hcap; ib_gid_t gid; uint_t jx, nbinds = 0, nports = 0; int ret; RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep); rw_enter(&statep->rds_hca_lock, RW_READER); hcap = statep->rds_hcalistp; while (hcap != NULL) { /* skip the HCAs that are not fully online */ if ((hcap->hca_state != RDS_HCA_STATE_OPEN) && (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED)) { RDS_DPRINTF2("rds_bind_service", "Skipping HCA: 0x%llx, state: %d", hcap->hca_guid, hcap->hca_state); hcap = hcap->hca_nextp; continue; } /* currently, we have space for only 4 bindhdls */ ASSERT(hcap->hca_nports < 4); for (jx = 0; jx < hcap->hca_nports; jx++) { nports++; if (hcap->hca_pinfop[jx].p_linkstate != IBT_PORT_ACTIVE) { /* * service bind will be called in the async * handler when the port comes up. Clear any * stale bind handle. */ hcap->hca_bindhdl[jx] = NULL; continue; } gid = hcap->hca_pinfop[jx].p_sgid_tbl[0]; RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d " "gid: %llx:%llx", hcap->hca_guid, hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix, gid.gid_guid); /* pass statep as cm_private */ ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep, &hcap->hca_bindhdl[jx]); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "Bind service for " "HCA: 0x%llx Port: %d gid %llx:%llx " "failed: %d", hcap->hca_guid, hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix, gid.gid_guid, ret); continue; } nbinds++; } hcap = hcap->hca_nextp; } rw_exit(&statep->rds_hca_lock); RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports", nbinds, nports); #if 0 if (nbinds == 0) { return (-1); } #endif RDS_DPRINTF2("rds_bind_service", "Return"); return (0); } /* Open an RC connection */ int rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo, ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl) { rds_session_t *sp; ibt_chan_open_args_t ocargs; ibt_rc_returns_t ocrets; rds_cm_private_data_t cmp; uint8_t hca_port; ibt_channel_hdl_t hdl; ibt_status_t ret = 0; ibt_ip_cm_info_t ipcm_info; RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode); sp = ep->ep_sp; bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); ipcm_info.src_addr.family = AF_INET; ipcm_info.SRCIP = sp->session_myip; ipcm_info.dst_addr.family = AF_INET; ipcm_info.DSTIP = sp->session_remip; ipcm_info.src_port = RDS_PORT_NUM; ret = ibt_format_ip_private_data(&ipcm_info, sizeof (rds_cm_private_data_t), &cmp); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data " "failed: %d", sp, ep, ret); return (-1); } hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num; hdl = rds_ep_alloc_rc_channel(ep, hca_port); if (hdl == NULL) { return (-1); } cmp.cmp_version = RDS_VERSION; cmp.cmp_arch = RDS_THIS_ARCH; cmp.cmp_eptype = ep->ep_type; cmp.cmp_failover = sp->session_failover; cmp.cmp_last_bufid = ep->ep_rbufid; cmp.cmp_user_buffer_size = UserBufferSize; cmp.cmp_ack_addr = ep->ep_ack_addr; cmp.cmp_ack_rkey = ep->ep_ack_rkey; bzero(&ocargs, sizeof (ibt_chan_open_args_t)); bzero(&ocrets, sizeof (ibt_rc_returns_t)); ocargs.oc_path = pinfo; ocargs.oc_cm_handler = rds_cm_handler; ocargs.oc_cm_clnt_private = NULL; ocargs.oc_rdma_ra_out = 4; ocargs.oc_rdma_ra_in = 4; ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t); ocargs.oc_priv_data = &cmp; ocargs.oc_path_retry_cnt = IBPathRetryCount; ocargs.oc_path_rnr_retry_cnt = MinRnrRetry; ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS, mode, &ocargs, &ocrets); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel " "failed: %d", sp, ep, ret); (void) ibt_flush_channel(hdl); (void) ibt_free_channel(hdl); mutex_enter(&ep->ep_lock); /* don't cleanup if this failure is due to peer-peer race */ if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) { /* cleanup stuff allocated in rds_ep_alloc_rc_channel */ ep->ep_state = RDS_EP_STATE_ERROR; rds_ep_free_rc_channel(ep); } mutex_exit(&ep->ep_lock); return (-1); } *chanhdl = hdl; RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep, *chanhdl); return (0); } int rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode) { int ret; RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)", chanhdl, mode); ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0); RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl); return (ret); }