/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

/* Copyright (c) 1990 Mentat Inc. */

/*
 * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
 */
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */

#include <sys/ib/clients/ibd/ibd.h>

extern ibd_global_state_t ibd_gstate;
extern int ibd_rc_conn_timeout;
uint_t ibd_rc_tx_softintr = 1;

/*
 * If the number of WRs in the receive queue of an RC connection drops
 * below IBD_RC_RX_WR_THRESHOLD, we post more receive WRs into it.
 */
#define	IBD_RC_RX_WR_THRESHOLD		0x20

/*
 * If the number of free SWQEs (or large Tx bufs) is greater than or equal
 * to IBD_RC_TX_FREE_THRESH, we call mac_tx_update() to notify GLD to
 * resume transmitting packets.
 */
#define	IBD_RC_TX_FREE_THRESH		8

#define	IBD_RC_QPN_TO_SID(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))

/* For interop with legacy OFED */
#define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))

/* Internet Header + 64 bits of Data Datagram.
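   (A note on units: RFC 792 requires echoing the IP header plus at
   least the first 64 bits of the offending datagram; this driver
   returns up to 64 bytes, which is permitted and preserves the entire
   transport header for the sender.)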
Refer to RFC 792 */ #define IBD_RC_IP_ICMP_RETURN_DATA_BYTES 64 /* Functions for Reliable Connected Mode */ /* Connection Setup/Close Functions */ static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *, ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t); static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *, ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t); static void ibd_rc_act_close(ibd_rc_chan_t *, boolean_t); static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *, ibd_rc_chan_t *); static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list( ibd_rc_chan_list_t *); static inline ibd_rc_chan_t *ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *, ibd_rc_chan_t *); /* CQ handlers */ static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *); static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *); static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t); /* Receive Functions */ static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *); static void ibd_rc_srq_freemsg_cb(char *); static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *); static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *); static void ibd_rc_freemsg_cb(char *); static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *); static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *); static void ibd_rc_fini_rxlist(ibd_rc_chan_t *); /* Send Functions */ static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *); static int ibd_rc_init_txlist(ibd_rc_chan_t *); static void ibd_rc_fini_txlist(ibd_rc_chan_t *); static uint_t ibd_rc_tx_recycle(caddr_t); void ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req) { ibd_rc_chan_t *rc_chan = req->rq_ptr; ibd_ace_t *ace; while (rc_chan != NULL) { ace = rc_chan->ace; ASSERT(ace != NULL); /* Close old RC channel */ ibd_rc_act_close(rc_chan, B_TRUE); mutex_enter(&state->id_ac_mutex); ASSERT(ace->ac_ref != 0); atomic_dec_32(&ace->ac_ref); ace->ac_chan = NULL; if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) { IBD_ACACHE_INSERT_FREE(state, ace); ace->ac_ref = 0; } else { ace->ac_ref |= CYCLEVAL; state->rc_delay_ace_recycle++; } mutex_exit(&state->id_ac_mutex); rc_chan = ibd_rc_rm_header_chan_list( &state->rc_obs_act_chan_list); } } void ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req) { ibd_ace_t *ace = req->rq_ptr; ibd_rc_chan_t *rc_chan; ASSERT(ace != NULL); rc_chan = ace->ac_chan; ASSERT(rc_chan != NULL); /* Close old RC channel */ ibd_rc_act_close(rc_chan, B_TRUE); mutex_enter(&state->id_ac_mutex); ASSERT(ace->ac_ref != 0); atomic_dec_32(&ace->ac_ref); ace->ac_chan = NULL; if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) { IBD_ACACHE_INSERT_FREE(state, ace); ace->ac_ref = 0; } else { ace->ac_ref |= CYCLEVAL; state->rc_delay_ace_recycle++; } mutex_exit(&state->id_ac_mutex); mutex_enter(&state->rc_ace_recycle_lock); state->rc_ace_recycle = NULL; mutex_exit(&state->rc_ace_recycle_lock); } /* Simple ICMP IP Header Template */ static const ipha_t icmp_ipha = { IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP }; /* Packet is too big. 
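   The driver cannot deliver the oversized packet on this RC connection,
   so it synthesizes an ICMP "fragmentation needed" error locally and
   feeds it back up through mac_rx(), as if it had arrived from the
   network.  A sketch of the mblk chain built below (IPv4 case):

     pmtu_mp                               b_cont (original mp, trimmed)
     +----------------------------------+   +--------------------------+
     | ib_header_info_t (IPoIB header)  |   | offending IP header(s)   |
     | ipha_t  (new IPv4 header)        |-->| plus up to IBD_RC_IP_    |
     | icmph_t (DEST_UNREACHABLE/       |   | ICMP_RETURN_DATA_BYTES   |
     |  FRAGMENTATION_NEEDED, du_mtu =  |   | of payload               |
     |  link MTU minus IPoIB header)    |   +--------------------------+
     +----------------------------------+
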
Send ICMP packet to GLD to request a smaller MTU */
void
ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
{
	mblk_t *mp = req->rq_ptr;
	ibd_ace_t *ace = req->rq_ptr2;
	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
	uint_t len_needed;
	size_t msg_len;
	mblk_t *pmtu_mp;
	ushort_t sap;
	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
	/*
	 * ipha: IP header for pmtu_pkt
	 * old_ipha: IP header for old packet
	 */
	ipha_t *ipha, *old_ipha;
	icmph_t *icmph;

	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);

	if (!pullupmsg(mp, -1)) {
		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
		goto too_big_fail;
	}
	/* move to IP header. */
	mp->b_rptr += IPOIB_HDRSIZE;
	old_ipha = (ipha_t *)mp->b_rptr;

	len_needed = IPH_HDR_LENGTH(old_ipha);
	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
		    len_needed));
	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha + len_needed);
		len_needed += ip_hdr_length_v6(mp, ip6h);
	}
	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}

	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t) +
	    sizeof (icmph_t), BPRI_MED)) == NULL) {
		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
		goto too_big_fail;
	}
	pmtu_mp->b_cont = mp;
	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t) +
	    sizeof (ipha_t) + sizeof (icmph_t);

	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;

	/* Fill IB header */
	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
	/*
	 * If the GRH is not valid, indicate to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	ibha->ib_grh.ipoib_vertcflow = 0;
	ibha->ipib_rhdr.ipoib_type = htons(sap);
	ibha->ipib_rhdr.ipoib_mbz = 0;

	/* Fill IP header */
	ipha = (ipha_t *)&ibha[1];
	*ipha = icmp_ipha;
	ipha->ipha_src = old_ipha->ipha_dst;
	ipha->ipha_dst = old_ipha->ipha_src;
	ipha->ipha_ttl = old_ipha->ipha_ttl;
	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
	if (msg_len > IP_MAXPACKET) {
		ibd_print_warn(state, "ibd_async_rc_process_too_big: "
		    "msg_len(%d) > IP_MAXPACKET", (uint32_t)msg_len);
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);

	/* Fill ICMP body */
	icmph = (icmph_t *)&ipha[1];
	bzero(icmph, sizeof (icmph_t));
	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
	icmph->icmph_du_mtu = htons(mtu);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);

	mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);

	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
	    len_needed, (uint32_t)msg_len);

	mac_rx(state->id_mh, state->id_rh, pmtu_mp);

	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
	return;

too_big_fail:
	/* Drop packet */
	freemsg(mp);
	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
}

/*
 * Check all active/passive channels. If any active/passive
 * channel has not been used for a long time, close it.
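 *
 * This works as a two-pass mark-and-sweep: each timer pass clears
 * is_used after inspecting it, while the data paths set it again on
 * use, so a connection must sit idle for a full ibd_rc_conn_timeout
 * interval before it is reaped.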
*/ void ibd_rc_conn_timeout_call(void *carg) { ibd_state_t *state = carg; ibd_ace_t *ace, *pre_ace; ibd_rc_chan_t *chan, *pre_chan, *next_chan; ibd_req_t *req; /* Check all active channels. If chan->is_used == B_FALSE, close it */ mutex_enter(&state->id_ac_mutex); ace = list_head(&state->id_ah_active); while ((pre_ace = ace) != NULL) { ace = list_next(&state->id_ah_active, ace); if (pre_ace->ac_chan != NULL) { chan = pre_ace->ac_chan; ASSERT(state->id_enable_rc == B_TRUE); if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) { if (chan->is_used == B_FALSE) { state->rc_timeout_act++; INC_REF(pre_ace, 1); IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace); chan->chan_state = IBD_RC_STATE_ACT_CLOSING; ibd_rc_signal_act_close(state, pre_ace); } else { chan->is_used = B_FALSE; } } } } mutex_exit(&state->id_ac_mutex); /* Check all passive channels. If chan->is_used == B_FALSE, close it */ mutex_enter(&state->rc_pass_chan_list.chan_list_mutex); next_chan = state->rc_pass_chan_list.chan_list; pre_chan = NULL; while ((chan = next_chan) != NULL) { next_chan = chan->next; if (chan->is_used == B_FALSE) { req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req != NULL) { /* remove it */ state->rc_timeout_pas++; req->rq_ptr = chan; ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_PAS_CHAN); } else { ibd_print_warn(state, "ibd_rc_conn_timeout: " "alloc ibd_req_t fail"); if (pre_chan == NULL) { state->rc_pass_chan_list.chan_list = chan; } else { pre_chan->next = chan; } pre_chan = chan; } } else { if (pre_chan == NULL) { state->rc_pass_chan_list.chan_list = chan; } else { pre_chan->next = chan; } pre_chan = chan; chan->is_used = B_FALSE; } } if (pre_chan != NULL) { pre_chan->next = NULL; } else { state->rc_pass_chan_list.chan_list = NULL; } mutex_exit(&state->rc_pass_chan_list.chan_list_mutex); mutex_enter(&state->rc_timeout_lock); if (state->rc_timeout_start == B_TRUE) { state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state, SEC_TO_TICK(ibd_rc_conn_timeout)); } mutex_exit(&state->rc_timeout_lock); } #ifdef DEBUG /* * ibd_rc_update_stats - update driver private kstat counters * * This routine will dump the internal statistics counters for ibd's * Reliable Connected Mode. The current stats dump values will * be sent to the kernel status area. 
*/ static int ibd_rc_update_stats(kstat_t *ksp, int rw) { ibd_state_t *state; ibd_rc_stat_t *ibd_rc_ksp; if (rw == KSTAT_WRITE) return (EACCES); state = (ibd_state_t *)ksp->ks_private; ASSERT(state != NULL); ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data; ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte; ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt; ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte; ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt; ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail; ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err; ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short; ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes; ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt; ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul = state->rc_xmt_fragmented_pkt; ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt; ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt; ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found; ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe; ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf; ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short; ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update; ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short; ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul = state->rc_xmt_buf_mac_update; ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ; ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail; ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn; ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn; ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close; ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close; ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle; ibd_rc_ksp->rc_act_close_simultaneous.value.ul = state->rc_act_close_simultaneous; ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt; ibd_rc_ksp->rc_timeout_act.value.ul = state->rc_timeout_act; ibd_rc_ksp->rc_timeout_pas.value.ul = state->rc_timeout_pas; return (0); } /* * ibd_rc_init_stats - initialize kstat data structures * * This routine will create and initialize the driver private * statistics counters. 
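 *
 * Once installed, the counters can be read from userland with
 * kstat(1M); for example (the instance/pkey/plinkid digits below are
 * hypothetical):
 *
 *	# kstat -m ibd -n statistics0_8001_1
 *
 * Each read refreshes the snapshot through the ks_update hook,
 * ibd_rc_update_stats() above.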
 */
int
ibd_rc_init_stats(ibd_state_t *state)
{
	kstat_t *ksp;
	ibd_rc_stat_t *ibd_rc_ksp;
	char stat_name[KSTAT_STRLEN];
	int inst;

	/*
	 * Create and init kstat
	 */
	inst = ddi_get_instance(state->id_dip);
	(void) snprintf(stat_name, KSTAT_STRLEN, "statistics%d_%x_%u", inst,
	    state->id_pkey, state->id_plinkid);
	ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);

	if (ksp == NULL) {
		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
		    "kernel statistics");
		return (DDI_FAILURE);
	}

	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */

	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	/*
	 * Initialize all the statistics
	 */
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt, "RC: Tx pkt small size",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
	    "after recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
	    "#, swqe available", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
	    "ibd_send", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
	    "pkt", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
	    "state", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_timeout_act, "RC: timeout act side",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_timeout_pas, "RC: timeout pas side",
KSTAT_DATA_ULONG); /* * Function to provide kernel stat update on demand */ ksp->ks_update = ibd_rc_update_stats; /* * Pointer into provider's raw statistics */ ksp->ks_private = (void *)state; /* * Add kstat to systems kstat chain */ kstat_install(ksp); return (DDI_SUCCESS); } #endif static ibt_status_t ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state, boolean_t is_tx_chan) { ibt_status_t result; ibd_rc_chan_t *chan; ibt_rc_chan_alloc_args_t alloc_args; ibt_chan_alloc_flags_t alloc_flags; ibt_chan_sizes_t sizes; ibt_cq_attr_t cq_atts; int rv; chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP); chan->state = state; mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL); /* Allocate IB structures for a new RC channel. */ if (is_tx_chan) { chan->scq_size = state->id_rc_num_swqe; chan->rcq_size = IBD_RC_MIN_CQ_SIZE; } else { chan->scq_size = IBD_RC_MIN_CQ_SIZE; chan->rcq_size = state->id_rc_num_rwqe; } cq_atts.cq_size = chan->scq_size; cq_atts.cq_sched = NULL; cq_atts.cq_flags = IBT_CQ_NO_FLAGS; result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl, &chan->scq_size); if (result != IBT_SUCCESS) { DPRINT(40, "ibd_rc_alloc_chan: error <%d>" "create scq completion queue (size <%d>)", result, chan->scq_size); goto alloc_scq_err; } /* if failure to alloc cq */ if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count, state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) { DPRINT(30, "ibd_rc_alloc_chan: Send CQ " "interrupt moderation failed"); } ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan); ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler, (void *) (uintptr_t)chan); cq_atts.cq_size = chan->rcq_size; cq_atts.cq_sched = NULL; cq_atts.cq_flags = IBT_CQ_NO_FLAGS; result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl, &chan->rcq_size); if (result != IBT_SUCCESS) { ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating " "rx completion queue (size <%d>)", result, chan->rcq_size); goto alloc_rcq_err; } /* if failure to alloc cq */ if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count, state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) { DPRINT(30, "ibd_rc_alloc_chan: Receive CQ " "interrupt moderation failed"); } ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan); ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler, (void *)(uintptr_t)chan); if (is_tx_chan) { chan->is_tx_chan = B_TRUE; if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) { ibd_print_warn(state, "ibd_rc_alloc_chan: " "ibd_rc_init_txlist failed"); goto init_txlist_err; } if (ibd_rc_tx_softintr == 1) { if ((rv = ddi_add_softintr(state->id_dip, DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL, ibd_rc_tx_recycle, (caddr_t)chan)) != DDI_SUCCESS) { DPRINT(10, "ibd_rc_alloc_chan: failed in " "ddi_add_softintr(scq_softintr), ret=%d", rv); goto alloc_softintr_err; } } } else { chan->is_tx_chan = B_FALSE; } /* * enable completions */ result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION); if (result != IBT_SUCCESS) { ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify" "(scq) failed: status %d\n", result); goto alloc_scq_enable_err; } /* We will enable chan->rcq_hdl later. 
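 * Keeping receive-CQ notification off for now means no Rx completion
 * handler can run against this half-constructed channel; notification
 * is presumably enabled once the CM handshake moves the channel to the
 * established state.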
*/ /* alloc a RC channel */ bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t)); bzero(&sizes, sizeof (ibt_chan_sizes_t)); alloc_args.rc_flags = IBT_WR_SIGNALED; alloc_args.rc_control = IBT_CEP_NO_FLAGS; alloc_args.rc_scq = chan->scq_hdl; alloc_args.rc_rcq = chan->rcq_hdl; alloc_args.rc_pd = state->id_pd_hdl; alloc_args.rc_hca_port_num = state->id_port; alloc_args.rc_clone_chan = NULL; /* scatter/gather */ alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg; /* * For the number of SGL elements in receive side, I think it * should be 1. Because ibd driver allocates a whole block memory * for each ibt_post_recv(). */ alloc_args.rc_sizes.cs_rq_sgl = 1; /* The send queue size and the receive queue size */ alloc_args.rc_sizes.cs_sq = chan->scq_size; alloc_args.rc_sizes.cs_rq = chan->rcq_size; if (state->id_hca_res_lkey_capab) { alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY; } else { DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey"); } if (state->rc_enable_srq) { alloc_flags = IBT_ACHAN_USES_SRQ; alloc_args.rc_srq = state->rc_srq_hdl; } else { alloc_flags = IBT_ACHAN_NO_FLAGS; } result = ibt_alloc_rc_channel(state->id_hca_hdl, alloc_flags, &alloc_args, &chan->chan_hdl, &sizes); if (result != IBT_SUCCESS) { ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel" " fail:<%d>", result); goto alloc_scq_enable_err; } if (is_tx_chan) atomic_inc_32(&state->rc_num_tx_chan); else atomic_inc_32(&state->rc_num_rx_chan); /* For the connection reaper routine ibd_rc_conn_timeout_call() */ chan->is_used = B_TRUE; *ret_chan = chan; return (IBT_SUCCESS); alloc_scq_enable_err: if (is_tx_chan) { if (ibd_rc_tx_softintr == 1) { ddi_remove_softintr(chan->scq_softintr); } } alloc_softintr_err: if (is_tx_chan) { ibd_rc_fini_txlist(chan); } init_txlist_err: (void) ibt_free_cq(chan->rcq_hdl); alloc_rcq_err: (void) ibt_free_cq(chan->scq_hdl); alloc_scq_err: mutex_destroy(&chan->tx_poll_lock); mutex_destroy(&chan->tx_post_lock); mutex_destroy(&chan->tx_rel_list.dl_mutex); mutex_destroy(&chan->tx_wqe_list.dl_mutex); mutex_destroy(&chan->rx_free_list.dl_mutex); mutex_destroy(&chan->rx_wqe_list.dl_mutex); kmem_free(chan, sizeof (ibd_rc_chan_t)); return (result); } static void ibd_rc_free_chan(ibd_rc_chan_t *chan) { ibt_status_t ret; /* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */ if (chan->chan_hdl != NULL) { ret = ibt_free_channel(chan->chan_hdl); if (ret != IBT_SUCCESS) { DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, " "chan=%p, returned: %d", chan, ret); return; } chan->chan_hdl = NULL; } if (chan->rcq_hdl != NULL) { ret = ibt_free_cq(chan->rcq_hdl); if (ret != IBT_SUCCESS) { DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, " "chan=%p, returned: %d", chan, ret); return; } chan->rcq_hdl = NULL; } if (chan->scq_hdl != NULL) { ret = ibt_free_cq(chan->scq_hdl); if (ret != IBT_SUCCESS) { DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, " "chan=%p, returned: %d", chan, ret); return; } chan->scq_hdl = NULL; } /* Free buffers */ if (chan->is_tx_chan) { ibd_rc_fini_txlist(chan); if (ibd_rc_tx_softintr == 1) { ddi_remove_softintr(chan->scq_softintr); } atomic_dec_32(&chan->state->rc_num_tx_chan); } else { if (!chan->state->rc_enable_srq) { ibd_rc_fini_rxlist(chan); } atomic_dec_32(&chan->state->rc_num_rx_chan); } mutex_destroy(&chan->tx_poll_lock); mutex_destroy(&chan->tx_post_lock); mutex_destroy(&chan->tx_rel_list.dl_mutex); mutex_destroy(&chan->tx_wqe_list.dl_mutex); mutex_destroy(&chan->rx_free_list.dl_mutex); mutex_destroy(&chan->rx_wqe_list.dl_mutex); /* * If it is a passive channel, 
must make sure it has been removed * from chan->state->rc_pass_chan_list */ kmem_free(chan, sizeof (ibd_rc_chan_t)); } /* Add a RC channel */ static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan) { mutex_enter(&list->chan_list_mutex); if (list->chan_list == NULL) { list->chan_list = chan; chan->next = NULL; } else { chan->next = list->chan_list; list->chan_list = chan; } mutex_exit(&list->chan_list_mutex); } static boolean_t ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t *chan) { ibd_state_t *state = chan->state; mutex_enter(&state->rc_pass_chan_list.chan_list_mutex); if ((state->id_mac_state & IBD_DRV_STARTED) == 0) { mutex_exit(&state->rc_pass_chan_list.chan_list_mutex); return (B_FALSE); } else { if (state->rc_pass_chan_list.chan_list == NULL) { state->rc_pass_chan_list.chan_list = chan; chan->next = NULL; } else { chan->next = state->rc_pass_chan_list.chan_list; state->rc_pass_chan_list.chan_list = chan; } mutex_exit(&state->rc_pass_chan_list.chan_list_mutex); return (B_TRUE); } } /* Remove a RC channel */ static inline ibd_rc_chan_t * ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan) { ibd_rc_chan_t *pre_chan; mutex_enter(&list->chan_list_mutex); if (list->chan_list == chan) { DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)" " in chan_list", chan); list->chan_list = chan->next; } else { pre_chan = list->chan_list; while (pre_chan != NULL) { if (pre_chan->next == chan) { DPRINT(30, "ibd_rc_rm_from_chan_list" "(middle): found chan(%p)", chan); pre_chan->next = chan->next; break; } pre_chan = pre_chan->next; } if (pre_chan == NULL) chan = NULL; } mutex_exit(&list->chan_list_mutex); return (chan); } static inline ibd_rc_chan_t * ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list) { ibd_rc_chan_t *rc_chan; mutex_enter(&list->chan_list_mutex); rc_chan = list->chan_list; if (rc_chan != NULL) { list->chan_list = rc_chan->next; } mutex_exit(&list->chan_list_mutex); return (rc_chan); } static int ibd_rc_alloc_srq_copybufs(ibd_state_t *state) { ibt_mr_attr_t mem_attr; uint_t rc_rx_bufs_sz; /* * Allocate one big chunk for all regular rx copy bufs */ rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size; state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP); state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size * sizeof (ibd_rwqe_t), KM_SLEEP); /* * Do one memory registration on the entire rxbuf area */ mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs; mem_attr.mr_len = rc_rx_bufs_sz; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() " "failed"); kmem_free(state->rc_srq_rwqes, state->rc_srq_size * sizeof (ibd_rwqe_t)); kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz); state->rc_srq_rx_bufs = NULL; state->rc_srq_rwqes = NULL; return (DDI_FAILURE); } return (DDI_SUCCESS); } static void ibd_rc_free_srq_copybufs(ibd_state_t *state) { uint_t rc_rx_buf_sz; /* * Don't change the value of state->rc_mtu at the period from call * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs(). 
*/ rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE; /* * Unregister rxbuf mr */ if (ibt_deregister_mr(state->id_hca_hdl, state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()" " failed"); } state->rc_srq_rx_mr_hdl = NULL; /* * Free rxbuf memory */ kmem_free(state->rc_srq_rwqes, state->rc_srq_size * sizeof (ibd_rwqe_t)); kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz); state->rc_srq_rwqes = NULL; state->rc_srq_rx_bufs = NULL; } /* * Allocate and post a certain number of SRQ receive buffers and WRs. */ int ibd_rc_init_srq_list(ibd_state_t *state) { ibd_rwqe_t *rwqe; ibt_lkey_t lkey; int i; uint_t len; uint8_t *bufaddr; ibt_srq_sizes_t srq_sizes; ibt_srq_sizes_t srq_real_sizes; ibt_status_t ret; srq_sizes.srq_sgl_sz = 1; srq_sizes.srq_wr_sz = state->id_rc_num_srq; ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes); if (ret != IBT_SUCCESS) { /* * The following code is for CR 6932460 (can't configure ibd * interface on 32 bits x86 systems). 32 bits x86 system has * less memory resource than 64 bits x86 system. If current * resource request can't be satisfied, we request less * resource here. */ len = state->id_rc_num_srq; while ((ret == IBT_HCA_WR_EXCEEDED) && (len >= 2 * IBD_RC_MIN_CQ_SIZE)) { len = len/2; srq_sizes.srq_sgl_sz = 1; srq_sizes.srq_wr_sz = len; ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes); } if (ret != IBT_SUCCESS) { DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed." "req_sgl_sz=%d, req_wr_sz=0x%x, final_req_wr_sz=" "0x%x, ret=%d", srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, len, ret); return (DDI_FAILURE); } state->id_rc_num_srq = len; state->id_rc_num_rwqe = state->id_rc_num_srq + 1; } state->rc_srq_size = srq_real_sizes.srq_wr_sz; if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) { ret = ibt_free_srq(state->rc_srq_hdl); if (ret != IBT_SUCCESS) { ibd_print_warn(state, "ibd_rc_init_srq_list: " "ibt_free_srq fail, ret=%d", ret); } return (DDI_FAILURE); } /* * Allocate and setup the rwqe list */ lkey = state->rc_srq_rx_mr_desc.md_lkey; rwqe = state->rc_srq_rwqes; bufaddr = state->rc_srq_rx_bufs; len = state->rc_mtu + IPOIB_GRH_SIZE; state->rc_srq_rwqe_list.dl_cnt = 0; state->rc_srq_rwqe_list.dl_bufs_outstanding = 0; for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) { rwqe->w_state = state; rwqe->w_freeing_wqe = B_FALSE; rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb; rwqe->w_freemsg_cb.free_arg = (char *)rwqe; rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, &rwqe->w_freemsg_cb)) == NULL) { DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed"); rwqe->rwqe_copybuf.ic_bufaddr = NULL; if (atomic_dec_32_nv(&state->id_running) != 0) { cmn_err(CE_WARN, "ibd_rc_init_srq_list: " "id_running was not 1\n"); } ibd_rc_fini_srq_list(state); atomic_inc_32(&state->id_running); return (DDI_FAILURE); } rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; /* Leave IPOIB_GRH_SIZE space */ rwqe->rwqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE); rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu; rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; rwqe->w_rwr.wr_nds = 1; rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; (void) ibd_rc_post_srq(state, rwqe); } mutex_enter(&state->rc_srq_free_list.dl_mutex); state->rc_srq_free_list.dl_head = NULL; state->rc_srq_free_list.dl_cnt = 0; 
mutex_exit(&state->rc_srq_free_list.dl_mutex); return (DDI_SUCCESS); } /* * Free the statically allocated Rx buffer list for SRQ. */ void ibd_rc_fini_srq_list(ibd_state_t *state) { ibd_rwqe_t *rwqe; int i; ibt_status_t ret; ASSERT(state->id_running == 0); ret = ibt_free_srq(state->rc_srq_hdl); if (ret != IBT_SUCCESS) { ibd_print_warn(state, "ibd_rc_fini_srq_list: " "ibt_free_srq fail, ret=%d", ret); } mutex_enter(&state->rc_srq_rwqe_list.dl_mutex); rwqe = state->rc_srq_rwqes; for (i = 0; i < state->rc_srq_size; i++, rwqe++) { if (rwqe->rwqe_im_mblk != NULL) { rwqe->w_freeing_wqe = B_TRUE; freemsg(rwqe->rwqe_im_mblk); } } mutex_exit(&state->rc_srq_rwqe_list.dl_mutex); ibd_rc_free_srq_copybufs(state); } /* Repost the elements in state->ib_rc_free_list */ int ibd_rc_repost_srq_free_list(ibd_state_t *state) { ibd_rwqe_t *rwqe; ibd_wqe_t *list; uint_t len; mutex_enter(&state->rc_srq_free_list.dl_mutex); if (state->rc_srq_free_list.dl_head != NULL) { /* repost them */ len = state->rc_mtu + IPOIB_GRH_SIZE; list = state->rc_srq_free_list.dl_head; state->rc_srq_free_list.dl_head = NULL; state->rc_srq_free_list.dl_cnt = 0; mutex_exit(&state->rc_srq_free_list.dl_mutex); while (list != NULL) { rwqe = WQE_TO_RWQE(list); if ((rwqe->rwqe_im_mblk == NULL) && ((rwqe->rwqe_im_mblk = desballoc( rwqe->rwqe_copybuf.ic_bufaddr, len, 0, &rwqe->w_freemsg_cb)) == NULL)) { DPRINT(40, "ibd_rc_repost_srq_free_list: " "failed in desballoc()"); do { ibd_rc_srq_free_rwqe(state, rwqe); list = list->w_next; rwqe = WQE_TO_RWQE(list); } while (list != NULL); return (DDI_FAILURE); } if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) { ibd_rc_srq_free_rwqe(state, rwqe); } list = list->w_next; } return (DDI_SUCCESS); } mutex_exit(&state->rc_srq_free_list.dl_mutex); return (DDI_SUCCESS); } /* * Free an allocated recv wqe. */ static void ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) { /* * desballoc() failed (no memory) or the posting of rwqe failed. * * This rwqe is placed on a free list so that it * can be reinstated in future. * * NOTE: no code currently exists to reinstate * these "lost" rwqes. */ mutex_enter(&state->rc_srq_free_list.dl_mutex); state->rc_srq_free_list.dl_cnt++; rwqe->rwqe_next = state->rc_srq_free_list.dl_head; state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe); mutex_exit(&state->rc_srq_free_list.dl_mutex); } static void ibd_rc_srq_freemsg_cb(char *arg) { ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; ibd_state_t *state = rwqe->w_state; ASSERT(state->rc_enable_srq); /* * If the driver is stopped, just free the rwqe. */ if (atomic_add_32_nv(&state->id_running, 0) == 0) { if (!rwqe->w_freeing_wqe) { atomic_dec_32( &state->rc_srq_rwqe_list.dl_bufs_outstanding); DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed"); rwqe->rwqe_im_mblk = NULL; ibd_rc_srq_free_rwqe(state, rwqe); } return; } atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding); ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size); ASSERT(!rwqe->w_freeing_wqe); /* * Upper layer has released held mblk, so we have * no more use for keeping the old pointer in * our rwqe. 
 */
*/ rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); if (rwqe->rwqe_im_mblk == NULL) { DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed"); ibd_rc_srq_free_rwqe(state, rwqe); return; } if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) { ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq" " failed"); ibd_rc_srq_free_rwqe(state, rwqe); return; } } /* * Post a rwqe to the hardware and add it to the Rx list. */ static int ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe) { /* * Here we should add dl_cnt before post recv, because * we would have to make sure dl_cnt is updated before * the corresponding ibd_rc_process_rx() is called. */ ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size); atomic_inc_32(&state->rc_srq_rwqe_list.dl_cnt); if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) != IBT_SUCCESS) { atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt); DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed"); return (DDI_FAILURE); } return (DDI_SUCCESS); } /* * Post a rwqe to the hardware and add it to the Rx list. */ static int ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe) { /* * Here we should add dl_cnt before post recv, because we would * have to make sure dl_cnt has already updated before * corresponding ibd_rc_process_rx() is called. */ atomic_inc_32(&chan->rx_wqe_list.dl_cnt); if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) != IBT_SUCCESS) { atomic_dec_32(&chan->rx_wqe_list.dl_cnt); DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()"); return (DDI_FAILURE); } return (DDI_SUCCESS); } static int ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan) { ibd_state_t *state = chan->state; ibt_mr_attr_t mem_attr; uint_t rc_rx_bufs_sz; /* * Allocate one big chunk for all regular rx copy bufs */ rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size; chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP); chan->rx_rwqes = kmem_zalloc(chan->rcq_size * sizeof (ibd_rwqe_t), KM_SLEEP); /* * Do one memory registration on the entire rxbuf area */ mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs; mem_attr.mr_len = rc_rx_bufs_sz; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed"); kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t)); kmem_free(chan->rx_bufs, rc_rx_bufs_sz); chan->rx_bufs = NULL; chan->rx_rwqes = NULL; return (DDI_FAILURE); } return (DDI_SUCCESS); } static void ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan) { ibd_state_t *state = chan->state; uint_t rc_rx_buf_sz; ASSERT(!state->rc_enable_srq); ASSERT(chan->rx_rwqes != NULL); ASSERT(chan->rx_bufs != NULL); /* * Don't change the value of state->rc_mtu at the period from call * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs(). */ rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE; /* * Unregister rxbuf mr */ if (ibt_deregister_mr(state->id_hca_hdl, chan->rx_mr_hdl) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed"); } chan->rx_mr_hdl = NULL; /* * Free rxbuf memory */ kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t)); chan->rx_rwqes = NULL; kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz); chan->rx_bufs = NULL; } /* * Post a certain number of receive buffers and WRs on a RC channel. 
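 * This is the per-channel (non-SRQ) counterpart of
 * ibd_rc_init_srq_list(): the same one-buffer, one-SGE rwqe layout with
 * IPOIB_GRH_SIZE of headroom in front of each rc_mtu-sized buffer, but
 * here every rwqe is owned by a single RC channel.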
 */
static int
ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	ibd_rwqe_t *rwqe;
	ibt_lkey_t lkey;
	int i;
	uint_t len;
	uint8_t *bufaddr;

	ASSERT(!state->rc_enable_srq);
	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/*
	 * Allocate and setup the rwqe list
	 */
	lkey = chan->rx_mr_desc.md_lkey;
	rwqe = chan->rx_rwqes;
	bufaddr = chan->rx_bufs;
	len = state->rc_mtu + IPOIB_GRH_SIZE;
	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
		rwqe->w_state = state;
		rwqe->w_chan = chan;
		rwqe->w_freeing_wqe = B_FALSE;
		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
		    &rwqe->w_freemsg_cb)) == NULL) {
			DPRINT(40, "ibd_rc_init_rxlist: desballoc() failed");
			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
			ibd_rc_fini_rxlist(chan);
			return (DDI_FAILURE);
		}

		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
		rwqe->rwqe_copybuf.ic_sgl.ds_va =
		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
		rwqe->w_rwr.wr_nds = 1;
		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;

		(void) ibd_rc_post_rwqe(chan, rwqe);
	}

	return (DDI_SUCCESS);
}

/*
 * Free the statically allocated Rx buffer list of a RC channel
 * (the non-SRQ case).
 */
static void
ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
{
	ibd_rwqe_t *rwqe;
	int i;

	if (chan->rx_bufs == NULL) {
		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
		return;
	}

	/* bufs_outstanding must be 0 */
	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));

	mutex_enter(&chan->rx_wqe_list.dl_mutex);
	rwqe = chan->rx_rwqes;
	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
		if (rwqe->rwqe_im_mblk != NULL) {
			rwqe->w_freeing_wqe = B_TRUE;
			freemsg(rwqe->rwqe_im_mblk);
		}
	}
	mutex_exit(&chan->rx_wqe_list.dl_mutex);

	ibd_rc_free_rx_copybufs(chan);
}

/*
 * Free an allocated recv wqe.
 */
static void
ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
{
	/*
	 * desballoc() failed (no memory) or the posting of rwqe failed.
	 *
	 * This rwqe is placed on a free list so that it
	 * can be reinstated in future.
	 *
	 * NOTE: no code currently exists to reinstate
	 * these "lost" rwqes.
	 */
	mutex_enter(&chan->rx_free_list.dl_mutex);
	chan->rx_free_list.dl_cnt++;
	rwqe->rwqe_next = chan->rx_free_list.dl_head;
	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
	mutex_exit(&chan->rx_free_list.dl_mutex);
}

/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD.
 */
static void
ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ibd_state_t *state = chan->state;
	ib_header_info_t *phdr;
	ipoib_hdr_t *ipibp;
	mblk_t *mp;
	mblk_t *mpc;
	int rxcnt;
	ip6_t *ip6h;
	int len;

	/*
	 * Track number handed to upper layer, and number still
	 * available to receive packets.
	 */
	if (state->rc_enable_srq) {
		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
	} else {
		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
	}

	/*
	 * It cannot be an IBA multicast packet.
	 */
	ASSERT(!(wc->wc_flags & IBT_WC_GRH_PRESENT));

	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
	chan->is_used = B_TRUE;

#ifdef DEBUG
	if (rxcnt < state->id_rc_rx_rwqe_thresh) {
		state->rc_rwqe_short++;
	}
#endif

	/*
	 * Decide whether to loan this receive buffer up to the stack
	 * ("transfer mode") or to copy the packet out and repost the
	 * buffer at once ("copy mode").
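	 * Transfer mode is chosen only when the Rx pool still holds
	 * enough posted rwqes (rxcnt >= id_rc_rx_rwqe_thresh) and the
	 * packet is large enough (wc_bytes_xfer > id_rc_rx_copy_thresh)
	 * to justify the loan; everything else is bcopy'd into a fresh
	 * mblk so the rwqe never leaves the pool.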
*/ if ((rxcnt >= state->id_rc_rx_rwqe_thresh) && (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) { atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer); atomic_inc_64(&state->rc_rcv_trans_pkt); /* * Record how many rwqe has been occupied by upper * network layer */ if (state->rc_enable_srq) { atomic_inc_32( &state->rc_srq_rwqe_list.dl_bufs_outstanding); } else { atomic_inc_32(&chan->rx_wqe_list.dl_bufs_outstanding); } mp = rwqe->rwqe_im_mblk; } else { atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer); atomic_inc_64(&state->rc_rcv_copy_pkt); if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE, BPRI_HI)) == NULL) { /* no memory */ DPRINT(40, "ibd_rc_process_rx: allocb() failed"); state->rc_rcv_alloc_fail++; if (state->rc_enable_srq) { if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) { ibd_rc_srq_free_rwqe(state, rwqe); } } else { if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) { ibd_rc_free_rwqe(chan, rwqe); } } return; } bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE, mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer); if (state->rc_enable_srq) { if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) { ibd_rc_srq_free_rwqe(state, rwqe); } } else { if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) { ibd_rc_free_rwqe(chan, rwqe); } } } ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE); if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_RECV); } } phdr = (ib_header_info_t *)mp->b_rptr; phdr->ib_grh.ipoib_vertcflow = 0; ovbcopy(&state->id_macaddr, &phdr->ib_dst, sizeof (ipoib_mac_t)); mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer+ IPOIB_GRH_SIZE; /* * Can RC mode in IB guarantee its checksum correctness? * * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); */ /* * Make sure this is NULL or we're in trouble. */ if (mp->b_next != NULL) { ibd_print_warn(state, "ibd_rc_process_rx: got duplicate mp from rcq?"); mp->b_next = NULL; } /* * Add this mp to the list of processed mp's to send to * the nw layer */ if (state->rc_enable_srq) { mutex_enter(&state->rc_rx_lock); if (state->rc_rx_mp) { ASSERT(state->rc_rx_mp_tail != NULL); state->rc_rx_mp_tail->b_next = mp; } else { ASSERT(state->rc_rx_mp_tail == NULL); state->rc_rx_mp = mp; } state->rc_rx_mp_tail = mp; state->rc_rx_mp_len++; if (state->rc_rx_mp_len >= IBD_MAX_RX_MP_LEN) { mpc = state->rc_rx_mp; state->rc_rx_mp = NULL; state->rc_rx_mp_tail = NULL; state->rc_rx_mp_len = 0; mutex_exit(&state->rc_rx_lock); mac_rx(state->id_mh, NULL, mpc); } else { mutex_exit(&state->rc_rx_lock); } } else { mutex_enter(&chan->rx_lock); if (chan->rx_mp) { ASSERT(chan->rx_mp_tail != NULL); chan->rx_mp_tail->b_next = mp; } else { ASSERT(chan->rx_mp_tail == NULL); chan->rx_mp = mp; } chan->rx_mp_tail = mp; chan->rx_mp_len++; if (chan->rx_mp_len >= IBD_MAX_RX_MP_LEN) { mpc = chan->rx_mp; chan->rx_mp = NULL; chan->rx_mp_tail = NULL; chan->rx_mp_len = 0; mutex_exit(&chan->rx_lock); mac_rx(state->id_mh, NULL, mpc); } else { mutex_exit(&chan->rx_lock); } } } /* * Callback code invoked from STREAMs when the recv data buffer is free * for recycling. */ static void ibd_rc_freemsg_cb(char *arg) { ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; ibd_rc_chan_t *chan = rwqe->w_chan; ibd_state_t *state = rwqe->w_state; /* * If the wqe is being destructed, do not attempt recycling. 
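 * Otherwise, recycle: desballoc() bound this copybuf to
 * ibd_rc_freemsg_cb(), so once the stack frees the loaned mblk we wrap
 * the same buffer in a fresh mblk and repost the rwqe, letting a fixed
 * pool of receive buffers circulate with no per-packet allocation.
 */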
*/ if (rwqe->w_freeing_wqe == B_TRUE) { return; } ASSERT(!state->rc_enable_srq); ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size); rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); if (rwqe->rwqe_im_mblk == NULL) { DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed"); ibd_rc_free_rwqe(chan, rwqe); return; } /* * Post back to h/w. We could actually have more than * id_num_rwqe WQEs on the list if there were multiple * ibd_freemsg_cb() calls outstanding (since the lock is * not held the entire time). This will start getting * corrected over subsequent ibd_freemsg_cb() calls. */ if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) { ibd_rc_free_rwqe(chan, rwqe); return; } atomic_dec_32(&chan->rx_wqe_list.dl_bufs_outstanding); } /* * Common code for interrupt handling as well as for polling * for all completed wqe's while detaching. */ static void ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl) { ibd_wqe_t *wqe; ibt_wc_t *wc, *wcs; uint_t numwcs, real_numwcs; int i; wcs = chan->rx_wc; numwcs = IBD_RC_MAX_CQ_WC; while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; if (wc->wc_status != IBT_WC_SUCCESS) { chan->state->rc_rcq_err++; /* * Channel being torn down. */ DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != " "SUCC, chan=%p", wc->wc_status, chan); if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { /* * Do not invoke Rx handler because * it might add buffers to the Rx pool * when we are trying to deinitialize. */ continue; } } ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc); } } } /* Receive CQ handler */ /* ARGSUSED */ static void ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg; ibd_state_t *state = chan->state; atomic_inc_32(&chan->rcq_invoking); ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB); /* * Poll for completed entries; the CQ will not interrupt any * more for incoming (or transmitted) packets. */ ibd_rc_poll_rcq(chan, chan->rcq_hdl); /* * Now enable CQ notifications; all packets that arrive now * (or complete transmission) will cause new interrupts. */ if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { /* * We do not expect a failure here. */ DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed"); } /* * Repoll to catch all packets that might have arrived after * we finished the first poll loop and before interrupts got * armed. */ ibd_rc_poll_rcq(chan, chan->rcq_hdl); if (state->rc_enable_srq) { mutex_enter(&state->rc_rx_lock); if (state->rc_rx_mp != NULL) { mblk_t *mpc; mpc = state->rc_rx_mp; state->rc_rx_mp = NULL; state->rc_rx_mp_tail = NULL; state->rc_rx_mp_len = 0; mutex_exit(&state->rc_rx_lock); mac_rx(state->id_mh, NULL, mpc); } else { mutex_exit(&state->rc_rx_lock); } } else { mutex_enter(&chan->rx_lock); if (chan->rx_mp != NULL) { mblk_t *mpc; mpc = chan->rx_mp; chan->rx_mp = NULL; chan->rx_mp_tail = NULL; chan->rx_mp_len = 0; mutex_exit(&chan->rx_lock); mac_rx(state->id_mh, NULL, mpc); } else { mutex_exit(&chan->rx_lock); } } atomic_dec_32(&chan->rcq_invoking); } /* * Allocate the statically allocated Tx buffer list. 
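 *
 * A single registered region supplies (id_rc_num_swqe - 1) buffers of
 * rc_mtu bytes each, chained into a free list under
 * rc_tx_large_bufs_lock; a buffer is taken from this list when an
 * outgoing payload is too large for a swqe's copybuf slice.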
*/ int ibd_rc_init_tx_largebuf_list(ibd_state_t *state) { ibd_rc_tx_largebuf_t *lbufp; ibd_rc_tx_largebuf_t *tail; uint8_t *memp; ibt_mr_attr_t mem_attr; uint32_t num_swqe; size_t mem_size; int i; num_swqe = state->id_rc_num_swqe - 1; /* * Allocate one big chunk for all Tx large copy bufs */ /* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */ mem_size = num_swqe * state->rc_mtu; state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP); mem_attr.mr_len = mem_size; mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr " "failed"); kmem_free(state->rc_tx_mr_bufs, mem_size); state->rc_tx_mr_bufs = NULL; return (DDI_FAILURE); } state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe * sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP); /* * Set up the buf chain */ memp = state->rc_tx_mr_bufs; mutex_enter(&state->rc_tx_large_bufs_lock); lbufp = state->rc_tx_largebuf_desc_base; for (i = 0; i < num_swqe; i++) { lbufp->lb_buf = memp; lbufp->lb_next = lbufp + 1; tail = lbufp; memp += state->rc_mtu; lbufp++; } tail->lb_next = NULL; /* * Set up the buffer information in ibd state */ state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base; state->rc_tx_largebuf_nfree = num_swqe; mutex_exit(&state->rc_tx_large_bufs_lock); return (DDI_SUCCESS); } void ibd_rc_fini_tx_largebuf_list(ibd_state_t *state) { uint32_t num_swqe; num_swqe = state->id_rc_num_swqe - 1; if (ibt_deregister_mr(state->id_hca_hdl, state->rc_tx_mr_hdl) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() " "failed"); } state->rc_tx_mr_hdl = NULL; kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu); state->rc_tx_mr_bufs = NULL; kmem_free(state->rc_tx_largebuf_desc_base, num_swqe * sizeof (ibd_rc_tx_largebuf_t)); state->rc_tx_largebuf_desc_base = NULL; } static int ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan) { ibt_mr_attr_t mem_attr; ibd_state_t *state; state = chan->state; ASSERT(state != NULL); /* * Allocate one big chunk for all regular tx copy bufs */ mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh; chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP); /* * Do one memory registration on the entire txbuf area */ mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed"); ASSERT(mem_attr.mr_len == chan->scq_size * state->id_rc_tx_copy_thresh); kmem_free(chan->tx_mr_bufs, mem_attr.mr_len); chan->tx_mr_bufs = NULL; return (DDI_FAILURE); } return (DDI_SUCCESS); } /* * Allocate the statically allocated Tx buffer list. 
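 *
 * Each of the chan->scq_size swqes is wired to a fixed
 * id_rc_tx_copy_thresh sized slice of the copybuf region registered
 * above, so small sends can be bcopy'd into pre-registered memory
 * instead of paying for a per-packet ibt_map_mem_iov().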
*/ static int ibd_rc_init_txlist(ibd_rc_chan_t *chan) { ibd_swqe_t *swqe; int i; ibt_lkey_t lkey; ibd_state_t *state = chan->state; if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS) return (DDI_FAILURE); /* * Allocate and setup the swqe list */ lkey = chan->tx_mr_desc.md_lkey; chan->tx_wqes = kmem_zalloc(chan->scq_size * sizeof (ibd_swqe_t), KM_SLEEP); swqe = chan->tx_wqes; for (i = 0; i < chan->scq_size; i++, swqe++) { swqe->swqe_next = NULL; swqe->swqe_im_mblk = NULL; swqe->swqe_copybuf.ic_sgl.ds_key = lkey; swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh); swqe->w_swr.wr_trans = IBT_RC_SRV; /* Add to list */ mutex_enter(&chan->tx_wqe_list.dl_mutex); chan->tx_wqe_list.dl_cnt++; swqe->swqe_next = chan->tx_wqe_list.dl_head; chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe); mutex_exit(&chan->tx_wqe_list.dl_mutex); } return (DDI_SUCCESS); } /* * Free the statically allocated Tx buffer list. */ static void ibd_rc_fini_txlist(ibd_rc_chan_t *chan) { ibd_state_t *state = chan->state; if (chan->tx_mr_hdl != NULL) { if (ibt_deregister_mr(chan->state->id_hca_hdl, chan->tx_mr_hdl) != IBT_SUCCESS) { DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr " "failed"); } chan->tx_mr_hdl = NULL; } if (chan->tx_mr_bufs != NULL) { kmem_free(chan->tx_mr_bufs, chan->scq_size * state->id_rc_tx_copy_thresh); chan->tx_mr_bufs = NULL; } if (chan->tx_wqes != NULL) { kmem_free(chan->tx_wqes, chan->scq_size * sizeof (ibd_swqe_t)); chan->tx_wqes = NULL; } } /* * Acquire send wqe from free list. * Returns error number and send wqe pointer. */ ibd_swqe_t * ibd_rc_acquire_swqes(ibd_rc_chan_t *chan) { ibd_swqe_t *wqe; mutex_enter(&chan->tx_rel_list.dl_mutex); if (chan->tx_rel_list.dl_head != NULL) { /* transfer id_tx_rel_list to id_tx_list */ chan->tx_wqe_list.dl_head = chan->tx_rel_list.dl_head; chan->tx_wqe_list.dl_cnt = chan->tx_rel_list.dl_cnt; chan->tx_wqe_list.dl_pending_sends = B_FALSE; /* clear id_tx_rel_list */ chan->tx_rel_list.dl_head = NULL; chan->tx_rel_list.dl_cnt = 0; mutex_exit(&chan->tx_rel_list.dl_mutex); wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head); chan->tx_wqe_list.dl_cnt -= 1; chan->tx_wqe_list.dl_head = wqe->swqe_next; } else { /* no free swqe */ mutex_exit(&chan->tx_rel_list.dl_mutex); chan->tx_wqe_list.dl_pending_sends = B_TRUE; wqe = NULL; } return (wqe); } /* * Release send wqe back into free list. */ static void ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe) { /* * Add back on Tx list for reuse. 
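 * Completions park swqes on tx_rel_list while senders consume
 * tx_wqe_list; when the latter runs dry, ibd_rc_acquire_swqes() above
 * migrates the whole rel list across in one pointer swap, so the two
 * paths rarely contend on the same lock.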
*/ swqe->swqe_next = NULL; mutex_enter(&chan->tx_rel_list.dl_mutex); chan->tx_rel_list.dl_pending_sends = B_FALSE; swqe->swqe_next = chan->tx_rel_list.dl_head; chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe); chan->tx_rel_list.dl_cnt++; mutex_exit(&chan->tx_rel_list.dl_mutex); } void ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node) { uint_t i; uint_t num_posted; uint_t n_wrs; ibt_status_t ibt_status; ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; ibd_swqe_t *tx_head, *elem; ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; /* post the one request, then check for more */ ibt_status = ibt_post_send(chan->chan_hdl, &node->w_swr, 1, NULL); if (ibt_status != IBT_SUCCESS) { ibd_print_warn(chan->state, "ibd_post_send: " "posting one wr failed: ret=%d", ibt_status); ibd_rc_tx_cleanup(node); } tx_head = NULL; for (;;) { if (tx_head == NULL) { mutex_enter(&chan->tx_post_lock); tx_head = chan->tx_head; if (tx_head == NULL) { chan->tx_busy = 0; mutex_exit(&chan->tx_post_lock); return; } chan->tx_head = NULL; mutex_exit(&chan->tx_post_lock); } /* * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs * at a time if possible, and keep posting them. */ for (n_wrs = 0, elem = tx_head; (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { nodes[n_wrs] = elem; wrs[n_wrs] = elem->w_swr; } tx_head = elem; ASSERT(n_wrs != 0); /* * If posting fails for some reason, we'll never receive * completion intimation, so we'll need to cleanup. But * we need to make sure we don't clean up nodes whose * wrs have been successfully posted. We assume that the * hca driver returns on the first failure to post and * therefore the first 'num_posted' entries don't need * cleanup here. */ num_posted = 0; ibt_status = ibt_post_send(chan->chan_hdl, wrs, n_wrs, &num_posted); if (ibt_status != IBT_SUCCESS) { ibd_print_warn(chan->state, "ibd_post_send: " "posting multiple wrs failed: " "requested=%d, done=%d, ret=%d", n_wrs, num_posted, ibt_status); for (i = num_posted; i < n_wrs; i++) ibd_rc_tx_cleanup(nodes[i]); } } } /* * Common code that deals with clean ups after a successful or * erroneous transmission attempt. */ void ibd_rc_tx_cleanup(ibd_swqe_t *swqe) { ibd_ace_t *ace = swqe->w_ahandle; ibd_state_t *state; ASSERT(ace != NULL); ASSERT(ace->ac_chan != NULL); state = ace->ac_chan->state; /* * If this was a dynamic registration in ibd_send(), * deregister now. */ if (swqe->swqe_im_mblk != NULL) { ASSERT(swqe->w_buftype == IBD_WQE_MAPPED); if (swqe->w_buftype == IBD_WQE_MAPPED) { ibd_unmap_mem(state, swqe); } freemsg(swqe->swqe_im_mblk); swqe->swqe_im_mblk = NULL; } else { ASSERT(swqe->w_buftype != IBD_WQE_MAPPED); } if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) { ibd_rc_tx_largebuf_t *lbufp; lbufp = swqe->w_rc_tx_largebuf; ASSERT(lbufp != NULL); mutex_enter(&state->rc_tx_large_bufs_lock); lbufp->lb_next = state->rc_tx_largebuf_free_head; state->rc_tx_largebuf_free_head = lbufp; state->rc_tx_largebuf_nfree ++; mutex_exit(&state->rc_tx_large_bufs_lock); swqe->w_rc_tx_largebuf = NULL; } /* * Release the send wqe for reuse. */ ibd_rc_release_swqe(ace->ac_chan, swqe); /* * Drop the reference count on the AH; it can be reused * now for a different destination if there are no more * posted sends that will use it. This can be eliminated * if we can always associate each Tx buffer with an AH. * The ace can be null if we are cleaning up from the * ibd_send() error path. 
 */
	ibd_dec_ref_ace(state, ace);
}

void
ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_state_t *state = chan->state;
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	ibd_ace_t *ace;
	uint_t numwcs, real_numwcs;
	int i;
	boolean_t encount_error;

	wcs = chan->tx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;
	encount_error = B_FALSE;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				if (encount_error == B_FALSE) {
					/*
					 * This RC channel is in error status,
					 * remove it.
					 */
					encount_error = B_TRUE;
					mutex_enter(&state->id_ac_mutex);
					if ((chan->chan_state ==
					    IBD_RC_STATE_ACT_ESTAB) &&
					    (chan->state->id_link_state ==
					    LINK_STATE_UP) &&
					    ((ace = ibd_acache_find(state,
					    &chan->ace->ac_mac, B_FALSE, 0))
					    != NULL) && (ace == chan->ace)) {
						ASSERT(ace->ac_mce == NULL);
						INC_REF(ace, 1);
						IBD_ACACHE_PULLOUT_ACTIVE(
						    state, ace);
						chan->chan_state =
						    IBD_RC_STATE_ACT_CLOSING;
						mutex_exit(&state->id_ac_mutex);
						state->rc_reset_cnt++;
						DPRINT(30, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, ace=%p, "
						    "link_state=%d, "
						    "reset RC channel",
						    wc->wc_status, chan,
						    chan->ace, chan->state->
						    id_link_state);
						ibd_rc_signal_act_close(
						    state, ace);
					} else {
						mutex_exit(&state->id_ac_mutex);
						state->
						    rc_act_close_simultaneous++;
						DPRINT(40, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, chan_state=%d, "
						    "ace=%p, link_state=%d. "
						    "other thread is closing "
						    "it", wc->wc_status, chan,
						    chan->chan_state, chan->ace,
						    chan->state->id_link_state);
					}
				}
			}
			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
		}

		mutex_enter(&state->id_sched_lock);
		if (state->id_sched_needed == 0) {
			mutex_exit(&state->id_sched_lock);
		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
			if ((chan->tx_rel_list.dl_cnt +
			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				state->rc_swqe_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_swqe++;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
			mutex_enter(&state->rc_tx_large_bufs_lock);
			if (state->rc_tx_largebuf_nfree >
			    IBD_RC_TX_FREE_THRESH) {
				ASSERT(state->rc_tx_largebuf_free_head != NULL);
				state->id_sched_needed &=
				    ~IBD_RSRC_RC_TX_LARGEBUF;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
				state->rc_xmt_buf_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_largebuf++;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
			mutex_enter(&state->id_tx_list.dl_mutex);
			mutex_enter(&state->id_tx_rel_list.dl_mutex);
			if ((state->id_tx_list.dl_cnt +
			    state->id_tx_rel_list.dl_cnt) >
			    IBD_FREE_SWQES_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_SWQE;
				state->id_sched_cnt++;
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				mac_tx_update(state->id_mh);
			} else {
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else {
			mutex_exit(&state->id_sched_lock);
		}
	}
}

/* Send CQ handler; calls ibd_rc_tx_cleanup() to recycle Tx buffers */
/* ARGSUSED */
/* Send CQ handler; calls ibd_rc_tx_cleanup() to recycle Tx buffers */
/* ARGSUSED */
static void
ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;

	if (ibd_rc_tx_softintr == 1) {
		mutex_enter(&chan->tx_poll_lock);
		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
			mutex_exit(&chan->tx_poll_lock);
			return;
		} else {
			mutex_exit(&chan->tx_poll_lock);
			ddi_trigger_softintr(chan->scq_softintr);
		}
	} else {
		(void) ibd_rc_tx_recycle(arg);
	}
}

static uint_t
ibd_rc_tx_recycle(caddr_t arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_state_t *state = chan->state;
	int flag, redo_flag;
	int redo = 1;

	flag = IBD_CQ_POLLING;
	redo_flag = IBD_REDO_CQ_POLLING;

	mutex_enter(&chan->tx_poll_lock);
	if (chan->tx_poll_busy & flag) {
		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
		    "threads");
		chan->tx_poll_busy |= redo_flag;
		mutex_exit(&chan->tx_poll_lock);
		return (DDI_INTR_CLAIMED);
	}
	chan->tx_poll_busy |= flag;
	mutex_exit(&chan->tx_poll_lock);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for completed packets.
	 */
	ibd_rc_drain_scq(chan, chan->scq_hdl);

	/*
	 * Now enable CQ notifications; all completions originating now
	 * will cause new interrupts.
	 */
	do {
		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			/*
			 * We do not expect a failure here.
			 */
			DPRINT(40, "ibd_rc_tx_recycle: ibt_enable_cq_notify()"
			    " failed");
		}

		ibd_rc_drain_scq(chan, chan->scq_hdl);

		mutex_enter(&chan->tx_poll_lock);
		if (chan->tx_poll_busy & redo_flag) {
			chan->tx_poll_busy &= ~redo_flag;
		} else {
			chan->tx_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&chan->tx_poll_lock);
	} while (redo);

	return (DDI_INTR_CLAIMED);
}

static ibt_status_t
ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
    int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
{
	ibd_service_t *p;
	ibt_status_t status;

	mutex_enter(&ibd_gstate.ig_mutex);
	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
		if (p->is_sid == sid) {
			p->is_ref_cnt++;
			*srv_hdl = p->is_srv_hdl;
			*ret_sid = sid;
			mutex_exit(&ibd_gstate.ig_mutex);
			return (IBT_SUCCESS);
		}
	}

	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
	    num_sids, srv_hdl, ret_sid);
	if (status == IBT_SUCCESS) {
		p = kmem_alloc(sizeof (*p), KM_SLEEP);
		p->is_srv_hdl = *srv_hdl;
		p->is_sid = sid;
		p->is_ref_cnt = 1;
		p->is_link = ibd_gstate.ig_service_list;
		ibd_gstate.ig_service_list = p;
	}

	mutex_exit(&ibd_gstate.ig_mutex);
	return (status);
}

static ibt_status_t
ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
{
	ibd_service_t *p, **pp;
	ibt_status_t status;

	mutex_enter(&ibd_gstate.ig_mutex);
	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
	    pp = &((*pp)->is_link)) {
		p = *pp;
		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
			if (--p->is_ref_cnt == 0) {
				status = ibt_deregister_service(
				    ibd_gstate.ig_ibt_hdl, srv_hdl);
				*pp = p->is_link;	/* link prev to next */
				kmem_free(p, sizeof (*p));
			} else {
				status = IBT_SUCCESS;
			}
			mutex_exit(&ibd_gstate.ig_mutex);
			return (status);
		}
	}
	/* Should not ever get here */
	mutex_exit(&ibd_gstate.ig_mutex);
	return (IBT_FAILURE);
}
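/*
 * Editor's note: the register/deregister pair above keeps a single
 * ibt_register_service() registration per service ID and reference-counts
 * additional users, so concurrent ports sharing one SID do not double
 * register. A minimal sketch of that idiom with hypothetical names
 * (svc_t, svc_hold(), svc_list, svc_lock):
 */
#if 0	/* illustrative only */
typedef struct svc {
	struct svc	*next;
	uint64_t	id;
	uint_t		refcnt;
} svc_t;

static svc_t *svc_list;		/* both protected by svc_lock */
static kmutex_t svc_lock;

static svc_t *
svc_hold(uint64_t id)
{
	svc_t *p;

	mutex_enter(&svc_lock);
	for (p = svc_list; p != NULL; p = p->next) {
		if (p->id == id) {
			p->refcnt++;		/* share the existing entry */
			mutex_exit(&svc_lock);
			return (p);
		}
	}
	p = kmem_zalloc(sizeof (*p), KM_SLEEP);	/* first user registers */
	p->id = id;
	p->refcnt = 1;
	p->next = svc_list;
	svc_list = p;
	mutex_exit(&svc_lock);
	return (p);
}
#endif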
/* Listen with corresponding service ID */
ibt_status_t
ibd_rc_listen(ibd_state_t *state)
{
	ibt_srv_desc_t srvdesc;
	ib_svc_id_t ret_sid;
	ibt_status_t status;
	ib_gid_t gid;

	if (state->rc_listen_hdl != NULL) {
		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
		return (IBT_FAILURE);
	}

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with the service ID; incoming connection
	 * requests should arrive on this service ID.
	 */
	status = ibd_register_service(&srvdesc,
	    IBD_RC_QPN_TO_SID(state->id_qpnum),
	    1, &state->rc_listen_hdl, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: service registration failed, "
		    "ret=%d", status);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl, gid, NULL,
	    state, &state->rc_listen_bind);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: failed to bind port: <%d>",
		    status);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	/*
	 * Legacy OFED used a wrong service ID (one additional zero digit)
	 * for many years. To interoperate with legacy OFED, we also
	 * support this wrong service ID here.
	 */
	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with the legacy service ID; incoming
	 * connection requests from legacy OFED peers arrive on this
	 * service ID.
	 */
	status = ibd_register_service(&srvdesc,
	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: service registration for legacy "
		    "OFED failed, ret=%d", status);
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop, gid,
	    NULL, state, &state->rc_listen_bind_OFED_interop);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: failed to bind port: <%d> for "
		    "legacy OFED listener", status);
		(void) ibd_deregister_service(
		    state->rc_listen_hdl_OFED_interop);
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	return (IBT_SUCCESS);
}

void
ibd_rc_stop_listen(ibd_state_t *state)
{
	int ret;

	/* Disable incoming connection requests */
	if (state->rc_listen_hdl != NULL) {
		ret = ibt_unbind_all_services(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen: "
			    "ibt_unbind_all_services() failed, ret=%d", ret);
		}
		ret = ibd_deregister_service(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen: "
			    "ibd_deregister_service() failed, ret=%d", ret);
		} else {
			state->rc_listen_hdl = NULL;
		}
	}

	/* Disable incoming connection requests from legacy OFED peers */
	if (state->rc_listen_hdl_OFED_interop != NULL) {
		ret = ibt_unbind_all_services(
		    state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen: "
			    "ibt_unbind_all_services() failed: %d", ret);
		}
		ret = ibd_deregister_service(
		    state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen: "
			    "ibd_deregister_service() failed: %d", ret);
		} else {
			state->rc_listen_hdl_OFED_interop = NULL;
		}
	}
}
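/*
 * Editor's note: both listeners above derive their service ID from the
 * local QP number (via the IBD_RC_QPN_TO_SID macros), so the active side
 * can reconstruct the SID to connect to from the peer's QPN alone. A
 * sketch of that mapping; make_sid() and the prefix value are purely
 * illustrative, not the driver's real constants:
 */
#if 0	/* illustrative only */
#define	EXAMPLE_SID_PREFIX	0x1000000000000000ULL	/* hypothetical */

static uint64_t
make_sid(uint64_t prefix, uint32_t qpn)
{
	/* the low 24 bits carry the QPN; the rest identifies the service */
	return (prefix | (qpn & 0xffffff));
}
#endif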
void
ibd_rc_close_all_chan(ibd_state_t *state)
{
	ibd_rc_chan_t *rc_chan;
	ibd_ace_t *ace, *pre_ace;
	uint_t attempts;

	/* Disable all Rx routines */
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	rc_chan = state->rc_pass_chan_list.chan_list;
	while (rc_chan != NULL) {
		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
		rc_chan = rc_chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

	if (state->rc_enable_srq) {
		attempts = 10;
		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are pending bufs with the network
				 * layer and we have no choice but to wait
				 * for them to be done with. Reap all the
				 * Tx/Rx completions that were posted since
				 * we turned off the notification and
				 * return failure.
				 */
				break;
			}
		}
	}

	/* Close all passive RC channels */
	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	while (rc_chan != NULL) {
		(void) ibd_rc_pas_close(rc_chan, B_TRUE, B_FALSE);
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_pass_chan_list);
	}

	/* Close all active RC channels */
	mutex_enter(&state->id_ac_mutex);
	state->id_ac_hot_ace = NULL;
	ace = list_head(&state->id_ah_active);
	while ((pre_ace = ace) != NULL) {
		ace = list_next(&state->id_ah_active, ace);
		if (pre_ace->ac_chan != NULL) {
			INC_REF(pre_ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace);
			pre_ace->ac_chan->chan_state =
			    IBD_RC_STATE_ACT_CLOSING;
			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
			    pre_ace->ac_chan);
		}
	}
	mutex_exit(&state->id_ac_mutex);

	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
	while (rc_chan != NULL) {
		ace = rc_chan->ace;
		ibd_rc_act_close(rc_chan, B_TRUE);
		if (ace != NULL) {
			mutex_enter(&state->id_ac_mutex);
			ASSERT(ace->ac_ref != 0);
			atomic_dec_32(&ace->ac_ref);
			ace->ac_chan = NULL;
			if ((ace->ac_ref == 0) ||
			    (ace->ac_ref == CYCLEVAL)) {
				IBD_ACACHE_INSERT_FREE(state, ace);
				ace->ac_ref = 0;
			} else {
				ace->ac_ref |= CYCLEVAL;
				state->rc_delay_ace_recycle++;
			}
			mutex_exit(&state->id_ac_mutex);
		}
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}

	attempts = 400;
	while (((state->rc_num_tx_chan != 0) ||
	    (state->rc_num_rx_chan != 0)) && (attempts > 0)) {
		/* Another thread is closing the CM channel; wait for it */
		delay(drv_usectohz(100000));
		attempts--;
	}
}

void
ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path)
{
	ibt_status_t status;

	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return;

	status = ibd_rc_connect(state, ace, path,
	    IBD_RC_SERVICE_ID_OFED_INTEROP);

	if (status != IBT_SUCCESS) {
		/* wait for the peer to remove the stale channel */
		delay(drv_usectohz(10000));
		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
			return;
		status = ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID_OFED_INTEROP);
	}

	if (status != IBT_SUCCESS) {
		/* wait for the peer to remove the stale channel */
		delay(drv_usectohz(10000));
		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
			return;
		(void) ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID);
	}
}
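/*
 * Editor's note: ibd_rc_try_connect() above retries the legacy OFED
 * service ID once before falling back to the standard ID, pausing between
 * attempts so the peer can tear down any stale channel. A compressed
 * sketch of that retry ladder; peer_t, try_connect() and the SID values
 * are hypothetical:
 */
#if 0	/* illustrative only */
typedef struct peer peer_t;
extern int try_connect(peer_t *, uint64_t);

#define	LEGACY_SID	0x1ULL		/* hypothetical values */
#define	STANDARD_SID	0x2ULL

static void
connect_with_fallback(peer_t *peer)
{
	uint64_t sids[] = { LEGACY_SID, LEGACY_SID, STANDARD_SID };
	uint_t i;

	for (i = 0; i < sizeof (sids) / sizeof (sids[0]); i++) {
		if (try_connect(peer, sids[i]) == 0)
			return;		/* connected */
		/* give the peer time to tear down any stale channel */
		delay(drv_usectohz(10000));
	}
}
#endif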
/*
 * Allocates a channel and sets ace->ac_chan to it.
 * Opens the channel.
 */
ibt_status_t
ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path,
    uint64_t ietf_cm_service_id)
{
	ibt_status_t status = 0;
	ibt_rc_returns_t open_returns;
	ibt_chan_open_args_t open_args;
	ibd_rc_msg_hello_t hello_req_msg;
	ibd_rc_msg_hello_t *hello_ack_msg;
	ibd_rc_chan_t *chan;
	ibt_ud_dest_query_attr_t dest_attrs;

	ASSERT(ace != NULL);
	ASSERT(ace->ac_mce == NULL);
	ASSERT(ace->ac_chan == NULL);

	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) !=
	    IBT_SUCCESS) {
		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
		return (status);
	}

	ace->ac_chan = chan;
	chan->state = state;
	chan->ace = ace;

	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);

	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);

	/*
	 * Open the channel.
	 */
	bzero(&open_args, sizeof (ibt_chan_open_args_t));
	bzero(&open_returns, sizeof (ibt_rc_returns_t));

	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;

	/*
	 * Update the path record with the SID.
	 */
	if ((status = ibt_query_ud_dest(ace->ac_dest, &dest_attrs)) !=
	    IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_connect: ibt_query_ud_dest() failed, "
		    "ret=%d", status);
		/* release what was allocated above before bailing out */
		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
		ibd_rc_free_chan(chan);
		ace->ac_chan = NULL;
		return (status);
	}

	path->pi_sid =
	    ietf_cm_service_id | ((dest_attrs.ud_dst_qpn) & 0xffffff);

	/* pre-allocate memory for the hello ack message */
	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
	open_returns.rc_priv_data = hello_ack_msg;

	open_args.oc_path = path;

	open_args.oc_path_rnr_retry_cnt = 1;
	open_args.oc_path_retry_cnt = 1;

	/* We don't do RDMA */
	open_args.oc_rdma_ra_out = 0;
	open_args.oc_rdma_ra_in = 0;

	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
	open_args.oc_priv_data = (void *)(&hello_req_msg);

	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
	ASSERT(open_args.oc_cm_handler != NULL);

	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
	    IBT_BLOCKING, &open_args, &open_returns);
	if (status == IBT_SUCCESS) {
		/* Success! */
		DPRINT(2, "ibd_rc_connect: ibt_open_rc_channel() succeeded");
		state->rc_conn_succ++;
		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
		return (IBT_SUCCESS);
	}

	/* failure */
	(void) ibt_flush_channel(chan->chan_hdl);
	ibd_rc_free_chan(chan);
	ace->ac_chan = NULL;

	/* report the error from open_returns and exit */
	DPRINT(30, "ibd_rc_connect: ibt_open_rc_channel() failed, "
	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x, "
	    "peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
	    dest_attrs.ud_dst_qpn);
	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
	return (status);
}
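/*
 * Editor's note: the handshake above carries a private-data "hello" in the
 * CM REQ/REP, with each side advertising its UD QP number and RC MTU in
 * network byte order. A sketch of building such a message; the struct
 * mirrors ibd_rc_msg_hello_t conceptually, but hello_msg_t and the field
 * names here are illustrative:
 */
#if 0	/* illustrative only */
typedef struct hello_msg {
	uint32_t	qpn;	/* sender's UD QP number */
	uint32_t	mtu;	/* sender's RC MTU */
} hello_msg_t;

static void
hello_fill(hello_msg_t *m, uint32_t qpn, uint32_t mtu)
{
	m->qpn = htonl(qpn);	/* wire format is big-endian */
	m->mtu = htonl(mtu);
}
#endif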
"ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x," " peer qpn=0x%x", status, (int)open_returns.rc_status, ace, hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn, dest_attrs.ud_dst_qpn); kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t)); return (status); } void ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace) { ibd_req_t *req; req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) { ibd_print_warn(state, "ibd_rc_signal_act_close: alloc " "ibd_req_t fail"); mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex); ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list; state->rc_obs_act_chan_list.chan_list = ace->ac_chan; mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex); } else { req->rq_ptr = ace->ac_chan; ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN); } } void ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace) { ibd_req_t *req; mutex_enter(&state->rc_ace_recycle_lock); if (state->rc_ace_recycle != NULL) { mutex_exit(&state->rc_ace_recycle_lock); return; } req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) { mutex_exit(&state->rc_ace_recycle_lock); return; } state->rc_ace_recycle = ace; mutex_exit(&state->rc_ace_recycle_lock); ASSERT(ace->ac_mce == NULL); INC_REF(ace, 1); IBD_ACACHE_PULLOUT_ACTIVE(state, ace); req->rq_ptr = ace; ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE); } /* * Close an active channel * * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel() */ static void ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan) { ibd_state_t *state; ibd_ace_t *ace; uint_t times; ibt_status_t ret; ASSERT(chan != NULL); chan->state->rc_act_close++; switch (chan->chan_state) { case IBD_RC_STATE_ACT_CLOSING: /* stale, close it */ case IBD_RC_STATE_ACT_ESTAB: DPRINT(30, "ibd_rc_act_close-1: close and free chan, " "act_state=%d, chan=%p", chan->chan_state, chan); chan->chan_state = IBD_RC_STATE_ACT_CLOSED; ibt_set_cq_handler(chan->rcq_hdl, 0, 0); /* * Wait send queue empty. Its old value is 50 (5 seconds). But * in my experiment, 5 seconds is not enough time to let IBTL * return all buffers and ace->ac_ref. I tried 25 seconds, it * works well. As another evidence, I saw IBTL takes about 17 * seconds every time it cleans a stale RC channel. 
/*
 * Close an active channel.
 *
 * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
 */
static void
ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan)
{
	ibd_state_t *state;
	ibd_ace_t *ace;
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);

	chan->state->rc_act_close++;
	switch (chan->chan_state) {
	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
	case IBD_RC_STATE_ACT_ESTAB:
		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
		    "act_state=%d, chan=%p", chan->chan_state, chan);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);

		/*
		 * Wait until the send queue is empty. The timeout used to
		 * be 50 (5 seconds), but 5 seconds is not always enough
		 * for IBTL to return all buffers and ace->ac_ref
		 * references; 25 seconds has proven sufficient in testing.
		 * (IBTL takes about 17 seconds to clean up a stale RC
		 * channel.)
		 */
		times = 250;
		ace = chan->ace;
		ASSERT(ace != NULL);
		state = chan->state;
		ASSERT(state != NULL);
		mutex_enter(&state->id_ac_mutex);
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		mutex_enter(&chan->tx_rel_list.dl_mutex);
		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
		    != chan->scq_size) || ((ace->ac_ref != 1) &&
		    (ace->ac_ref != (CYCLEVAL+1)))) {
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_exit(&state->id_ac_mutex);
			times--;
			if (times == 0) {
				state->rc_act_close_not_clean++;
				DPRINT(40, "ibd_rc_act_close: dl_cnt(tx_wqe_"
				    "list=%d, tx_rel_list=%d) != chan->"
				    "scq_size=%d, OR ac_ref(=%d) not clean",
				    chan->tx_wqe_list.dl_cnt,
				    chan->tx_rel_list.dl_cnt,
				    chan->scq_size, ace->ac_ref);
				break;
			}
			mutex_enter(&chan->tx_poll_lock);
			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_act_close: multiple "
				    "polling threads");
				mutex_exit(&chan->tx_poll_lock);
			} else {
				chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&chan->tx_poll_lock);
				ibd_rc_drain_scq(chan, chan->scq_hdl);
				mutex_enter(&chan->tx_poll_lock);
				chan->tx_poll_busy = 0;
				mutex_exit(&chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			mutex_enter(&state->id_ac_mutex);
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
		}
		if (times != 0) {
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_exit(&state->id_ac_mutex);
		}

		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		if (is_close_rc_chan) {
			ret = ibt_close_rc_channel(chan->chan_hdl,
			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL,
			    NULL, 0);
			if (ret != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_act_close: ibt_close_rc_"
				    "channel() failed, chan=%p, ret=%d",
				    chan, ret);
			} else {
				DPRINT(30, "ibd_rc_act_close: ibt_close_rc_"
				    "channel() succeeded, chan=%p", chan);
			}
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_REP_RECV:
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_ERROR:
		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ACT_ERROR branch");
		break;
	default:
		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
		    "chan=%p", chan->chan_state, chan);
	}
}
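/*
 * Editor's note: the drain loop above, like ibd_rc_tx_recycle(), serializes
 * CQ polling with a pair of flag bits rather than holding a lock across the
 * poll: one bit marks "a thread owns the CQ", the other records "someone
 * asked for another pass while it was busy". A sketch of the handoff; the
 * names (chan_s, poll_cq_once(), POLLING, REDO, drain_cq()) are
 * hypothetical:
 */
#if 0	/* illustrative only */
#define	POLLING	0x1
#define	REDO	0x2

typedef struct chan_s {
	kmutex_t	poll_lock;
	uint_t		poll_busy;
} chan_s_t;

extern void drain_cq(chan_s_t *);

static void
poll_cq_once(chan_s_t *ch)
{
	mutex_enter(&ch->poll_lock);
	if (ch->poll_busy & POLLING) {
		/* another thread owns the CQ; ask it to go around again */
		ch->poll_busy |= REDO;
		mutex_exit(&ch->poll_lock);
		return;
	}
	ch->poll_busy |= POLLING;
	mutex_exit(&ch->poll_lock);

	for (;;) {
		drain_cq(ch);		/* poll outside the lock */
		mutex_enter(&ch->poll_lock);
		if (ch->poll_busy & REDO) {
			ch->poll_busy &= ~REDO;
			mutex_exit(&ch->poll_lock);
			continue;	/* missed work: poll again */
		}
		ch->poll_busy &= ~POLLING;
		mutex_exit(&ch->poll_lock);
		return;
	}
}
#endif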
/*
 * Close a passive channel.
 *
 * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
 *
 * is_timeout_close: if B_TRUE, this function is called by the connection
 * reaper (refer to ibd_rc_conn_timeout_call). When the connection reaper
 * calls ibd_rc_pas_close() and finds that dl_bufs_outstanding or
 * chan->rcq_invoking is non-zero, it can simply put that channel back on
 * the passive channel list and move on, since that may indicate the
 * channel became active again by the time we started its cleanup. It is
 * costlier to do the cleanup and then re-initiate channel establishment,
 * so it helps to be conservative in the cleanup.
 */
int
ibd_rc_pas_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan,
    boolean_t is_timeout_close)
{
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);
	chan->state->rc_pas_close++;

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_ESTAB:
		if (is_timeout_close) {
			if ((chan->rcq_invoking != 0) ||
			    ((!chan->state->rc_enable_srq) &&
			    (chan->rx_wqe_list.dl_bufs_outstanding > 0))) {
				if (ibd_rc_re_add_to_pas_chan_list(chan)) {
					return (DDI_FAILURE);
				}
			}
		}
		/*
		 * First, stop receive interrupts; this stops the
		 * connection from handing up buffers to higher layers.
		 * Wait for receive buffers to be returned; give up
		 * after 5 seconds.
		 */
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait 0.01 second for ibt_set_cq_handler() to take effect */
		delay(drv_usectohz(10000));
		if (!chan->state->rc_enable_srq) {
			times = 50;
			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
				delay(drv_usectohz(100000));
				if (--times == 0) {
					DPRINT(40, "ibd_rc_pas_close: "
					    "reclaiming failed");
					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
					ibt_set_cq_handler(chan->rcq_hdl,
					    ibd_rc_rcq_handler,
					    (void *)(uintptr_t)chan);
					return (DDI_FAILURE);
				}
			}
		}
		times = 50;
		while (chan->rcq_invoking != 0) {
			delay(drv_usectohz(100000));
			if (--times == 0) {
				DPRINT(40, "ibd_rc_pas_close: "
				    "rcq handler is being invoked");
				chan->state->rc_pas_close_rcq_invoking++;
				break;
			}
		}
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
		    "chan_state=%d, chan=%p", chan->chan_state, chan);
		if (is_close_rc_chan) {
			ret = ibt_close_rc_channel(chan->chan_hdl,
			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL,
			    NULL, 0);
			if (ret != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_pas_close: ibt_close_rc_"
				    "channel() failed, chan=%p, ret=%d",
				    chan, ret);
			} else {
				DPRINT(30, "ibd_rc_pas_close: ibt_close_rc_"
				    "channel() succeeded, chan=%p", chan);
			}
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_PAS_REQ_RECV:
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	default:
		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, "
		    "chan=%p", chan->chan_state, chan);
	}
	return (DDI_SUCCESS);
}

/*
 * Passive side:
 * Handle an incoming CM REQ from the active side.
 *
 * On success, this function allocates an ibd_rc_chan_t and
 * assigns it to "*ret_conn".
 */
static ibt_cm_status_t
ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
    ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
    void *ret_priv_data)
{
	ibd_rc_msg_hello_t *hello_msg;
	ibd_state_t *state = (ibd_state_t *)arg;
	ibd_rc_chan_t *chan;

	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
		return (IBT_CM_REJECT);
	}

	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);

	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);

	if (!state->rc_enable_srq) {
		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
			ibd_rc_free_chan(chan);
			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
			    "failed");
			return (IBT_CM_REJECT);
		}
	}

	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;

	/* We don't do RDMA */
	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;

	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);

	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));

	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
	hello_msg->reserved_qpn = htonl(state->id_qpnum);
	hello_msg->rx_mtu = htonl(state->rc_mtu);

	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
	*ret_conn = chan;

	return (IBT_CM_ACCEPT);
}
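/*
 * Editor's note: the is_timeout_close path above implements a conservative
 * reaper: if a channel still has buffers outstanding or its rcq handler is
 * running, putting it back on the list and retrying later is cheaper than
 * tearing it down and reconnecting. A sketch of that decision; rchan_t,
 * reap_idle_chan(), readd_to_list() and teardown() are hypothetical:
 */
#if 0	/* illustrative only */
typedef struct rchan {
	uint_t	bufs_outstanding;
	uint_t	rcq_invoking;
} rchan_t;

extern int readd_to_list(rchan_t *);
extern void teardown(rchan_t *);

static int
reap_idle_chan(rchan_t *ch)
{
	if (ch->bufs_outstanding > 0 || ch->rcq_invoking != 0) {
		/* looks busy again; defer instead of tearing down */
		if (readd_to_list(ch) == 0)
			return (EBUSY);
	}
	teardown(ch);
	return (0);
}
#endif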
/*
 * ibd_rc_handle_act_estab -- handler for connection established completion
 * on the active side.
 */
static ibt_cm_status_t
ibd_rc_handle_act_estab(ibd_ace_t *ace)
{
	ibt_status_t result;

	switch (ace->ac_chan->chan_state) {
	case IBD_RC_STATE_ACT_REP_RECV:
		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
		result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
		    IBT_NEXT_COMPLETION);
		if (result != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_handle_act_estab: "
			    "ibt_enable_cq_notify(rcq) "
			    "failed: status %d", result);
			return (IBT_CM_REJECT);
		}
		break;
	default:
		DPRINT(40, "ibd_rc_handle_act_estab: default "
		    "branch, act_state=%d", ace->ac_chan->chan_state);
		return (IBT_CM_REJECT);
	}
	return (IBT_CM_ACCEPT);
}

/*
 * ibd_rc_handle_pas_estab -- handler for connection established completion
 * on the passive side.
 */
static ibt_cm_status_t
ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
{
	ibt_status_t result;

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_REQ_RECV:
		chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
		result = ibt_enable_cq_notify(chan->rcq_hdl,
		    IBT_NEXT_COMPLETION);
		if (result != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_handle_pas_estab: "
			    "ibt_enable_cq_notify(rcq) "
			    "failed: status %d", result);
			return (IBT_CM_REJECT);
		}
		break;
	default:
		DPRINT(40, "ibd_rc_handle_pas_estab: default "
		    "branch, chan_state=%d", chan->chan_state);
		return (IBT_CM_REJECT);
	}
	return (IBT_CM_ACCEPT);
}
/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
	ibd_rc_chan_t *rc_chan;
	ibd_state_t *state;
	ibd_rc_msg_hello_t *hello_ack;

	switch (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_REP_RCV:
		ASSERT(ace->ac_chan != NULL);
		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
		DPRINT(30, "ibd_rc_dispatch_actv_mad: hello_ack->mtu=0x%x, "
		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
		    ntohl(hello_ack->reserved_qpn));
		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
		break;
	case IBT_CM_EVENT_CONN_EST:
		ASSERT(ace->ac_chan != NULL);
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
		    "ace=%p, act_state=%d, chan=%p", ace,
		    ace->ac_chan->chan_state, ace->ac_chan);
		result = ibd_rc_handle_act_estab(ace);
		break;
	case IBT_CM_EVENT_CONN_CLOSED:
		rc_chan = ace->ac_chan;
		if (rc_chan == NULL) {
			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
			    "rc_chan == NULL, IBT_CM_EVENT_CONN_CLOSED");
			return (IBT_CM_ACCEPT);
		}
		state = rc_chan->state;
		mutex_enter(&state->id_ac_mutex);
		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
		    != NULL) && (ace == rc_chan->ace)) {
			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ASSERT(ace->ac_mce == NULL);
			INC_REF(ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
			mutex_exit(&state->id_ac_mutex);
			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
			    "reason=%d", ace, rc_chan,
			    ibt_cm_event->cm_event.closed);
		} else {
			mutex_exit(&state->id_ac_mutex);
			state->rc_act_close_simultaneous++;
			DPRINT(40, "ibd_rc_dispatch_actv_mad: another thread "
			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
			    "chan_state=%d", rc_chan->chan_state);
			return (IBT_CM_ACCEPT);
		}
		ibd_rc_act_close(rc_chan, B_FALSE);
		mutex_enter(&state->id_ac_mutex);
		ace->ac_chan = NULL;
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		break;
	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE, "
		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
		    ace, ace->ac_chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);
		/*
		 * No need to free resources here; they are freed in
		 * ibd_rc_connect().
		 */
		break;
	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
		break;
	}

	return (result);
}

/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_rc_chan_t *chan;

	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV, "
		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
		/* Receive an incoming CM REQ from the active side */
		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
		    ret_priv_data);
		return (result);
	}

	if (ibt_cm_event->cm_channel == 0) {
		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
		    "ERROR ibt_cm_event->cm_channel == 0");
		return (IBT_CM_REJECT);
	}

	chan = (ibd_rc_chan_t *)ibt_get_chan_private(
	    ibt_cm_event->cm_channel);
	if (chan == NULL) {
		DPRINT(40, "ibd_rc_dispatch_pass_mad: chan == NULL");
		return (IBT_CM_REJECT);
	}

	switch (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_CONN_EST:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
		    "chan=%p", chan);
		result = ibd_rc_handle_pas_estab(chan);
		break;
	case IBT_CM_EVENT_CONN_CLOSED:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
		    "IBT_CM_EVENT_CONN_CLOSED, chan=%p, reason=%d",
		    chan, ibt_cm_event->cm_event.closed);
		chan = ibd_rc_rm_from_chan_list(
		    &chan->state->rc_pass_chan_list, chan);
		if (chan != NULL)
			(void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
		break;
	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE, "
		    "chan=%p, code: %d, msg: %d, reason=%d", chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);
		chan = ibd_rc_rm_from_chan_list(
		    &chan->state->rc_pass_chan_list, chan);
		if (chan != NULL)
			(void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
		return (IBT_CM_ACCEPT);
	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
		    "chan=%p", ibt_cm_event->cm_type, chan);
		break;
	}

	return (result);
}
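/*
 * Editor's note: the CONN_CLOSED path above (and its twins in the close
 * routines) uses the CYCLEVAL bit as a "recycle later" marker on the ace
 * refcount: if real references remain after the channel is gone, the entry
 * is tagged and reclaimed on a later pass instead of freed immediately. A
 * sketch of the idiom; cache_ent_t, RECYCLE_MARK and insert_free() are
 * hypothetical:
 */
#if 0	/* illustrative only */
#define	RECYCLE_MARK	0x80000000

typedef struct cache_ent {
	kmutex_t	lock;
	uint32_t	ref;
	void		*chan;
} cache_ent_t;

extern void insert_free(cache_ent_t *);

static void
drop_chan_ref(cache_ent_t *ent)
{
	mutex_enter(&ent->lock);
	ent->chan = NULL;
	atomic_dec_32(&ent->ref);
	if (ent->ref == 0 || ent->ref == RECYCLE_MARK) {
		/* no real users left: return the entry to the free list */
		ent->ref = 0;
		insert_free(ent);
	} else {
		/* still referenced: mark it so a later pass reclaims it */
		ent->ref |= RECYCLE_MARK;
	}
	mutex_exit(&ent->lock);
}
#endif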