/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_rsrc_setup_txbufs(eib_t *, int *);
static int eib_rsrc_setup_rxbufs(eib_t *, int *);
static int eib_rsrc_setup_lsobufs(eib_t *, int *);
static void eib_rsrc_init_wqe_pool(eib_t *, eib_wqe_pool_t **,
    ib_memlen_t, int);
static void eib_rsrc_fini_wqe_pool(eib_t *, eib_wqe_pool_t **);
static boolean_t eib_rsrc_ok_to_free_pool(eib_t *, eib_wqe_pool_t *,
    boolean_t);
static int eib_rsrc_grab_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **,
    uint_t, uint_t *, int);
static void eib_rsrc_return_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **,
    uint_t);

static void eib_rb_rsrc_setup_txbufs(eib_t *, boolean_t);
static void eib_rb_rsrc_setup_rxbufs(eib_t *, boolean_t);
static void eib_rb_rsrc_setup_lsobufs(eib_t *, boolean_t);

/*
 * Definitions private to this file
 */
static uint_t eib_lso_num_bufs = EIB_LSO_NUM_BUFS;	/* tunable? */
int
eib_rsrc_setup_bufs(eib_t *ss, int *err)
{
        if (eib_rsrc_setup_txbufs(ss, err) != EIB_E_SUCCESS)
                return (EIB_E_FAILURE);

        if (ss->ei_caps->cp_lso_maxlen && ss->ei_caps->cp_cksum_flags &&
            ss->ei_caps->cp_resv_lkey_capab) {
                if (eib_rsrc_setup_lsobufs(ss, err) != EIB_E_SUCCESS) {
                        eib_rb_rsrc_setup_txbufs(ss, B_FALSE);
                        return (EIB_E_FAILURE);
                }
        }

        if (eib_rsrc_setup_rxbufs(ss, err) != EIB_E_SUCCESS) {
                eib_rb_rsrc_setup_lsobufs(ss, B_FALSE);
                eib_rb_rsrc_setup_txbufs(ss, B_FALSE);
                return (EIB_E_FAILURE);
        }

        return (EIB_E_SUCCESS);
}

int
eib_rsrc_grab_swqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual,
    int pri)
{
        eib_wqe_t *wqe;
        uint32_t *encap_hdr;
        int ret;
        int i;

        ASSERT(ss->ei_tx != NULL);

        ret = eib_rsrc_grab_wqes(ss, ss->ei_tx, wqes, n_req, actual, pri);
        if (ret != EIB_E_SUCCESS)
                return (EIB_E_FAILURE);

        /*
         * See note for eib_rsrc_grab_swqe()
         */
        for (i = 0; i < (*actual); i++) {
                wqe = wqes[i];
                wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS;
                wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest;
                wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND;
                wqe->qe_wr.send.wr_nds = 1;
                wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl;
                wqe->qe_nxt_post = NULL;
                wqe->qe_iov_hdl = NULL;

                encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr;
                *encap_hdr = htonl(EIB_TX_ENCAP_HDR);
        }

        return (EIB_E_SUCCESS);
}

int
eib_rsrc_grab_rwqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual,
    int pri)
{
        ASSERT(ss->ei_rx != NULL);

        return (eib_rsrc_grab_wqes(ss, ss->ei_rx, wqes, n_req, actual, pri));
}

int
eib_rsrc_grab_lsobufs(eib_t *ss, uint_t req_sz, ibt_wr_ds_t *sgl, uint32_t *nds)
{
        eib_lsobkt_t *bkt = ss->ei_lso;
        eib_lsobuf_t *elem;
        eib_lsobuf_t *nxt;
        uint_t frag_sz;
        uint_t num_needed;
        int i;

        ASSERT(req_sz != 0);
        ASSERT(sgl != NULL);
        ASSERT(nds != NULL);

        /*
         * Determine how many bufs we'd need for the size requested
         */
        num_needed = req_sz / EIB_LSO_BUFSZ;
        if ((frag_sz = req_sz % EIB_LSO_BUFSZ) != 0)
                num_needed++;

        if (bkt == NULL)
                return (EIB_E_FAILURE);

        /*
         * If we don't have enough lso bufs, return failure
         */
        mutex_enter(&bkt->bk_lock);
        if (bkt->bk_nfree < num_needed) {
                mutex_exit(&bkt->bk_lock);
                return (EIB_E_FAILURE);
        }

        /*
         * Pick the first "num_needed" bufs from the free list
         */
        elem = bkt->bk_free_head;
        for (i = 0; i < num_needed; i++) {
                ASSERT(elem->lb_isfree != 0);
                ASSERT(elem->lb_buf != NULL);

                nxt = elem->lb_next;

                sgl[i].ds_va = (ib_vaddr_t)(uintptr_t)elem->lb_buf;
                sgl[i].ds_key = bkt->bk_lkey;
                sgl[i].ds_len = EIB_LSO_BUFSZ;

                elem->lb_isfree = 0;
                elem->lb_next = NULL;

                elem = nxt;
        }
        bkt->bk_free_head = elem;

        /*
         * If the requested size is not a multiple of EIB_LSO_BUFSZ, we need
         * to adjust the last sgl entry's length. Since we know we need at
         * least one, the i-1 use below is ok.
         */
        if (frag_sz) {
                sgl[i-1].ds_len = frag_sz;
        }

        /*
         * Update nfree count and return
         */
        bkt->bk_nfree -= num_needed;

        mutex_exit(&bkt->bk_lock);

        *nds = num_needed;

        return (EIB_E_SUCCESS);
}

eib_wqe_t *
eib_rsrc_grab_swqe(eib_t *ss, int pri)
{
        eib_wqe_t *wqe = NULL;
        uint32_t *encap_hdr;

        ASSERT(ss->ei_tx != NULL);
        (void) eib_rsrc_grab_wqes(ss, ss->ei_tx, &wqe, 1, NULL, pri);

        /*
         * Let's reset the swqe basic wr parameters to default. We need
         * to do this because this swqe could've previously been used
         * for a checksum offload (when the flags would've been set)
         * or for an LSO send (in which case the opcode would've been set
         * to a different value), or been iov mapped (in which case the
         * sgl/nds could've been set to different values).  We'll make
         * it easy and initialize it here, so simple transactions can
         * go through without any special effort by the caller.
         *
         * Note that even though the wqe structure is common for both
         * send and recv, they're in two independent pools and the wqe
         * type remains the same throughout its lifetime. So we don't
         * have to worry about resetting any other field.
         */
        if (wqe) {
                wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS;
                wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest;
                wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND;
                wqe->qe_wr.send.wr_nds = 1;
                wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl;
                wqe->qe_nxt_post = NULL;
                wqe->qe_iov_hdl = NULL;

                encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr;
                *encap_hdr = htonl(EIB_TX_ENCAP_HDR);
        }

        return (wqe);
}

eib_wqe_t *
eib_rsrc_grab_rwqe(eib_t *ss, int pri)
{
        eib_wqe_t *wqe = NULL;

        ASSERT(ss->ei_rx != NULL);
        (void) eib_rsrc_grab_wqes(ss, ss->ei_rx, &wqe, 1, NULL, pri);

        return (wqe);
}

void
eib_rsrc_return_swqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan)
{
        ASSERT(ss->ei_tx != NULL);

        eib_rsrc_return_wqes(ss, ss->ei_tx, &wqe, 1);
        if (chan) {
                eib_rsrc_decr_posted_swqe(ss, chan);
        }
}

void
eib_rsrc_return_rwqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan)
{
        ASSERT(ss->ei_rx != NULL);

        eib_rsrc_return_wqes(ss, ss->ei_rx, &wqe, 1);
        if (chan) {
                eib_rsrc_decr_posted_rwqe(ss, chan);
        }
}

void
eib_rsrc_return_lsobufs(eib_t *ss, ibt_wr_ds_t *sgl_p, uint32_t nds)
{
        eib_lsobkt_t *bkt = ss->ei_lso;
        eib_lsobuf_t *elem;
        uint8_t *va;
        ptrdiff_t ndx;
        int i;

        /*
         * Nowhere to return the buffers to ??
         */
        if (bkt == NULL)
                return;

        mutex_enter(&bkt->bk_lock);

        for (i = 0; i < nds; i++) {
                va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;

                ASSERT(va >= bkt->bk_mem);
                ASSERT(va < (bkt->bk_mem + bkt->bk_nelem * EIB_LSO_BUFSZ));

                /*
                 * Figure out the buflist element this sgl buffer corresponds
                 * to and put it back at the head
                 */
                ndx = ((uintptr_t)va - (uintptr_t)bkt->bk_mem) / EIB_LSO_BUFSZ;
                elem = bkt->bk_bufl + ndx;

                ASSERT(elem->lb_isfree == 0);
                ASSERT(elem->lb_buf == va);

                elem->lb_isfree = 1;
                elem->lb_next = bkt->bk_free_head;
                bkt->bk_free_head = elem;
        }
        bkt->bk_nfree += nds;

        /*
         * If the number of available lso buffers just crossed the
         * threshold, wakeup anyone who may be sleeping on the event.
         */
        if (((bkt->bk_nfree - nds) < EIB_LSO_FREE_BUFS_THRESH) &&
            (bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH)) {
                cv_broadcast(&bkt->bk_cv);
        }

        mutex_exit(&bkt->bk_lock);
}

/*ARGSUSED*/
void
eib_rsrc_decr_posted_swqe(eib_t *ss, eib_chan_t *chan)
{
        ASSERT(chan != NULL);

        mutex_enter(&chan->ch_tx_lock);

        chan->ch_tx_posted--;
        if ((chan->ch_tear_down) && (chan->ch_tx_posted == 0)) {
                cv_signal(&chan->ch_tx_cv);
        }

        mutex_exit(&chan->ch_tx_lock);
}

void
eib_rsrc_decr_posted_rwqe(eib_t *ss, eib_chan_t *chan)
{
        eib_chan_t *tail;
        boolean_t queue_for_refill = B_FALSE;

        ASSERT(chan != NULL);

        /*
         * Decrement the ch_rx_posted count. If we are tearing this channel
         * down, signal the waiter when the count reaches 0.  If we aren't
         * tearing the channel down, see if the count has gone below the low
         * water mark.  If it has, and if this channel isn't already being
         * refilled, queue the channel up with the service thread for a
         * rwqe refill.
         */
        mutex_enter(&chan->ch_rx_lock);

        chan->ch_rx_posted--;
        if (chan->ch_tear_down) {
                if (chan->ch_rx_posted == 0)
                        cv_signal(&chan->ch_rx_cv);
        } else if (chan->ch_rx_posted < chan->ch_lwm_rwqes) {
                if (chan->ch_rx_refilling == B_FALSE) {
                        chan->ch_rx_refilling = B_TRUE;
                        queue_for_refill = B_TRUE;
                }
        }

        mutex_exit(&chan->ch_rx_lock);

        if (queue_for_refill) {
                mutex_enter(&ss->ei_rxpost_lock);

                chan->ch_rxpost_next = NULL;
                for (tail = ss->ei_rxpost; tail; tail = tail->ch_rxpost_next) {
                        if (tail->ch_rxpost_next == NULL)
                                break;
                }
                if (tail) {
                        tail->ch_rxpost_next = chan;
                } else {
                        ss->ei_rxpost = chan;
                }

                cv_signal(&ss->ei_rxpost_cv);
                mutex_exit(&ss->ei_rxpost_lock);
        }
}

void
eib_rsrc_txwqes_needed(eib_t *ss)
{
        eib_wqe_pool_t *wp = ss->ei_tx;

        EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf);

        mutex_enter(&wp->wp_lock);
        if ((wp->wp_status & EIB_TXWQE_SHORT) == 0) {
                wp->wp_status |= EIB_TXWQE_SHORT;
                cv_broadcast(&wp->wp_cv);
        }
        mutex_exit(&wp->wp_lock);
}

void
eib_rsrc_lsobufs_needed(eib_t *ss)
{
        eib_lsobkt_t *bkt = ss->ei_lso;

        EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf);

        if (bkt == NULL) {
                EIB_DPRINTF_WARN(ss->ei_instance,
                    "eib_rsrc_lsobufs_needed: "
                    "lso bufs seem to be needed even though "
                    "LSO support was not advertised");
                return;
        }

        mutex_enter(&bkt->bk_lock);
        if ((bkt->bk_status & EIB_LBUF_SHORT) == 0) {
                bkt->bk_status |= EIB_LBUF_SHORT;
                cv_broadcast(&bkt->bk_cv);
        }
        mutex_exit(&bkt->bk_lock);
}

boolean_t
eib_rsrc_rxpool_low(eib_wqe_t *wqe)
{
        eib_wqe_pool_t *wp = wqe->qe_pool;
        boolean_t ret = B_FALSE;

        /*
         * Set the EIB_RXWQE_SHORT flag when the number of free wqes
         * in the rx pool falls below the low threshold for rwqes and
         * clear it only when the number of free wqes gets back above
         * the high water mark.
         */
        mutex_enter(&wp->wp_lock);

        if (wp->wp_nfree <= EIB_NFREE_RWQES_LOW) {
                wp->wp_status |= (EIB_RXWQE_SHORT);
        } else if (wp->wp_nfree >= EIB_NFREE_RWQES_HWM) {
                wp->wp_status &= (~EIB_RXWQE_SHORT);
        }

        if ((wp->wp_status & EIB_RXWQE_SHORT) == EIB_RXWQE_SHORT)
                ret = B_TRUE;

        mutex_exit(&wp->wp_lock);

        return (ret);
}

void
eib_rb_rsrc_setup_bufs(eib_t *ss, boolean_t force)
{
        eib_rb_rsrc_setup_rxbufs(ss, force);
        eib_rb_rsrc_setup_lsobufs(ss, force);
        eib_rb_rsrc_setup_txbufs(ss, force);
}

static int
eib_rsrc_setup_txbufs(eib_t *ss, int *err)
{
        eib_wqe_pool_t *tx;
        eib_wqe_t *wqe;
        ibt_ud_dest_hdl_t dest;
        ibt_mr_attr_t attr;
        ibt_mr_desc_t desc;
        ibt_status_t ret;
        kthread_t *kt;
        uint32_t *encap_hdr;
        uint8_t *buf;
        uint_t mtu = ss->ei_props->ep_mtu;
        uint_t tx_bufsz;
        uint_t blk;
        uint_t ndx;
        uint_t i;
        int lso_enabled;

        /*
         * Try to allocate and initialize the tx wqe pool
         */
        if (ss->ei_tx != NULL)
                return (EIB_E_SUCCESS);

        /*
         * If we keep the tx buffers as mtu-sized, then potentially every
         * LSO request that cannot be satisfactorily mapped will use up
         * the 8K large (default size) lso buffers. This may be inadvisable
         * given that lso buffers are a scarce resource.  Instead, we'll
         * slightly raise the size of the copy buffers in the send wqes
         * (say to EIB_TX_COPY_THRESH) so that requests that cannot be
         * mapped could still avoid using the 8K LSO buffers if they're
         * less than the copy threshold size.
         */
        lso_enabled = ss->ei_caps->cp_lso_maxlen &&
            ss->ei_caps->cp_cksum_flags && ss->ei_caps->cp_resv_lkey_capab;
        tx_bufsz = ((lso_enabled) && (EIB_TX_COPY_THRESH > mtu)) ?
            EIB_TX_COPY_THRESH : mtu;

        eib_rsrc_init_wqe_pool(ss, &ss->ei_tx, tx_bufsz, EIB_WP_TYPE_TX);
        tx = ss->ei_tx;

        /*
         * Register the TX memory region with IBTF for use
         */
        attr.mr_vaddr = tx->wp_vaddr;
        attr.mr_len = tx->wp_memsz;
        attr.mr_as = NULL;
        attr.mr_flags = IBT_MR_SLEEP;

        ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr,
            &tx->wp_mr, &desc);
        if (ret != IBT_SUCCESS) {
                EIB_DPRINTF_ERR(ss->ei_instance,
                    "eib_rsrc_setup_txbufs: "
                    "ibt_register_mr() failed for tx "
                    "region (0x%llx, 0x%llx) with ret=%d",
                    attr.mr_vaddr, attr.mr_len, ret);

                *err = EINVAL;
                goto rsrc_setup_txbufs_fail;
        }
        tx->wp_lkey = desc.md_lkey;

        /*
         * Now setup the tx wqes
         */
        buf = (uint8_t *)(uintptr_t)(tx->wp_vaddr);
        for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) {
                for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) {
                        wqe = &tx->wp_wqe[i];

                        /*
                         * Allocate a UD destination handle
                         */
                        ret = ibt_alloc_ud_dest(ss->ei_hca_hdl,
                            IBT_UD_DEST_NO_FLAGS, ss->ei_pd_hdl, &dest);
                        if (ret != IBT_SUCCESS) {
                                EIB_DPRINTF_ERR(ss->ei_instance,
                                    "eib_rsrc_setup_txbufs: "
                                    "ibt_alloc_ud_dest(hca_hdl=0x%llx) "
                                    "failed, ret=%d", ss->ei_hca_hdl, ret);

                                *err = ENOMEM;
                                goto rsrc_setup_txbufs_fail;
                        }

                        /*
                         * These parameters should remain fixed throughout the
                         * lifetime of this wqe.
                         */
                        wqe->qe_pool = tx;
                        wqe->qe_cpbuf = buf;
                        wqe->qe_bufsz = tx_bufsz;

                        /*
                         * The qe_dest and qe_payload_hdr are specific to tx
                         * only, but remain unchanged throughout the lifetime
                         * of the wqe.
                         *
                         * The payload header is normally used when we have an
                         * LSO packet to send.  Since the EoIB encapsulation
                         * header won't be part of the message we get from the
                         * network layer, we'll need to copy the lso header
                         * into a new buffer every time before we hand over
                         * the LSO send request to the hca driver.
                         */
                        wqe->qe_dest = dest;
                        wqe->qe_payload_hdr =
                            kmem_zalloc(EIB_MAX_PAYLOAD_HDR_SZ, KM_SLEEP);

                        /*
                         * The encapsulation header is at the start of the
                         * payload header and is initialized to the default
                         * encapsulation header we use (no multiple segments,
                         * no FCS). This part of the header is not expected
                         * to change.
                         */
                        encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr;
                        *encap_hdr = htonl(EIB_TX_ENCAP_HDR);

                        /*
                         * The parameters set below are used in both the tx
                         * and rx paths.  These parameters (except ds_key) are
                         * reset to these default values in
                         * eib_rsrc_return_wqes().
                         */
                        wqe->qe_sgl.ds_key = tx->wp_lkey;
                        wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf;
                        wqe->qe_sgl.ds_len = wqe->qe_bufsz;
                        wqe->qe_mp = NULL;
                        wqe->qe_info =
                            ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) |
                            ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) |
                            ((uint_t)EIB_WQE_TX << EIB_WQETYP_SHIFT);

                        /*
                         * These tx-specific parameters (except wr_id and
                         * wr_trans) are reset in eib_rsrc_grab_swqes() to make
                         * sure any freshly acquired swqe from the pool has
                         * these default settings for the caller.
                         */
                        wqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)wqe;
                        wqe->qe_wr.send.wr_trans = IBT_UD_SRV;
                        wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS;
                        wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest;
                        wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND;
                        wqe->qe_wr.send.wr_nds = 1;
                        wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl;
                        wqe->qe_nxt_post = NULL;
                        wqe->qe_iov_hdl = NULL;

                        buf += wqe->qe_bufsz;
                }
        }

        /*
         * Before returning, create a kernel thread to monitor the status
         * of wqes in the tx wqe pool.  Note that this thread cannot be
         * created from eib_state_init() during attach(), since the thread
         * expects the wqe pool to be allocated and ready when it starts,
         * and the tx bufs initialization only happens during eib_m_start().
         */
        kt = thread_create(NULL, 0, eib_monitor_tx_wqes, ss, 0,
            &p0, TS_RUN, minclsyspri);
        ss->ei_txwqe_monitor = kt->t_did;

        return (EIB_E_SUCCESS);

rsrc_setup_txbufs_fail:
        eib_rb_rsrc_setup_txbufs(ss, B_FALSE);
        return (EIB_E_FAILURE);
}

static int
eib_rsrc_setup_rxbufs(eib_t *ss, int *err)
{
        eib_wqe_pool_t *rx;
        eib_wqe_t *wqe;
        ibt_mr_attr_t attr;
        ibt_mr_desc_t desc;
        ibt_status_t ret;
        uint8_t *buf;
        uint_t mtu = ss->ei_props->ep_mtu;
        uint_t blk;
        uint_t ndx;
        uint_t i;

        /*
         * Try to allocate and initialize the wqe pool. When this is called
         * during a plumb via the mac m_start callback, we need to check
         * whether a fresh wqe pool allocation is needed at all.  If during a
         * previous unplumb we didn't free the wqe pool because the nw layer
         * was holding on to some rx buffers, we don't need to allocate a new
         * pool and set up the buffers again; we'll just start re-using the
         * previous one.
         */
        if (ss->ei_rx != NULL)
                return (EIB_E_SUCCESS);

        /*
         * The receive buffer has to work for all channels, specifically the
         * data qp of the vnics.  This means that the buffer must be large
         * enough to hold MTU sized IB payload (including the EoIB and
         * ethernet headers) plus the GRH. In addition, because the ethernet
         * header is either 14 or 18 bytes (tagless or vlan tagged), we
         * should have the buffer filled in such a way that the IP header
         * starts at an address that is at least 4-byte aligned.  In order
         * to do this, we need to have some additional room.
         */
        eib_rsrc_init_wqe_pool(ss, &ss->ei_rx,
            mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM, EIB_WP_TYPE_RX);
        rx = ss->ei_rx;

        /*
         * Register the RX memory region with IBTF for use
         */
        attr.mr_vaddr = rx->wp_vaddr;
        attr.mr_len = rx->wp_memsz;
        attr.mr_as = NULL;
        attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;

        ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr,
            &rx->wp_mr, &desc);
        if (ret != IBT_SUCCESS) {
                EIB_DPRINTF_ERR(ss->ei_instance,
                    "eib_rsrc_setup_rxbufs: "
                    "ibt_register_mr() failed for rx "
                    "region (0x%llx, 0x%llx) with ret=%d",
                    attr.mr_vaddr, attr.mr_len, ret);

                *err = EINVAL;
                goto rsrc_setup_rxbufs_fail;
        }
        rx->wp_lkey = desc.md_lkey;

        /*
         * Now setup the rx wqes
         */
        buf = (uint8_t *)(uintptr_t)(rx->wp_vaddr);
        for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) {
                for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) {
                        wqe = &rx->wp_wqe[i];

                        /*
                         * These parameters should remain fixed throughout the
                         * lifetime of this recv wqe. The qe_frp will only be
                         * used by the data channel of vnics and will remain
                         * unused by other channels.
                         */
                        wqe->qe_pool = rx;
                        wqe->qe_cpbuf = buf;
                        wqe->qe_bufsz =
                            mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM;
                        wqe->qe_wr.recv.wr_id = (ibt_wrid_t)(uintptr_t)wqe;
                        wqe->qe_wr.recv.wr_nds = 1;
                        wqe->qe_wr.recv.wr_sgl = &wqe->qe_sgl;
                        wqe->qe_frp.free_func = eib_data_rx_recycle;
                        wqe->qe_frp.free_arg = (caddr_t)wqe;

                        /*
                         * The parameters set below are used in both the tx
                         * and rx paths.  These parameters (except ds_key) are
                         * reset to these default values in
                         * eib_rsrc_return_wqes().
                         */
                        wqe->qe_sgl.ds_key = rx->wp_lkey;
                        wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf;
                        wqe->qe_sgl.ds_len = wqe->qe_bufsz;
                        wqe->qe_mp = NULL;
                        wqe->qe_info =
                            ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) |
                            ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) |
                            ((uint_t)EIB_WQE_RX << EIB_WQETYP_SHIFT);

                        /*
                         * These rx-specific parameters are also reset to
                         * these default values in eib_rsrc_return_wqes().
                         */
                        wqe->qe_chan = NULL;
                        wqe->qe_vnic_inst = -1;

                        buf += (mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM);
                }
        }

        return (EIB_E_SUCCESS);

rsrc_setup_rxbufs_fail:
        eib_rb_rsrc_setup_rxbufs(ss, B_FALSE);
        return (EIB_E_FAILURE);
}

static int
eib_rsrc_setup_lsobufs(eib_t *ss, int *err)
{
        eib_lsobkt_t *bkt;
        eib_lsobuf_t *elem;
        eib_lsobuf_t *tail;
        ibt_mr_attr_t attr;
        ibt_mr_desc_t desc;
        kthread_t *kt;
        uint8_t *lsomem;
        uint8_t *memp;
        ibt_status_t ret;
        int i;

        /*
         * Allocate the lso bucket and space for buffers
         */
        bkt = kmem_zalloc(sizeof (eib_lsobkt_t), KM_SLEEP);
        lsomem = kmem_zalloc(eib_lso_num_bufs * EIB_LSO_BUFSZ, KM_SLEEP);

        /*
         * Register lso memory and save the lkey
         */
        attr.mr_vaddr = (uint64_t)(uintptr_t)lsomem;
        attr.mr_len = eib_lso_num_bufs * EIB_LSO_BUFSZ;
        attr.mr_as = NULL;
        attr.mr_flags = IBT_MR_SLEEP;

        ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr,
            &bkt->bk_mr_hdl, &desc);
        if (ret != IBT_SUCCESS) {
                *err = EINVAL;
                EIB_DPRINTF_ERR(ss->ei_instance,
                    "eib_rsrc_setup_lsobufs: "
                    "ibt_register_mr() failed for LSO "
                    "region (0x%llx, 0x%llx) with ret=%d",
                    attr.mr_vaddr, attr.mr_len, ret);

                kmem_free(lsomem, eib_lso_num_bufs * EIB_LSO_BUFSZ);
                kmem_free(bkt, sizeof (eib_lsobkt_t));

                return (EIB_E_FAILURE);
        }
        bkt->bk_lkey = desc.md_lkey;

        /*
         * Now allocate the buflist.  Note that the elements in the buflist
         * and the buffers in the lso memory have a permanent 1-1 relation,
         * so we can always derive the address of a buflist entry from the
         * address of an lso buffer.
         */
        bkt->bk_bufl = kmem_zalloc(eib_lso_num_bufs * sizeof (eib_lsobuf_t),
            KM_SLEEP);

        /*
         * Set up the lso buf chain
         */
        memp = lsomem;
        elem = bkt->bk_bufl;
        for (i = 0; i < eib_lso_num_bufs; i++) {
                elem->lb_isfree = 1;
                elem->lb_buf = memp;
                elem->lb_next = elem + 1;

                tail = elem;

                memp += EIB_LSO_BUFSZ;
                elem++;
        }
        tail->lb_next = NULL;

        /*
         * Set up the LSO buffer information in eib state
         */
        bkt->bk_free_head = bkt->bk_bufl;
        bkt->bk_mem = lsomem;
        bkt->bk_nelem = eib_lso_num_bufs;
        bkt->bk_nfree = bkt->bk_nelem;

        mutex_init(&bkt->bk_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&bkt->bk_cv, NULL, CV_DEFAULT, NULL);

        ss->ei_lso = bkt;

        /*
         * Before returning, create a kernel thread to monitor the status
         * of lso bufs
         */
        kt = thread_create(NULL, 0, eib_monitor_lso_bufs, ss, 0,
            &p0, TS_RUN, minclsyspri);
        ss->ei_lsobufs_monitor = kt->t_did;

        return (EIB_E_SUCCESS);
}

static void
eib_rsrc_init_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp, ib_memlen_t bufsz,
    int wp_type)
{
        eib_wqe_pool_t *wp;
        uint_t wp_wqesz;
        int i;

        ASSERT(wpp != NULL);
        ASSERT(*wpp == NULL);

        /*
         * Allocate the wqe pool, wqes and bufs
         */
        wp = kmem_zalloc(sizeof (eib_wqe_pool_t), KM_SLEEP);
        wp_wqesz = EIB_WQES_PER_POOL * sizeof (eib_wqe_t);
        wp->wp_wqe = (eib_wqe_t *)kmem_zalloc(wp_wqesz, KM_SLEEP);
        wp->wp_memsz = EIB_WQES_PER_POOL * bufsz;
        wp->wp_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(wp->wp_memsz,
            KM_SLEEP);
        wp->wp_ss = ss;
        wp->wp_type = wp_type;
        wp->wp_nfree_lwm = (wp_type == EIB_WP_TYPE_TX) ?
            EIB_NFREE_SWQES_LWM : EIB_NFREE_RWQES_LWM;

        /*
         * Initialize the lock and bitmaps: everything is available at first,
         * but note that if the number of blocks per pool is less than 64, we
         * need to initialize those extra bits as "unavailable" - these will
         * remain unavailable throughout.
         */
        mutex_init(&wp->wp_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&wp->wp_cv, NULL, CV_DEFAULT, NULL);

        wp->wp_nfree = EIB_WQES_PER_POOL;
        wp->wp_free_blks = (EIB_BLKS_PER_POOL >= 64) ?
            (~0) : (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1);
        for (i = 0; i < EIB_BLKS_PER_POOL; i++)
                wp->wp_free_wqes[i] = ~0;

        *wpp = wp;
}

/*ARGSUSED*/
static void
eib_rsrc_fini_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp)
{
        eib_wqe_pool_t *wp;

        ASSERT(wpp != NULL);

        wp = *wpp;
        ASSERT(*wpp != NULL);

        cv_destroy(&wp->wp_cv);
        mutex_destroy(&wp->wp_lock);

        kmem_free((void *)(uintptr_t)(wp->wp_vaddr), wp->wp_memsz);
        kmem_free(wp->wp_wqe, EIB_WQES_PER_POOL * sizeof (eib_wqe_t));
        kmem_free(wp, sizeof (eib_wqe_pool_t));

        *wpp = NULL;
}

/*ARGSUSED*/
static boolean_t
eib_rsrc_ok_to_free_pool(eib_t *ss, eib_wqe_pool_t *wp, boolean_t force)
{
        uint64_t free_blks;
        int i;

        /*
         * See if we can release all memory allocated for buffers, wqes and
         * the pool.  Note that in the case of data channel rx buffers, some
         * of the buffers may not be free if the nw layer is holding on to
         * them still.  If this is the case, we cannot free the wqe pool now
         * or a subsequent access by the nw layer to the buffers will cause
         * a panic.
         */
        ASSERT(wp != NULL);

        /*
         * If the force-free flag is set, we can always release the memory.
         * Note that this flag is unused currently, and should be removed.
         */
        if (force == B_TRUE)
                return (B_TRUE);

        mutex_enter(&wp->wp_lock);

        /*
         * If a whole block remains allocated, obviously we cannot free
         * the pool
         */
        free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) :
            (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1);
        if (wp->wp_free_blks != free_blks) {
                mutex_exit(&wp->wp_lock);
                return (B_FALSE);
        }

        /*
         * If even a single wqe within any one block remains in-use, we
         * cannot free the pool
         */
        for (i = 0; i < EIB_BLKS_PER_POOL; i++) {
                if (wp->wp_free_wqes[i] != (~0)) {
                        mutex_exit(&wp->wp_lock);
                        return (B_FALSE);
                }
        }

        mutex_exit(&wp->wp_lock);

        return (B_TRUE);
}

/*ARGSUSED*/
static int
eib_rsrc_grab_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes,
    uint_t n_req, uint_t *actual, int pri)
{
        uint_t n_allocd = 0;
        int blk;
        int ndx;
        int wqe_ndx;

        ASSERT(wp != NULL);
        ASSERT(wqes != NULL);

        mutex_enter(&wp->wp_lock);

        /*
         * If this is a low priority request, adjust the number requested
         * so we don't allocate beyond the low-water-mark
         */
        if (pri == EIB_WPRI_LO) {
                if (wp->wp_nfree <= wp->wp_nfree_lwm)
                        n_req = 0;
                else if ((wp->wp_nfree - n_req) < wp->wp_nfree_lwm)
                        n_req = wp->wp_nfree - wp->wp_nfree_lwm;
        }

        for (n_allocd = 0; n_allocd < n_req; n_allocd++) {
                /*
                 * If the entire pool is unavailable, quit
                 */
                if (wp->wp_free_blks == 0)
                        break;

                /*
                 * Find the first wqe that's available
                 */
                blk = EIB_FIND_LSB_SET(wp->wp_free_blks);
                ASSERT(blk != -1);
                ndx = EIB_FIND_LSB_SET(wp->wp_free_wqes[blk]);
                ASSERT(ndx != -1);

                /*
                 * Mark the wqe as allocated
                 */
                wp->wp_free_wqes[blk] &= (~((uint64_t)1 << ndx));

                /*
                 * If this was the last free wqe in this block, mark
                 * the block itself as unavailable
                 */
                if (wp->wp_free_wqes[blk] == 0)
                        wp->wp_free_blks &= (~((uint64_t)1 << blk));

                /*
                 * Return this wqe to the caller
                 */
                wqe_ndx = blk * EIB_WQES_PER_BLK + ndx;
                wqes[n_allocd] = &(wp->wp_wqe[wqe_ndx]);
        }
        wp->wp_nfree -= n_allocd;

        mutex_exit(&wp->wp_lock);

        if (n_allocd == 0)
                return (EIB_E_FAILURE);

        if (actual) {
                *actual = n_allocd;
        }

        return (EIB_E_SUCCESS);
}

/*ARGSUSED*/
static void
eib_rsrc_return_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes,
    uint_t n_wqes)
{
        eib_wqe_t *wqe;
        uint_t n_freed = 0;
        uint_t blk;
        uint_t ndx;

        ASSERT(wp != NULL);
        ASSERT(wqes != NULL);

        mutex_enter(&wp->wp_lock);
        for (n_freed = 0; n_freed < n_wqes; n_freed++) {
                wqe = wqes[n_freed];

                /*
                 * This wqe is being returned back to the pool, so clear
                 * any wqe flags and reset the buffer address and size in the
                 * single segment sgl back to what they were initially.
                 * Also erase any mblk pointer and callback function ptrs.
                 */
                wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)wqe->qe_cpbuf;
                wqe->qe_sgl.ds_len = wqe->qe_bufsz;
                wqe->qe_mp = NULL;
                wqe->qe_chan = NULL;
                wqe->qe_vnic_inst = -1;
                wqe->qe_info &= (~EIB_WQEFLGS_MASK);

                /*
                 * Mark the wqe free in its block
                 */
                blk = EIB_WQE_BLK(wqe->qe_info);
                ndx = EIB_WQE_NDX(wqe->qe_info);

                wp->wp_free_wqes[blk] |= ((uint64_t)1 << ndx);

                /*
                 * This block now has at least one wqe free, so mark
                 * the block itself as available and move on to the
                 * next wqe to free
                 */
                wp->wp_free_blks |= ((uint64_t)1 << blk);
        }

        wp->wp_nfree += n_freed;

        /*
         * If the number of available wqes in the pool has just crossed
         * the high-water-mark, wakeup anyone who may be sleeping on it.
         */
        if ((wp->wp_type == EIB_WP_TYPE_TX) &&
            ((wp->wp_nfree - n_freed) < EIB_NFREE_SWQES_HWM) &&
            (wp->wp_nfree >= EIB_NFREE_SWQES_HWM)) {
                cv_broadcast(&wp->wp_cv);
        }

        mutex_exit(&wp->wp_lock);
}

static void
eib_rb_rsrc_setup_txbufs(eib_t *ss, boolean_t force)
{
        eib_wqe_pool_t *wp = ss->ei_tx;
        eib_wqe_t *wqe;
        ibt_ud_dest_hdl_t dest;
        ibt_status_t ret;
        uint8_t *plhdr;
        int i;

        if (wp == NULL)
                return;

        /*
         * Check if it's ok to free the tx wqe pool (i.e. all buffers have
         * been reclaimed) and if so, stop the txwqe monitor thread (and wait
         * for it to die), release the UD destination handles, deregister
         * memory and fini the wqe pool.
         */
        if (eib_rsrc_ok_to_free_pool(ss, wp, force)) {
                eib_stop_monitor_tx_wqes(ss);

                for (i = 0; i < EIB_WQES_PER_POOL; i++) {
                        wqe = &wp->wp_wqe[i];
                        if ((plhdr = wqe->qe_payload_hdr) != NULL) {
                                kmem_free(plhdr, EIB_MAX_PAYLOAD_HDR_SZ);
                        }
                        if ((dest = wqe->qe_dest) != NULL) {
                                ret = ibt_free_ud_dest(dest);
                                if (ret != IBT_SUCCESS) {
                                        EIB_DPRINTF_WARN(ss->ei_instance,
                                            "eib_rb_rsrc_setup_txbufs: "
                                            "ibt_free_ud_dest() failed, "
                                            "ret=%d", ret);
                                }
                        }
                }
                if (wp->wp_mr) {
                        if ((ret = ibt_deregister_mr(ss->ei_hca_hdl,
                            wp->wp_mr)) != IBT_SUCCESS) {
                                EIB_DPRINTF_WARN(ss->ei_instance,
                                    "eib_rb_rsrc_setup_txbufs: "
                                    "ibt_deregister_mr() failed, ret=%d", ret);
                        }
                        wp->wp_mr = NULL;
                }
                eib_rsrc_fini_wqe_pool(ss, &ss->ei_tx);
        }
}

void
eib_rb_rsrc_setup_rxbufs(eib_t *ss, boolean_t force)
{
        eib_wqe_pool_t *rx = ss->ei_rx;
        ibt_status_t ret;

        if (rx == NULL)
                return;

        /*
         * Check if it's ok to free the rx wqe pool (i.e. all buffers have
         * been reclaimed) and if so, deregister memory and fini the wqe pool.
         */
        if (eib_rsrc_ok_to_free_pool(ss, rx, force)) {
                if (rx->wp_mr) {
                        if ((ret = ibt_deregister_mr(ss->ei_hca_hdl,
                            rx->wp_mr)) != IBT_SUCCESS) {
                                EIB_DPRINTF_WARN(ss->ei_instance,
                                    "eib_rb_rsrc_setup_rxbufs: "
                                    "ibt_deregister_mr() failed, ret=%d", ret);
                        }
                        rx->wp_mr = NULL;
                }

                eib_rsrc_fini_wqe_pool(ss, &ss->ei_rx);
        }
}

static void
eib_rb_rsrc_setup_lsobufs(eib_t *ss, boolean_t force)
{
        eib_lsobkt_t *bkt;
        ibt_status_t ret;

        /*
         * Remove the lso bucket from the state
         */
        if ((bkt = ss->ei_lso) == NULL)
                return;

        /*
         * Try to stop the lso bufs monitor thread.  If we fail, we simply
         * return.  We'll have another shot at it later from detach() with
         * the force flag set.
         */
        if (eib_stop_monitor_lso_bufs(ss, force) != EIB_E_SUCCESS)
                return;

        /*
         * Free the buflist
         */
        if (bkt->bk_bufl) {
                kmem_free(bkt->bk_bufl, bkt->bk_nelem * sizeof (eib_lsobuf_t));
                bkt->bk_bufl = NULL;
        }

        /*
         * Deregister LSO memory and free it
         */
        if (bkt->bk_mr_hdl) {
                if ((ret = ibt_deregister_mr(ss->ei_hca_hdl,
                    bkt->bk_mr_hdl)) != IBT_SUCCESS) {
                        EIB_DPRINTF_WARN(ss->ei_instance,
                            "eib_rb_rsrc_setup_lsobufs: "
                            "ibt_deregister_mr() failed, ret=%d", ret);
                }
                bkt->bk_mr_hdl = NULL;
        }

        if (bkt->bk_mem) {
                kmem_free(bkt->bk_mem, bkt->bk_nelem * EIB_LSO_BUFSZ);
                bkt->bk_mem = NULL;
        }

        /*
         * Destroy the mutex and condvar
         */
        cv_destroy(&bkt->bk_cv);
        mutex_destroy(&bkt->bk_lock);

        /*
         * Finally, free the lso bucket itself
         */
        kmem_free(bkt, sizeof (eib_lsobkt_t));
        ss->ei_lso = NULL;
}
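
/*
 * Usage sketch (illustrative only, kept inside a comment so it is not
 * compiled into the driver): how a tx-path caller might use the swqe
 * grab/return interfaces defined above.  The fill-and-post step, shown
 * here as a hypothetical eib_fill_and_post(), lives elsewhere in the
 * driver; only the grab/return and shortage-notification calls below
 * are defined in this file.
 *
 *	eib_wqe_t *swqes[8];
 *	uint_t n_got, i;
 *
 *	if (eib_rsrc_grab_swqes(ss, swqes, 8, &n_got,
 *	    EIB_WPRI_LO) != EIB_E_SUCCESS) {
 *		eib_rsrc_txwqes_needed(ss);	// note shortage, wake monitor
 *		return;
 *	}
 *
 *	for (i = 0; i < n_got; i++)
 *		eib_fill_and_post(ss, swqes[i]);	// hypothetical helper
 *
 *	// On send completion (or on a post failure), each swqe goes back
 *	// to its pool, optionally decrementing the channel's posted count:
 *	//
 *	//	eib_rsrc_return_swqe(ss, swqe, chan);
 */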