/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include "dapl.h"
#include "dapl_tavor_wr.h"
#include "dapl_hash.h"
#include "dapl_tavor_ibtf_impl.h"

static dapls_tavor_wrid_entry_t *dapli_tavor_wrid_find_match(
    dapls_tavor_workq_hdr_t *, tavor_hw_cqe_t *);
static dapls_tavor_wrid_list_hdr_t *dapli_tavor_wrid_get_list(uint32_t, int);
static void dapli_tavor_wrid_reaplist_add(ib_cq_handle_t,
    dapls_tavor_workq_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_wqhdr_find(ib_cq_handle_t,
    uint_t, uint_t);
static uint32_t dapli_tavor_wrid_get_wqeaddrsz(dapls_tavor_workq_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_list_reap(
    dapls_tavor_wrid_list_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_wqhdr_create(ib_cq_handle_t,
    uint_t, uint_t, uint_t);
static void dapli_tavor_wrid_wqhdr_add(dapls_tavor_workq_hdr_t *,
    dapls_tavor_wrid_list_hdr_t *);
static void dapli_tavor_wrid_wqhdr_remove(dapls_tavor_workq_hdr_t *,
    dapls_tavor_wrid_list_hdr_t *);
static void dapli_tavor_wrid_wqhdr_lock_both(ib_qp_handle_t);
static void dapli_tavor_wrid_wqhdr_unlock_both(ib_qp_handle_t);
static DAT_RETURN dapli_tavor_cq_wqhdr_add(ib_cq_handle_t,
    dapls_tavor_workq_hdr_t *);
static void dapli_tavor_cq_wqhdr_remove(ib_cq_handle_t,
    dapls_tavor_workq_hdr_t *);

/*
 * dapls_tavor_wrid_get_entry()
 */
uint64_t
dapls_tavor_wrid_get_entry(ib_cq_handle_t cq, tavor_hw_cqe_t *cqe,
    uint_t send_or_recv, uint_t error, dapls_tavor_wrid_entry_t *wre)
{
	dapls_tavor_workq_hdr_t *wq;
	dapls_tavor_wrid_entry_t *wre_tmp;
	uint64_t wrid;
	uint_t qpnum;

	/* Lock the list of work queues associated with this CQ */
	dapl_os_lock(&cq->cq_wrid_wqhdr_lock);

	/* Find the work queue for this QP number (send or receive side) */
	qpnum = TAVOR_CQE_QPNUM_GET(cqe);
	wq = dapli_tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
	dapl_os_assert(wq != NULL);

	/*
	 * Regardless of whether the completion is the result of a "success"
	 * or a "failure", we lock the list of "containers" and attempt to
	 * search for the first matching completion (i.e. the first WR
	 * with a matching WQE addr and size).  Once we find it, we pull out
	 * the "wrid" field and return it (see below).  Note: One possible
	 * future enhancement would be to enable this routine to skip over
	 * any "unsignaled" completions to go directly to the next "signaled"
	 * entry on success. XXX
	 */
	dapl_os_lock(&wq->wq_wrid_lock->wrl_lock);
	wre_tmp = dapli_tavor_wrid_find_match(wq, cqe);

	/*
	 * If this is a "successful" completion, then we assert that this
	 * completion must be a "signaled" completion.
	 */
	dapl_os_assert(error || (wre_tmp->wr_signaled_dbd &
	    TAVOR_WRID_ENTRY_SIGNALED));

	/*
	 * If the completion is a "failed" completion, then we save away the
	 * contents of the entry (into the "wre" field passed in) for use
	 * in later CQE processing.  Note: We use the
	 * dapli_tavor_wrid_get_wqeaddrsz() function to grab "wqeaddrsz" from
	 * the next entry in the container.
	 * This is required for error processing (where updating these fields
	 * properly is necessary to correct handling of the "error" CQE)
	 */
	if (error && (wre != NULL)) {
		*wre = *wre_tmp;
		wre->wr_wqeaddrsz = dapli_tavor_wrid_get_wqeaddrsz(wq);
	}

	/* Pull out the WRID and return it */
	wrid = wre_tmp->wr_wrid;

	dapl_os_unlock(&wq->wq_wrid_lock->wrl_lock);
	dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);

	return (wrid);
}
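
/*
 * Illustrative sketch (not part of the original source): a CQ polling
 * routine would typically decode the CQE and then call
 * dapls_tavor_wrid_get_entry() to recover the caller's WRID.  The
 * helpers "dapli_cqe_is_error()" and "dapli_cqe_sendrecv()" below are
 * hypothetical stand-ins for whatever CQE decoding the caller uses:
 *
 *	dapls_tavor_wrid_entry_t wre;
 *	uint64_t wrid;
 *	uint_t error;
 *
 *	error = dapli_cqe_is_error(cqe);
 *	wrid = dapls_tavor_wrid_get_entry(cq, cqe,
 *	    dapli_cqe_sendrecv(cqe), error, error ? &wre : NULL);
 */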
/*
 * dapli_tavor_wrid_find_match()
 */
static dapls_tavor_wrid_entry_t *
dapli_tavor_wrid_find_match(dapls_tavor_workq_hdr_t *wq, tavor_hw_cqe_t *cqe)
{
	dapls_tavor_wrid_entry_t *curr = NULL;
	dapls_tavor_wrid_list_hdr_t *container;
	uint32_t wqeaddr_size;
	uint32_t head, tail, size;
	int found = 0, last_container;

	/* dapl_os_assert(MUTEX_HELD(&wq->wq_wrid_lock)); */

	/* Pull the "wqeaddrsz" information from the CQE */
	wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cqe);

	/*
	 * Walk the "containers" list(s), find first WR with a matching WQE
	 * addr.  If the current "container" is not the last one on the list,
	 * i.e. not the current one to which we are posting new WRID entries,
	 * then we do not attempt to update the "q_head", "q_tail", and
	 * "q_full" indicators on the main work queue header.  We do, however,
	 * update the "head" and "full" indicators on the individual
	 * containers as we go.  This is imperative because we need to be
	 * able to determine when the current container has been emptied (so
	 * that we can move on to the next container).
	 */
	container = wq->wq_wrid_poll;
	while (container != NULL) {
		/* Is this the last/only "container" on the list */
		last_container = (container != wq->wq_wrid_post) ? 0 : 1;

		/*
		 * First check if we are on an SRQ.  If so, we grab the entry
		 * and break out.  Since SRQ wridlists are never added to the
		 * reaplist, they can only be the last container.
		 */
		if (container->wl_srq_en) {
			dapl_os_assert(last_container == 1);
			curr = dapli_tavor_wrid_find_match_srq(container, cqe);
			break;
		}

		/*
		 * Grab the current "head", "tail" and "size" fields before
		 * walking the list in the current container.  Note: the
		 * "size" field here must always be a power-of-2.  The "full"
		 * parameter is checked (and updated) here to distinguish the
		 * "queue full" condition from "queue empty".
		 */
		head = container->wl_head;
		tail = container->wl_tail;
		size = container->wl_size;
		while ((head != tail) || (container->wl_full)) {
			container->wl_full = 0;
			curr = &container->wl_wre[head];
			head = ((head + 1) & (size - 1));

			/*
			 * If the current entry's "wqeaddrsz" matches the one
			 * we're searching for, then this must correspond to
			 * the work request that caused the completion.  Set
			 * the "found" flag and bail out.
			 */
			if (curr->wr_wqeaddrsz == wqeaddr_size) {
				found = 1;
				break;
			}
		}

		/*
		 * If the current container is empty (having reached here the
		 * "head == tail" condition can only mean that the container
		 * is empty), then NULL out the "wrid_old_tail" field (see
		 * tavor_post_send() and tavor_post_recv() for more details)
		 * and (potentially) remove the current container from future
		 * searches.
		 */
		if (head == tail) {
			container->wl_wre_old_tail = NULL;

			/*
			 * If this wasn't the last "container" on the chain,
			 * i.e. the one to which new WRID entries will be
			 * added, then remove it from the list.
			 * Note: we don't "lose" the memory pointed to by
			 * this container because it should have already been
			 * put on the "reapable" list (from where it will
			 * later be pulled).
			 */
			if (!last_container) {
				wq->wq_wrid_poll = container->wl_next;
			}
		}

		/* Update the head index for the container */
		container->wl_head = head;

		/*
		 * If the entry was found in this container, then bail out of
		 * the loop.  Else reset the "curr" pointer and move on to
		 * the next container (if there is one).  Note: the only real
		 * reason for setting "curr = NULL" here is so that the
		 * ASSERT below can catch the case where no matching entry
		 * was found on any of the lists.
		 */
		if (found) {
			break;
		} else {
			curr = NULL;
			container = container->wl_next;
		}
	}

	/*
	 * Update the work queue header's "head" and "full" conditions to
	 * match the last entry on the container list.  (Note: Only if we're
	 * pulling entries from the last work queue portion of the list,
	 * i.e. not from the previous portions that may be on the "reapable"
	 * list.)
	 */
	if (last_container) {
		wq->wq_head = wq->wq_wrid_post->wl_head;
		wq->wq_full = wq->wq_wrid_post->wl_full;
	}

	/* Ensure that we've actually found what we were searching for */
	dapl_os_assert(curr != NULL);

	return (curr);
}
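
/*
 * A note on the index arithmetic used above (a sketch, relying only on
 * the stated requirement that "size" be a power of two): advancing an
 * index with a bitwise AND is equivalent to a modulo, so head and tail
 * wrap around the container without a divide:
 *
 *	head = (head + 1) & (size - 1);		same as (head + 1) % size
 *
 * The separate "full" flag is what distinguishes a completely full
 * container from an empty one, since "head == tail" is true in both
 * cases.
 */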
/*
 * dapli_tavor_wrid_find_match_srq()
 * Context: Can be called from interrupt or base context.
 */
dapls_tavor_wrid_entry_t *
dapli_tavor_wrid_find_match_srq(dapls_tavor_wrid_list_hdr_t *wl,
    tavor_hw_cqe_t *cqe)
{
	dapls_tavor_wrid_entry_t *wre;
	uint32_t wqe_index;
	uint32_t wqe_addr;
	uint32_t qsize_msk;
	uint32_t tail, next_tail;

	/* Grab the WQE addr out of the CQE */
	wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cqe) & 0xFFFFFFC0;

	/*
	 * Given the 'wqe_addr' just calculated and the srq buf address, we
	 * find the 'wqe_index'.  The 'wre' returned below contains the WRID
	 * that we are looking for.  This indexes into the wre_list for this
	 * specific WQE.
	 */
	wqe_index = TAVOR_SRQ_WQ_INDEX(wl->wl_srq_desc_addr, wqe_addr,
	    wl->wl_srq_wqesz);

	/* ASSERT on impossible wqe_index values */
	dapl_os_assert(wqe_index < wl->wl_size);

	/* Put this WQE back on the free list */
	qsize_msk = wl->wl_size - 1;

	tail = wl->wl_freel_tail;
	next_tail = (tail + 1) & qsize_msk;
	wl->wl_freel_entries++;

	dapl_os_assert(wl->wl_freel_entries <= wl->wl_size);

	/* Get the descriptor (IO Address) of the WQE to be built */
	wl->wl_free_list[tail] = wqe_addr;
	wl->wl_freel_tail = next_tail;

	/* Using the index, return the Work Request ID Entry (wre) */
	wre = &wl->wl_wre[wqe_index];

	return (wre);
}

/*
 * dapls_tavor_wrid_cq_reap()
 */
void
dapls_tavor_wrid_cq_reap(ib_cq_handle_t cq)
{
	dapls_tavor_workq_hdr_t *consume_wqhdr;
	dapls_tavor_wrid_list_hdr_t *container, *to_free;

	/* dapl_os_assert(MUTEX_HELD(&cq->cq_lock)); */

	/* Lock the list of work queues associated with this CQ */
	dapl_os_lock(&cq->cq_wrid_wqhdr_lock);

	/* Walk the "reapable" list and free up containers */
	container = cq->cq_wrid_reap_head;
	while (container != NULL) {
		to_free = container;
		container = container->wl_reap_next;

		/*
		 * If reaping the WRID list containers pulls the last
		 * container from the given work queue header, then we free
		 * the work queue header as well.
		 */
		consume_wqhdr = dapli_tavor_wrid_list_reap(to_free);
		if (consume_wqhdr != NULL) {
			dapli_tavor_cq_wqhdr_remove(cq, consume_wqhdr);
		}
	}

	/* Once finished reaping, we reset the CQ's reap list */
	cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;

	dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
}
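
/*
 * Sketch of the address-to-index conversion that TAVOR_SRQ_WQ_INDEX()
 * is assumed to perform in dapli_tavor_wrid_find_match_srq() above:
 * SRQ WQEs of size "wl_srq_wqesz" are laid out contiguously starting
 * at "wl_srq_desc_addr" (see dapls_tavor_srq_wrid_init() below), so
 *
 *	wqe_index = (wqe_addr - wl_srq_desc_addr) / wl_srq_wqesz;
 */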
/*
 * dapls_tavor_wrid_cq_force_reap()
 */
void
dapls_tavor_wrid_cq_force_reap(ib_cq_handle_t cq)
{
	DAPL_HASH_DATA curr;
	DAT_RETURN retval;
	dapls_tavor_workq_hdr_t *to_free_wqhdr;
	dapls_tavor_wrid_list_hdr_t *container, *to_free;

	/* dapl_os_assert(MUTEX_HELD(&cq->cq_lock)); */

	/*
	 * The first step is to walk the "reapable" list and free up those
	 * containers.  This is necessary because the containers on the
	 * reapable list are not otherwise connected to the work queue
	 * headers anymore.
	 */
	dapls_tavor_wrid_cq_reap(cq);

	/* Now lock the list of work queues associated with this CQ */
	dapl_os_lock(&cq->cq_wrid_wqhdr_lock);

	/*
	 * Walk the list of work queue headers and free up all the WRID list
	 * containers chained to them.  Note: We don't need to grab the locks
	 * for each of the individual WRID lists here because the only way
	 * things can be added or removed from the lists at this point would
	 * be through the posting of a work request to a QP.  But if we've
	 * come this far, then we can be assured that there are no longer any
	 * QPs associated with the CQ that we are trying to free.
	 */
	retval = dapls_hash_iterate(cq->cq_wrid_wqhdr_list,
	    DAPL_HASH_ITERATE_INIT, &curr);
	dapl_os_assert(retval == DAT_SUCCESS);

	while (curr != NULL) {
		to_free_wqhdr = (dapls_tavor_workq_hdr_t *)curr;
		container = ((dapls_tavor_workq_hdr_t *)curr)->wq_wrid_poll;
		retval = dapls_hash_iterate(cq->cq_wrid_wqhdr_list,
		    DAPL_HASH_ITERATE_NEXT, &curr);
		dapl_os_assert(retval == DAT_SUCCESS);
		while (container != NULL) {
			to_free = container;
			container = container->wl_next;
			/*
			 * If reaping the WRID list containers pulls the last
			 * container from the given work queue header, then
			 * we free the work queue header as well.  Note: we
			 * ignore the return value because we know that the
			 * work queue header should always be freed once the
			 * list of containers has come to an end.
			 */
			(void) dapli_tavor_wrid_list_reap(to_free);
			if (container == NULL) {
				dapli_tavor_cq_wqhdr_remove(cq,
				    to_free_wqhdr);
			}
		}
	}

	dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
}
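
/*
 * The hash iteration pattern used by dapls_tavor_wrid_cq_force_reap()
 * above, as a minimal standalone sketch ("visit()" is a hypothetical
 * callback): the INIT call primes "curr" with the first entry and each
 * NEXT call advances it until it comes back NULL.
 *
 *	(void) dapls_hash_iterate(table, DAPL_HASH_ITERATE_INIT, &curr);
 *	while (curr != NULL) {
 *		visit(curr);
 *		(void) dapls_hash_iterate(table,
 *		    DAPL_HASH_ITERATE_NEXT, &curr);
 *	}
 *
 * Note that force_reap fetches the NEXT entry before it frees the
 * current one, so the iterator never references freed memory.
 */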
/*
 * dapli_tavor_wrid_get_list()
 */
static dapls_tavor_wrid_list_hdr_t *
dapli_tavor_wrid_get_list(uint32_t qsize, int wrid_for_srq)
{
	dapls_tavor_wrid_list_hdr_t *wridlist;
	dapls_tavor_wrid_entry_t *wl_wre;
	uint32_t *wl_freel;
	uint32_t size;
	uint32_t wl_wre_size;
	uint32_t wl_freel_size;

	wridlist = NULL;
	wl_wre = NULL;
	wl_freel = NULL;
	size = wl_wre_size = wl_freel_size = 0;
	/*
	 * The WRID list "container" consists of the
	 * dapls_tavor_wrid_list_hdr_t, which holds the pointers necessary
	 * for maintaining the "reapable" list, chaining together multiple
	 * "containers" old and new, and tracking the head, tail, size, etc.
	 * for each container.  The "container" also holds all the
	 * dapls_tavor_wrid_entry_t's, one for each entry on the
	 * corresponding work queue.
	 */

	/*
	 * For wridlists associated with SRQs the wrid lock needs to be
	 * allocated and initialized here.
	 */
	size = sizeof (dapls_tavor_wrid_list_hdr_t);
	if (wrid_for_srq) {
		size = size + sizeof (dapls_tavor_wrid_lock_t);
	}
	wridlist = dapl_os_alloc(size);
	if (wridlist == NULL) {
		goto bail;
	}
	if (wrid_for_srq) {
		wridlist->wl_lock = (dapls_tavor_wrid_lock_t *)(
		    (uintptr_t)wridlist +
		    sizeof (dapls_tavor_wrid_list_hdr_t));
		dapl_os_lock_init(&wridlist->wl_lock->wrl_lock);
		wridlist->wl_lock->wrl_on_srq = wrid_for_srq;
	} else {
		wridlist->wl_lock = NULL;
	}
	wl_wre_size = qsize * sizeof (dapls_tavor_wrid_entry_t);
	wl_wre = dapl_os_alloc(wl_wre_size);
	if (wl_wre == NULL) {
		goto bail;
	}
	if (wrid_for_srq) {
		/* memory for the SRQ free list */
		wl_freel_size = qsize * sizeof (uint32_t);
		wl_freel = dapl_os_alloc(wl_freel_size);
		if (wl_freel == NULL) {
			goto bail;
		}
	}

	/* Complete the "container" initialization */
	wridlist->wl_size = qsize;
	wridlist->wl_full = 0;
	wridlist->wl_head = 0;
	wridlist->wl_tail = 0;
	wridlist->wl_wre = wl_wre;
	wridlist->wl_wre_old_tail = NULL;
	wridlist->wl_reap_next = NULL;
	wridlist->wl_next = NULL;
	wridlist->wl_prev = NULL;
	if (wrid_for_srq) {
		wridlist->wl_srq_en = 1;
		wridlist->wl_free_list = (uint32_t *)wl_freel;
		wridlist->wl_freel_head = 0;
		wridlist->wl_freel_tail = 0;
		wridlist->wl_freel_entries = qsize;
	} else {
		wridlist->wl_srq_en = 0;
		wridlist->wl_free_list = NULL;
		wridlist->wl_freel_head = 0;
		wridlist->wl_freel_tail = 0;
		wridlist->wl_freel_entries = 0;
		wridlist->wl_srq_wqesz = 0;
		wridlist->wl_srq_desc_addr = 0;
	}
	return (wridlist);
bail:
	if (wridlist) {
		if (wrid_for_srq) {
			dapl_os_lock_destroy(&wridlist->wl_lock->wrl_lock);
		}
		dapl_os_free(wridlist, size);
	}
	if (wl_wre) {
		dapl_os_free(wl_wre, wl_wre_size);
	}
	if (wl_freel) {
		dapl_os_free(wl_freel, wl_freel_size);
	}
	return (NULL);
}

/*
 * dapli_tavor_wrid_reaplist_add()
 */
static void
dapli_tavor_wrid_reaplist_add(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wq)
{
	/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

	dapl_os_lock(&wq->wq_wrid_lock->wrl_lock);

	/*
	 * Add the "post" container (the last one on the current chain) to
	 * the CQ's "reapable" list
	 */
	if ((cq->cq_wrid_reap_head == NULL) &&
	    (cq->cq_wrid_reap_tail == NULL)) {
		cq->cq_wrid_reap_head = wq->wq_wrid_post;
		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
	} else {
		cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
	}

	dapl_os_unlock(&wq->wq_wrid_lock->wrl_lock);
}

/*
 * dapli_tavor_wrid_wqhdr_find()
 */
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_wqhdr_find(ib_cq_handle_t cq, uint_t qpn, uint_t send_or_recv)
{
	DAPL_HASH_DATA curr;
	DAPL_HASH_KEY key;
	DAT_RETURN status;

	/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

	/*
	 * Search the CQ's work queue header hash list for a send or recv
	 * queue with the same QP number.
	 */
	key = (DAPL_HASH_KEY)(((uint64_t)send_or_recv << 32) |
	    (uint32_t)qpn);

	status = dapls_hash_search(cq->cq_wrid_wqhdr_list, key, &curr);
	if (status == DAT_SUCCESS) {
		return ((dapls_tavor_workq_hdr_t *)curr);
	} else {
		return (NULL);
	}
}
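
/*
 * The work queue header hash key layout used above (and again in
 * dapli_tavor_cq_wqhdr_add()/_remove() below): the send/recv type
 * occupies the upper 32 bits and the QP number the lower 32 bits, so
 * a QP's send and receive sides hash to distinct entries on the same
 * CQ:
 *
 *	63             32 31              0
 *	+---------------+-----------------+
 *	|  send_or_recv |       qpn       |
 *	+---------------+-----------------+
 *
 *	key = ((uint64_t)send_or_recv << 32) | (uint32_t)qpn;
 */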
/*
 * dapli_tavor_wrid_get_wqeaddrsz()
 */
static uint32_t
dapli_tavor_wrid_get_wqeaddrsz(dapls_tavor_workq_hdr_t *wq)
{
	dapls_tavor_wrid_entry_t *wre;
	uint32_t wqeaddrsz;
	uint32_t head;

	/*
	 * If the container is empty, then there is no next entry.  So just
	 * return zero.  Note: the "head == tail" condition here can only
	 * mean that the container is empty because we have previously
	 * pulled something from the container.
	 *
	 * If the container is not empty, then find the next entry and
	 * return the contents of its "wqeaddrsz" field.
	 */
	if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
		wqeaddrsz = 0;
	} else {
		/*
		 * We don't need to calculate the "next" head pointer here
		 * because "head" should already point to the next entry on
		 * the list (since we just pulled something off - in
		 * dapli_tavor_wrid_find_match() - and moved the head index
		 * forward.)
		 */
		head = wq->wq_wrid_poll->wl_head;
		wre = &wq->wq_wrid_poll->wl_wre[head];
		wqeaddrsz = wre->wr_wqeaddrsz;
	}
	return (wqeaddrsz);
}

/*
 * dapli_tavor_wrid_list_reap()
 * Note: The "wqhdr_list_lock" must be held.
 */
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_list_reap(dapls_tavor_wrid_list_hdr_t *wridlist)
{
	dapls_tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
	dapls_tavor_wrid_list_hdr_t *prev, *next;

	/* Get the back pointer to the work queue header (see below) */
	wqhdr = wridlist->wl_wqhdr;

	dapl_os_lock(&wqhdr->wq_wrid_lock->wrl_lock);

	/* Unlink the WRID list "container" from the work queue list */
	prev = wridlist->wl_prev;
	next = wridlist->wl_next;
	if (prev != NULL) {
		prev->wl_next = next;
	}
	if (next != NULL) {
		next->wl_prev = prev;
	}

	/*
	 * If the back pointer to the work queue header shows that it
	 * was pointing to the entry we are about to remove, then the work
	 * queue header is reapable as well.
	 */
	if ((wqhdr->wq_wrid_poll == wridlist) &&
	    (wqhdr->wq_wrid_post == wridlist)) {
		consume_wqhdr = wqhdr;
	}

	/* Be sure to update the "poll" and "post" container pointers */
	if (wqhdr->wq_wrid_poll == wridlist) {
		wqhdr->wq_wrid_poll = next;
	}
	if (wqhdr->wq_wrid_post == wridlist) {
		wqhdr->wq_wrid_post = NULL;
	}

	/*
	 * Calculate the size and free the container.  For SRQs the wridlist
	 * is freed when the SRQ gets freed.
	 */
	if (!wridlist->wl_srq_en) {
		if (wridlist->wl_wre) {
			dapl_os_free(wridlist->wl_wre, wridlist->wl_size *
			    sizeof (dapls_tavor_wrid_entry_t));
		}
		dapl_os_assert(wridlist->wl_free_list == NULL);
		dapl_os_free(wridlist, sizeof (dapls_tavor_wrid_list_hdr_t));
	}

	dapl_os_unlock(&wqhdr->wq_wrid_lock->wrl_lock);

	return (consume_wqhdr);
}

/*
 * dapls_tavor_srq_wrid_init()
 */
DAT_RETURN
dapls_tavor_srq_wrid_init(ib_srq_handle_t srq)
{
	dapls_tavor_wrid_list_hdr_t *wridlist;
	int i;

	wridlist = dapli_tavor_wrid_get_list(srq->srq_wq_numwqe, 1);
	if (wridlist == NULL) {
		srq->srq_wridlist = NULL;
		return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
	}

	/* initialize the free list with the descriptor addresses */
	wridlist->wl_free_list[0] = srq->srq_wq_desc_addr;
	for (i = 1; i < srq->srq_wq_numwqe; i++) {
		wridlist->wl_free_list[i] = wridlist->wl_free_list[i-1] +
		    srq->srq_wq_wqesz;
	}
	wridlist->wl_srq_wqesz = srq->srq_wq_wqesz;
	wridlist->wl_srq_desc_addr = srq->srq_wq_desc_addr;

	srq->srq_wridlist = wridlist;
	return (DAT_SUCCESS);
}

/*
 * dapls_tavor_srq_wrid_free()
 */
void
dapls_tavor_srq_wrid_free(ib_srq_handle_t srq)
{
	dapls_tavor_wrid_list_hdr_t *wridlist;
	size_t size = 0;

	wridlist = srq->srq_wridlist;
	if (wridlist) {
		dapl_os_assert(wridlist->wl_srq_en == 1);
		if (wridlist->wl_wre) {
			dapl_os_free(wridlist->wl_wre, wridlist->wl_size *
			    sizeof (dapls_tavor_wrid_entry_t));
		}
		if (wridlist->wl_free_list) {
			dapl_os_free(wridlist->wl_free_list,
			    wridlist->wl_size * sizeof (uint32_t));
		}
		if (wridlist->wl_lock) {
			dapl_os_assert(wridlist->wl_lock->wrl_on_srq == 1);
			dapl_os_lock_destroy(&wridlist->wl_lock->wrl_lock);
			size = sizeof (dapls_tavor_wrid_lock_t);
		}
		size = size;	/* pacify lint */
		dapl_os_free(wridlist,
		    size + sizeof (dapls_tavor_wrid_list_hdr_t));
		srq->srq_wridlist = NULL;
	}
}
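
/*
 * For illustration, the SRQ free list built by
 * dapls_tavor_srq_wrid_init() above is simply the IO address of each
 * WQE, i.e. entry i is
 *
 *	wl_free_list[i] = srq_wq_desc_addr + i * srq_wq_wqesz;
 *
 * which is the inverse of the address-to-index conversion used in
 * dapli_tavor_wrid_find_match_srq().
 */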
/*
 * dapls_tavor_wrid_init()
 */
DAT_RETURN
dapls_tavor_wrid_init(ib_qp_handle_t qp)
{
	dapls_tavor_workq_hdr_t *swq;
	dapls_tavor_workq_hdr_t *rwq;
	dapls_tavor_wrid_list_hdr_t *s_wridlist;
	dapls_tavor_wrid_list_hdr_t *r_wridlist;
	uint_t create_new_swq = 0;
	uint_t create_new_rwq = 0;

	/*
	 * For each of this QP's Work Queues, make sure we have a (properly
	 * initialized) Work Request ID list attached to the relevant
	 * completion queue.  Grab the CQ lock(s) before manipulating the
	 * lists.
	 */
	dapli_tavor_wrid_wqhdr_lock_both(qp);
	swq = dapli_tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_num,
	    TAVOR_WR_SEND);
	if (swq == NULL) {
		/* Couldn't find matching work queue header, create it */
		create_new_swq = 1;
		swq = dapli_tavor_wrid_wqhdr_create(qp->qp_sq_cqhdl,
		    qp->qp_num, TAVOR_WR_SEND, 1);
		if (swq == NULL) {
			/*
			 * If we couldn't find/allocate space for the workq
			 * header, then drop the lock(s) and return failure.
			 */
			dapli_tavor_wrid_wqhdr_unlock_both(qp);
			return (DAT_INSUFFICIENT_RESOURCES);
		}
	}
	qp->qp_sq_wqhdr = swq;
	swq->wq_size = qp->qp_sq_numwqe;
	swq->wq_head = 0;
	swq->wq_tail = 0;
	swq->wq_full = 0;

	/*
	 * Allocate space for the dapls_tavor_wrid_entry_t container
	 */
	s_wridlist = dapli_tavor_wrid_get_list(swq->wq_size, 0);
	if (s_wridlist == NULL) {
		/*
		 * If we couldn't allocate space for tracking the WRID
		 * entries, then cleanup the workq header from above (if
		 * necessary, i.e. if we created the workq header).  Then
		 * drop the lock(s) and return failure.
		 */
		if (create_new_swq) {
			dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
		}
		dapli_tavor_wrid_wqhdr_unlock_both(qp);
		return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
	}
	s_wridlist->wl_wqhdr = swq;

	/* Chain the new WRID list container to the workq hdr list */
	dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
	dapli_tavor_wrid_wqhdr_add(swq, s_wridlist);
	dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);

	/*
	 * Now we repeat all the above operations for the receive work queue
	 */
	rwq = dapli_tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_num,
	    TAVOR_WR_RECV);
	if (rwq == NULL) {
		create_new_rwq = 1;
		/* if qp is attached to an SRQ don't need to alloc wrid_lock */
		rwq = dapli_tavor_wrid_wqhdr_create(qp->qp_rq_cqhdl,
		    qp->qp_num, TAVOR_WR_RECV, qp->qp_srq_enabled ? 0 : 1);
		if (rwq == NULL) {
			/*
			 * If we couldn't find/allocate space for the workq
			 * header, then free all the send queue resources we
			 * just allocated and setup (above), drop the lock(s)
			 * and return failure.
			 */
			dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
			dapli_tavor_wrid_wqhdr_remove(swq, s_wridlist);
			dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
			if (create_new_swq) {
				dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
				    swq);
			}
			dapli_tavor_wrid_wqhdr_unlock_both(qp);
			return (DAT_INSUFFICIENT_RESOURCES |
			    DAT_RESOURCE_MEMORY);
		}
	}
	qp->qp_rq_wqhdr = rwq;
	rwq->wq_size = qp->qp_rq_numwqe;
	rwq->wq_head = 0;
	rwq->wq_tail = 0;
	rwq->wq_full = 0;

	/*
	 * Allocate space for the dapls_tavor_wrid_entry_t container.
	 * For QPs associated with an SRQ, the SRQ's wridlist is used
	 * instead.
	 */
	if (qp->qp_srq_enabled) {
		/* Use existing srq_wridlist pointer */
		r_wridlist = qp->qp_srq->srq_wridlist;
		dapl_os_assert(r_wridlist != NULL);
		/* store the wl_lock in the wqhdr */
		rwq->wq_wrid_lock = r_wridlist->wl_lock;
		dapl_os_assert(rwq->wq_wrid_lock != NULL);
	} else {
		/* Allocate memory for the r_wridlist */
		r_wridlist = dapli_tavor_wrid_get_list(rwq->wq_size, 0);
	}
	if (r_wridlist == NULL) {
		/*
		 * If we couldn't allocate space for tracking the WRID
		 * entries, then cleanup all the stuff from above.  Then
		 * drop the lock(s) and return failure.
		 */
		dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
		dapli_tavor_wrid_wqhdr_remove(swq, s_wridlist);
		dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
		if (create_new_swq) {
			dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
		}
		if (create_new_rwq) {
			dapli_tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
		}
		dapli_tavor_wrid_wqhdr_unlock_both(qp);
		return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
	}

	/* For SRQ based QPs r_wridlist does not point to the recv wqhdr */
	if (!qp->qp_srq_enabled) {
		r_wridlist->wl_wqhdr = rwq;
	}

	/* Chain the new WRID list "container" to the workq hdr list */
	dapl_os_lock(&rwq->wq_wrid_lock->wrl_lock);
	dapli_tavor_wrid_wqhdr_add(rwq, r_wridlist);
	dapl_os_unlock(&rwq->wq_wrid_lock->wrl_lock);

	dapli_tavor_wrid_wqhdr_unlock_both(qp);

	return (DAT_SUCCESS);
}
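
/*
 * Sketch (not from the original source) of the bookkeeping that
 * dapls_tavor_wrid_init() leaves behind for one QP; each work queue
 * header hangs off its CQ's hash list and anchors a chain of WRID
 * list containers:
 *
 *	CQ hash list --> workq hdr
 *	                   wq_wrid_poll --> wridlist <-- wq_wrid_post
 *	                                    wl_wqhdr --> (back to hdr)
 *
 * Right after init, "poll" and "post" reference the same (single)
 * container; later containers are chained through wl_next/wl_prev.
 */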
/*
 * dapls_tavor_wrid_cleanup()
 */
void
dapls_tavor_wrid_cleanup(DAPL_EP *ep, ib_qp_handle_t qp)
{
	/*
	 * For each of this QP's Work Queues, move the WRID "container" to
	 * the "reapable" list.  Although there may still be unpolled
	 * entries in these containers, it is not a big deal.  We will not
	 * reap the list until either the Poll CQ command detects an empty
	 * condition or the CQ itself is freed.  Grab the CQ lock(s) before
	 * manipulating the lists.
	 */
	dapli_tavor_wrid_wqhdr_lock_both(qp);
	dapli_tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);

	/*
	 * Repeat the above operation for the Recv work queue "container".
	 * However, for QPs with an SRQ we flush the CQ entries and remove
	 * the wridlist and wqhdr.  Then drop the CQ lock(s) and return.
	 */
	if (qp->qp_srq_enabled) {
		/*
		 * Pull off all (if any) entries for this QP from the CQ.
		 * This only includes entries that have not yet been polled.
		 */
		dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		DAPL_FLUSH(ep)(qp);

		/* Remove the wridlist from the WQHDR */
		dapli_tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
		    qp->qp_rq_wqhdr->wq_wrid_post);
		dapl_os_assert(qp->qp_rq_wqhdr->wq_wrid_post == NULL);
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

		/* Free the WQHDR */
		dapli_tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
	} else {
		dapli_tavor_wrid_reaplist_add(qp->qp_rq_cqhdl,
		    qp->qp_rq_wqhdr);
	}
	dapli_tavor_wrid_wqhdr_unlock_both(qp);
}
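
/*
 * A general note on the size rounding used by
 * dapli_tavor_wrid_wqhdr_create() below: (sz + 0x7) & ~0x7 rounds sz
 * up to the next multiple of 8 (e.g. 33 -> 40, 40 -> 40), which is
 * what guarantees that a lock placed immediately after the header is
 * 8-byte aligned.
 */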
/*
 * dapli_tavor_wrid_wqhdr_create()
 */
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_wqhdr_create(ib_cq_handle_t cq, uint_t qpn,
    uint_t send_or_recv, uint_t alloc_wrl)
{
	dapls_tavor_workq_hdr_t *wqhdr_tmp;
	size_t size, aligned_size;

	/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

	/*
	 * Allocate space for a work queue header structure and initialize
	 * it.  Each work queue header structure includes a "wq_wrid_lock"
	 * which needs to be initialized.
	 *
	 * Note: the address smashing is needed to ensure wq_wrid_lock is
	 * 8-byte aligned, which is not always the case on 32-bit sparc.
	 */
	size = (sizeof (dapls_tavor_workq_hdr_t) + 0x7) & ~0x7;
	aligned_size = size;
	if (alloc_wrl) {
		/* for non-SRQ wqhdrs the lock is allocated with the wqhdr */
		size = size + sizeof (dapls_tavor_wrid_lock_t);
	}
	wqhdr_tmp = dapl_os_alloc(size);
	if (wqhdr_tmp == NULL) {
		return (NULL);
	}
	if (alloc_wrl) {
		wqhdr_tmp->wq_wrid_lock = (dapls_tavor_wrid_lock_t *)
		    (((uintptr_t)wqhdr_tmp + aligned_size) & ~0x7);
		dapl_os_lock_init(&wqhdr_tmp->wq_wrid_lock->wrl_lock);
		/* locks allocated with the wqhdr don't have srq enabled */
		wqhdr_tmp->wq_wrid_lock->wrl_on_srq = 0;
	}

	wqhdr_tmp->wq_qpn = qpn;
	wqhdr_tmp->wq_send_or_recv = send_or_recv;

	wqhdr_tmp->wq_wrid_poll = NULL;
	wqhdr_tmp->wq_wrid_post = NULL;

	/* Chain the newly allocated work queue header to the CQ's list */
	if (dapli_tavor_cq_wqhdr_add(cq, wqhdr_tmp) != DAT_SUCCESS) {
		if (alloc_wrl) {
			dapl_os_lock_destroy(&wqhdr_tmp->wq_wrid_lock->
			    wrl_lock);
		}
		dapl_os_free(wqhdr_tmp, size);
		wqhdr_tmp = NULL;
	}

	return (wqhdr_tmp);
}

/*
 * dapli_tavor_wrid_wqhdr_add()
 */
static void
dapli_tavor_wrid_wqhdr_add(dapls_tavor_workq_hdr_t *wqhdr,
    dapls_tavor_wrid_list_hdr_t *wridlist)
{
	/* dapl_os_assert(MUTEX_HELD(&wqhdr->wq_wrid_lock)); */

	/* Chain the new WRID list "container" to the work queue list */
	if ((wqhdr->wq_wrid_post == NULL) &&
	    (wqhdr->wq_wrid_poll == NULL)) {
		wqhdr->wq_wrid_poll = wridlist;
		wqhdr->wq_wrid_post = wridlist;
	} else {
		wqhdr->wq_wrid_post->wl_next = wridlist;
		wridlist->wl_prev = wqhdr->wq_wrid_post;
		wqhdr->wq_wrid_post = wridlist;
	}
}

/*
 * dapli_tavor_wrid_wqhdr_remove()
 * Note: this is only called to remove the most recently added WRID list
 * container.
 */
static void
dapli_tavor_wrid_wqhdr_remove(dapls_tavor_workq_hdr_t *wqhdr,
    dapls_tavor_wrid_list_hdr_t *wridlist)
{
	dapls_tavor_wrid_list_hdr_t *prev, *next;

	/* dapl_os_assert(MUTEX_HELD(&wqhdr->wq_wrid_lock)); */

	/* Unlink the WRID list "container" from the work queue list */
	prev = wridlist->wl_prev;
	next = wridlist->wl_next;
	if (prev != NULL) {
		prev->wl_next = next;
	}
	if (next != NULL) {
		next->wl_prev = prev;
	}

	/*
	 * Update any pointers in the work queue hdr that may point to this
	 * WRID list container
	 */
	if (wqhdr->wq_wrid_post == wridlist) {
		wqhdr->wq_wrid_post = prev;
	}
	if (wqhdr->wq_wrid_poll == wridlist) {
		wqhdr->wq_wrid_poll = NULL;
	}
}
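
/*
 * Lock-ordering sketch for the two routines below: when the send and
 * receive sides use different CQs, the pair of locks is always
 * acquired send-side first and released in the opposite order.  Two
 * threads taking the same pair of locks in this fixed order can never
 * each hold one lock while waiting for the other, which is the usual
 * deadlock-avoidance argument:
 *
 *	thread A: lock(sq_cq); lock(rq_cq); ... unlock(rq_cq); unlock(sq_cq);
 *	thread B: lock(sq_cq); ... (blocks on sq_cq while holding nothing)
 */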
/*
 * dapli_tavor_wrid_wqhdr_lock_both()
 */
static void
dapli_tavor_wrid_wqhdr_lock_both(ib_qp_handle_t qp)
{
	ib_cq_handle_t sq_cq, rq_cq;

	sq_cq = qp->qp_sq_cqhdl;
	rq_cq = qp->qp_rq_cqhdl;

	/*
	 * If both work queues (send and recv) share a completion queue, then
	 * grab the common lock.  If they use different CQs (hence different
	 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then
	 * the receive.  We do this consistently and correctly in
	 * dapli_tavor_wrid_wqhdr_unlock_both() below to avoid introducing
	 * any kind of deadlock condition.
	 */
	if (sq_cq == rq_cq) {
		dapl_os_lock(&sq_cq->cq_wrid_wqhdr_lock);
	} else {
		dapl_os_lock(&sq_cq->cq_wrid_wqhdr_lock);
		dapl_os_lock(&rq_cq->cq_wrid_wqhdr_lock);
	}
}

/*
 * dapli_tavor_wrid_wqhdr_unlock_both()
 */
static void
dapli_tavor_wrid_wqhdr_unlock_both(ib_qp_handle_t qp)
{
	ib_cq_handle_t sq_cq, rq_cq;

	sq_cq = qp->qp_sq_cqhdl;
	rq_cq = qp->qp_rq_cqhdl;

	/*
	 * See dapli_tavor_wrid_wqhdr_lock_both() above for more detail
	 */
	if (sq_cq == rq_cq) {
		dapl_os_unlock(&sq_cq->cq_wrid_wqhdr_lock);
	} else {
		dapl_os_unlock(&rq_cq->cq_wrid_wqhdr_lock);
		dapl_os_unlock(&sq_cq->cq_wrid_wqhdr_lock);
	}
}

/*
 * dapli_tavor_cq_wqhdr_add()
 */
static DAT_RETURN
dapli_tavor_cq_wqhdr_add(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wqhdr)
{
	DAPL_HASH_KEY key;

	/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

	/*
	 * Insert the work queue header into the CQ's hash list, keyed by
	 * QP number and send/recv type.
	 */
	key = (DAPL_HASH_KEY)(((uint64_t)wqhdr->wq_send_or_recv << 32) |
	    wqhdr->wq_qpn);

	return (dapls_hash_insert(cq->cq_wrid_wqhdr_list, key, wqhdr));
}

/*
 * dapli_tavor_cq_wqhdr_remove()
 */
static void
dapli_tavor_cq_wqhdr_remove(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wqhdr)
{
	DAPL_HASH_DATA curr;
	DAPL_HASH_KEY key;
	size_t size = 0;

	/* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

	/* Remove "wqhdr" from the work queue header list on "cq" */
	key = (DAPL_HASH_KEY)(((uint64_t)wqhdr->wq_send_or_recv << 32) |
	    wqhdr->wq_qpn);

	(void) dapls_hash_remove(cq->cq_wrid_wqhdr_list, key, &curr);

	size = (sizeof (dapls_tavor_workq_hdr_t) + 0x7) & ~0x7;
	if (wqhdr->wq_wrid_lock && (!wqhdr->wq_wrid_lock->wrl_on_srq)) {
		dapl_os_lock_destroy(&wqhdr->wq_wrid_lock->wrl_lock);
		size += sizeof (dapls_tavor_wrid_lock_t);
	}

	/* Free the memory associated with "wqhdr" */
	dapl_os_free(wqhdr, size);
}
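
/*
 * Sketch of the free-list compaction performed by
 * dapls_tavor_srq_wrid_resize() below: the old circular free list may
 * wrap around, so its live entries are copied out starting from
 * wl_freel_head, leaving the new list linear with its head at index 0:
 *
 *	idx = old_head;
 *	for (i = 0; i < nentries; i++) {
 *		new_freel[i] = old_freel[idx];
 *		idx = (idx + 1) % old_size;
 *	}
 */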
/*
 * dapls_tavor_srq_wrid_resize() is called to resize the wridlist
 * associated with SRQs as a result of dat_srq_resize().
 *
 * Returns: DAT_TRUE if successful, otherwise DAT_FALSE
 */
DAT_BOOLEAN
dapls_tavor_srq_wrid_resize(ib_srq_handle_t srq_handle, uint32_t new_size)
{
	dapls_tavor_wrid_list_hdr_t *wridlist;
	dapls_tavor_wrid_entry_t *old_wl_wre;
	dapls_tavor_wrid_entry_t *new_wl_wre;
	uint32_t *old_wl_freel;
	uint32_t *new_wl_freel;
	uint32_t old_size;
	uint32_t idx;
	uint32_t prev_idx;
	uint32_t i;

	wridlist = srq_handle->srq_wridlist;

	if (wridlist == NULL) {
		return (DAT_FALSE);
	}
	dapl_os_assert(wridlist->wl_srq_en);

	dapl_os_lock(&wridlist->wl_lock->wrl_lock);
	old_wl_wre = wridlist->wl_wre;
	old_wl_freel = wridlist->wl_free_list;
	old_size = wridlist->wl_size;

	new_wl_wre = (dapls_tavor_wrid_entry_t *)dapl_os_alloc(new_size *
	    sizeof (dapls_tavor_wrid_entry_t));
	if (new_wl_wre == NULL) {
		goto bail;
	}
	new_wl_freel = dapl_os_alloc(new_size * sizeof (uint32_t));
	if (new_wl_freel == NULL) {
		goto bail;
	}

	/*
	 * We just need to copy the old WREs to the new array.  Since the
	 * descriptors are relatively addressed, the descriptor-to-index
	 * mapping doesn't change.
	 */
	(void) dapl_os_memcpy(&new_wl_wre[0], &old_wl_wre[0],
	    old_size * sizeof (dapls_tavor_wrid_entry_t));
	/*
	 * Copy the old free list to the new one
	 */
	idx = wridlist->wl_freel_head;
	for (i = 0; i < wridlist->wl_freel_entries; i++) {
		new_wl_freel[i] = old_wl_freel[idx];
		idx = (idx + 1) % old_size;
	}

	/*
	 * Add the new entries in wl_wre to the new free list
	 */
	idx = wridlist->wl_freel_entries;
	new_wl_freel[idx] = wridlist->wl_srq_desc_addr + old_size *
	    wridlist->wl_srq_wqesz;
	prev_idx = idx;
	idx = (idx + 1) % new_size;
	for (i = 0; i < new_size - old_size - 1; i++) {
		new_wl_freel[idx] = new_wl_freel[prev_idx] +
		    wridlist->wl_srq_wqesz;
		prev_idx = idx;
		idx = (idx + 1) % new_size;
	}

	wridlist->wl_size = new_size;
	wridlist->wl_wre = new_wl_wre;
	wridlist->wl_free_list = new_wl_freel;
	wridlist->wl_freel_head = 0;
	wridlist->wl_freel_tail = idx;
	wridlist->wl_freel_entries = wridlist->wl_freel_entries + new_size -
	    old_size;

	dapl_os_unlock(&wridlist->wl_lock->wrl_lock);

	if (old_wl_wre) {
		dapl_os_free(old_wl_wre, old_size *
		    sizeof (dapls_tavor_wrid_entry_t));
	}
	if (old_wl_freel) {
		dapl_os_free(old_wl_freel, old_size * sizeof (uint32_t));
	}
	return (DAT_TRUE);
bail:
	dapl_os_unlock(&wridlist->wl_lock->wrl_lock);
	if (new_wl_wre) {
		dapl_os_free(new_wl_wre, new_size *
		    sizeof (dapls_tavor_wrid_entry_t));
	}
	return (DAT_FALSE);
}