/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * hermon_wr.c
 *    Hermon Work Request Processing Routines
 *
 *    Implements all the routines necessary to provide the PostSend(),
 *    PostRecv() and PostSRQ() verbs.  Also contains all the code
 *    necessary to implement the Hermon WRID tracking mechanism.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/avl.h>

#include <sys/ib/adapters/hermon/hermon.h>

static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc);
static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint64_t *desc);
static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq,
    uint_t qpn, uint_t send_or_recv);
static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
    hermon_workq_avl_t *wqavl);
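
/*
 * A "null" scatter entry, used by hermon_wqe_recv_build() and
 * hermon_wqe_srq_build() below to terminate the scatter list of a WQE
 * that uses fewer than the maximum number of data segments.  The
 * hardware cannot encode a zero "byte_cnt" (that encoding means 2GB),
 * so an explicitly invalid entry -- zero length, with what appears to
 * be the invalid-lkey sentinel value 0x00000100 -- is posted instead,
 * per the PRM.
 */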
static ibt_wr_ds_t null_sgl = { 0, 0x00000100, 0 };

/*
 * Add ability to try to debug RDMA_READ/RDMA_WRITE failures.
 *
 *	0x1 - print rkey used during post_send
 *	0x2 - print sgls used during post_send
 *	0x4 - print FMR comings and goings
 */
int hermon_rdma_debug = 0x0;

static int
hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	hermon_hw_snd_wqe_ud_t		*ud;
	hermon_workq_hdr_t		*wq;
	hermon_ahhdl_t			ah;
	ibt_wr_rfci_send_t		*rfci;
	ibt_wr_init_send_t		*is;
	ibt_ud_dest_t			*dest;
	uint64_t			*desc;
	uint32_t			desc_sz;
	uint32_t			signaled_dbd, solicited;
	uint32_t			head, tail, next_tail, qsize_msk;
	uint32_t			hdrmwqes;
	uint32_t			nopcode, fence, immed_data = 0;
	hermon_hw_wqe_sgl_t		*ds, *old_ds;
	ibt_wr_ds_t			*sgl;
	int				nds;
	int				i, j, last_ds, num_ds, status;
	uint32_t			*wqe_start;
	int				sectperwqe;
	uint_t				posted_cnt = 0;
	int				total_len, strong_order, fc_bits, cksum;

	/* initialize the FMA retry loop */
	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);

	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))

	/* Grab the lock for the WRID list */
	membar_consumer();

	/* Save away some initial QP state */
	wq = qp->qp_sq_wqhdr;
	qsize_msk = wq->wq_mask;
	hdrmwqes = qp->qp_sq_hdrmwqes;		/* in WQEs */
	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);

	tail = wq->wq_tail;
	head = wq->wq_head;
	status = DDI_SUCCESS;

post_next:
	/*
	 * Check for "queue full" condition.  If the queue
	 * is already full, then no more WQEs can be posted.
	 * So break out, ring a doorbell (if necessary) and
	 * return an error
	 */
	if (wq->wq_full != 0) {
		status = IBT_QP_FULL;
		goto done;
	}

	next_tail = (tail + 1) & qsize_msk;
	if (((tail + hdrmwqes) & qsize_msk) == head) {
		wq->wq_full = 1;
	}

	desc = HERMON_QP_SQ_ENTRY(qp, tail);

	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;
	strong_order = 0;
	fc_bits = 0;
	cksum = 0;

	/*
	 * Build a Send or Send_LSO WQE
	 */
	switch (wr->wr_opcode) {
	case IBT_WRC_SEND_LSO:
		if (wr->wr_trans != IBT_UD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
		if (wr->wr_flags & IBT_WR_SEND_CKSUM)
			cksum = 0x30;
		if (wr->wr.ud_lso.lso_hdr_sz > 60) {
			nopcode |= (1 << 6);	/* ReRead bit must be set */
		}
		dest = wr->wr.ud_lso.lso_ud_dest;
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			status = IBT_AH_HDL_INVALID;
			goto done;
		}
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));
		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);

		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
		if ((uintptr_t)ds + total_len + (nds * 16) >
		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
			status = IBT_QP_SGL_LEN_INVALID;
			goto done;
		}
		old_ds = ds;
		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
		    wr->wr.ud_lso.lso_hdr_sz);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
		i = 0;
		break;
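
	/*
	 * For an ordinary Send, the destination comes either from a UD
	 * destination handle or, for RFCI QPs, from the RFCI send request
	 * (which may also fold the FC EOF value and the FCRC bit into the
	 * opcode).  Immediate data, when requested, rides in the control
	 * segment that is stamped below.
	 */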
	case IBT_WRC_SEND:
		nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
		if (qp->qp_serv_type == HERMON_QP_UD) {
			if (wr->wr_trans != IBT_UD_SRV) {
				status = IBT_QP_SRV_TYPE_INVALID;
				goto done;
			}
			if (wr->wr_flags & IBT_WR_SEND_CKSUM)
				cksum = 0x30;
			dest = wr->wr.ud.udwr_dest;
		} else if (qp->qp_serv_type == HERMON_QP_RFCI) {
			if (wr->wr_trans != IBT_RFCI_SRV) {
				status = IBT_QP_SRV_TYPE_INVALID;
				goto done;
			}
			rfci = &wr->wr.fc.rfci_send;
			if ((wr->wr_flags & IBT_WR_SEND_FC_CRC) != 0) {
				nopcode |= (rfci->rfci_eof << 16);
				fc_bits = 0x40;	/* set FCRC */
			}
			dest = rfci->rfci_dest;
		} else {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
			/* "|=" changes 0xa to 0xb without touching FCEOF */
			nopcode |= HERMON_WQE_SEND_NOPCODE_SENDI;
			immed_data = wr->wr.ud.udwr_immed;
		}
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			status = IBT_AH_HDL_INVALID;
			goto done;
		}
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));
		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
		i = 0;
		break;

	case IBT_WRC_INIT_SEND_FCMD:
		if (qp->qp_serv_type != HERMON_QP_FCMND) {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_trans != IBT_FCMD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_FCP_OPCODE_INIT_AND_SEND;
		is = wr->wr.fc.fc_is;
		dest = is->is_ctl.fc_dest;
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			status = IBT_AH_HDL_INVALID;
			goto done;
		}
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));
		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
		old_ds = ds;
		/* move ds beyond the FCP-3 Init Segment */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + 0x10);
		i = 0;
		break;

	case IBT_WRC_FAST_REG_PMR:
	{
		hermon_hw_snd_wqe_frwr_t	*frwr;

		if (qp->qp_serv_type != HERMON_QP_FCMND) {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_trans != IBT_FCMD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
		frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.fc.reg_pmr);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
		    sizeof (hermon_hw_snd_wqe_frwr_t));
		nds = 0;
		strong_order = 0x80;
		break;
	}

#if 0
	/* firmware does not support this */
	case IBT_WRC_LOCAL_INVALIDATE:
	{
		hermon_hw_snd_wqe_local_inv_t	*li;

		if (qp->qp_serv_type != HERMON_QP_FCMND) {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_trans != IBT_FCMD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
		li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_LI(qp, li, wr->wr.fc.li);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
		    sizeof (hermon_hw_snd_wqe_local_inv_t));
		nds = 0;
		strong_order = 0x80;
		break;
	}
#endif

	default:
		status = IBT_QP_OP_TYPE_INVALID;
		goto done;
	}

	if (nds > qp->qp_sq_sgl) {
		status = IBT_QP_SGL_LEN_INVALID;
		goto done;
	}
	for (last_ds = num_ds, j = i; j < nds; j++) {
		if (sgl[j].ds_len != 0)
			last_ds++;	/* real last ds of wqe to fill */
	}
	/* descriptor size, in 16-byte chunks */
	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
	for (j = nds; --j >= i; ) {
		if (sgl[j].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the current WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		last_ds--;
		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
	}
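
	/*
	 * Make the data segments visible before the LSO or FCP-3 header
	 * segments are stamped (and before the ownership bit is set
	 * below), so the hardware never parses a partially built WQE.
	 */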
	membar_producer();

	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
		    wr->wr.ud_lso.lso_hdr_sz);
	} else if (wr->wr_opcode == IBT_WRC_INIT_SEND_FCMD) {
		/* This sits in the STAMP, so must be set after setting SGL */
		HERMON_WQE_BUILD_FCP3_INIT(old_ds, is->is_ctl.fc_frame_ctrl,
		    is->is_cs_priority, is->is_tx_seq_id, is->is_fc_mtu,
		    is->is_dest_id, is->is_op, is->is_rem_exch,
		    is->is_exch_qp_idx);

		/* The following will be used in HERMON_WQE_SET_CTRL_SEGMENT */

		/* SIT bit in FCP-3 ctrl segment */
		desc_sz |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_SIT) ?
		    0x80 : 0;

		/* LS bit in FCP-3 ctrl segment */
		fc_bits |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_LAST_SEQ) ?
		    0x10000 : 0;
		fc_bits |= ((is->is_ctl.fc_routing_ctrl & 0xF) << 20) |
		    (is->is_ctl.fc_seq_id << 24);
		immed_data = is->is_ctl.fc_parameter;
	}

	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;

	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;

	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;

	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
	    solicited, signaled_dbd, cksum, qp, strong_order, fc_bits);

	wq->wq_wrid[tail] = wr->wr_id;

	tail = next_tail;

	/* Update some of the state in the QP */
	wq->wq_tail = tail;

	membar_producer();

	/* Now set the ownership bit and opcode (first dword). */
	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);

	posted_cnt++;
	if (--num_wr > 0) {
		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}

		wr++;
		goto post_next;
	}
done:
	if (posted_cnt != 0) {
		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

		membar_producer();

		/* the FMA retry loop starts for Hermon doorbell register. */
		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		HERMON_UAR_DOORBELL(state, uarhdl,
		    (uint64_t *)(void *)&state->hs_uar->send,
		    (uint64_t)qp->qp_ring);

		/* the FMA retry loop ends. */
		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}
	}
	if (num_posted != NULL)
		*num_posted = posted_cnt;

	mutex_exit(&qp->qp_sq_lock);
	return (status);

pio_error:
	mutex_exit(&qp->qp_sq_lock);
	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
	return (ibc_get_ci_failure(0));
}

static int
hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	uint64_t			*desc;
	hermon_workq_hdr_t		*wq;
	uint32_t			desc_sz;
	uint32_t			signaled_dbd, solicited;
	uint32_t			head, tail, next_tail, qsize_msk;
	uint32_t			hdrmwqes;
	int				status;
	uint32_t			nopcode, fence, immed_data = 0;
	hermon_hw_snd_wqe_remaddr_t	*rc;
	hermon_hw_snd_wqe_atomic_t	*at;
	hermon_hw_snd_wqe_bind_t	*bn;
	hermon_hw_snd_wqe_frwr_t	*frwr;
	hermon_hw_snd_wqe_local_inv_t	*li;
	hermon_hw_wqe_sgl_t		*ds;
	ibt_wr_ds_t			*sgl;
	int				nds;
	int				i, last_ds, num_ds;
	uint32_t			*wqe_start;
	int				sectperwqe;
	uint_t				posted_cnt = 0;
	int				strong_order;
	int				print_rdma;
	int				rlen;
	uint32_t			rkey;
	uint64_t			raddr;

	/* initialize the FMA retry loop */
	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);

	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))

	/* Save away some initial QP state */
	wq = qp->qp_sq_wqhdr;
	qsize_msk = wq->wq_mask;
	hdrmwqes = qp->qp_sq_hdrmwqes;		/* in WQEs */
	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);

	tail = wq->wq_tail;
	head = wq->wq_head;
	status = DDI_SUCCESS;
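
	/*
	 * Each RC send WQE built below consists of a control segment,
	 * then an optional remote-address, atomic, bind, FRWR, or
	 * local-invalidate segment (depending on the opcode), then the
	 * data segments.  The switch below advances "ds" past whichever
	 * header segments the opcode requires.
	 */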
post_next:
	print_rdma = 0;
	rlen = 0;
	strong_order = 0;

	/*
	 * Check for "queue full" condition.  If the queue
	 * is already full, then no more WQEs can be posted.
	 * So break out, ring a doorbell (if necessary) and
	 * return an error
	 */
	if (wq->wq_full != 0) {
		status = IBT_QP_FULL;
		goto done;
	}
	next_tail = (tail + 1) & qsize_msk;
	if (((tail + hdrmwqes) & qsize_msk) == head) {
		wq->wq_full = 1;
	}

	desc = HERMON_QP_SQ_ENTRY(qp, tail);

	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
	    sizeof (hermon_hw_snd_wqe_ctrl_t));
	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;

	if (wr->wr_trans != IBT_RC_SRV) {
		status = IBT_QP_SRV_TYPE_INVALID;
		goto done;
	}

	/*
	 * Validate the operation type.  For RC requests, we allow
	 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
	 * operations, and memory window "Bind"
	 */
	switch (wr->wr_opcode) {
	default:
		status = IBT_QP_OP_TYPE_INVALID;
		goto done;

	case IBT_WRC_SEND:
		if (wr->wr_flags & IBT_WR_SEND_REMOTE_INVAL) {
			nopcode = HERMON_WQE_SEND_NOPCODE_SND_INV;
			immed_data = wr->wr.rc.rcwr.send_inval;
		} else if (wr->wr_flags & IBT_WR_SEND_IMMED) {
			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
			immed_data = wr->wr.rc.rcwr.send_immed;
		} else {
			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
		}
		break;

	/*
	 * If this is an RDMA Read or RDMA Write request, then fill
	 * in the "Remote Address" header fields.
	 */
	case IBT_WRC_RDMAW:
		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
		} else {
			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
		}
		/* FALLTHROUGH */

	case IBT_WRC_RDMAR:
		if (wr->wr_opcode == IBT_WRC_RDMAR)
			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));

		/*
		 * Build the Remote Address Segment for the WQE, using
		 * the information from the RC work request.
		 */
		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);

		if (hermon_rdma_debug) {
			print_rdma = hermon_rdma_debug;
			rkey = wr->wr.rc.rcwr.rdma.rdma_rkey;
			raddr = wr->wr.rc.rcwr.rdma.rdma_raddr;
		}

		/* Update "ds" for filling in Data Segments (below) */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
		    sizeof (hermon_hw_snd_wqe_remaddr_t));
		break;

	/*
	 * If this is one of the Atomic type operations (i.e.
	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
	 * Address" header fields and the "Atomic" header fields.
	 */
	case IBT_WRC_CSWAP:
		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
		/* FALLTHROUGH */

	case IBT_WRC_FADD:
		if (wr->wr_opcode == IBT_WRC_FADD)
			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
		    sizeof (hermon_hw_snd_wqe_remaddr_t));

		/*
		 * Build the Remote Address and Atomic Segments for
		 * the WQE, using the information from the RC Atomic
		 * work request.
		 */
		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);

		/* Update "ds" for filling in Data Segments (below) */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
		    sizeof (hermon_hw_snd_wqe_atomic_t));

		/*
		 * Update "nds" and "sgl" because Atomic requests have
		 * only a single Data Segment.
		 */
		nds = 1;
		sgl = wr->wr_sgl;
		break;
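
	/*
	 * Note that for both Atomic operations above, the operands travel
	 * in the Atomic segment itself; the single data segment scatters
	 * the 8-byte value returned by the remote node.
	 */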
	/*
	 * If this is memory window Bind operation, then we call the
	 * hermon_wr_bind_check() routine to validate the request and
	 * to generate the updated RKey.  If this is successful, then
	 * we fill in the WQE's "Bind" header fields.
	 */
	case IBT_WRC_BIND:
		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
		status = hermon_wr_bind_check(state, wr);
		if (status != DDI_SUCCESS)
			goto done;

		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));

		/*
		 * Build the Bind Memory Window Segments for the WQE,
		 * using the information from the RC Bind memory
		 * window work request.
		 */
		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);

		/*
		 * Update the "ds" pointer.  Even though the "bind"
		 * operation requires no SGLs, this is necessary to
		 * facilitate the correct descriptor size calculations
		 * (below).
		 */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
		    sizeof (hermon_hw_snd_wqe_bind_t));
		nds = 0;
		break;

	case IBT_WRC_FAST_REG_PMR:
		nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
		frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.rc.rcwr.reg_pmr);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
		    sizeof (hermon_hw_snd_wqe_frwr_t));
		nds = 0;
		strong_order = 0x80;
		break;

	case IBT_WRC_LOCAL_INVALIDATE:
		nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
		li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_LI(qp, li, wr->wr.rc.rcwr.li);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
		    sizeof (hermon_hw_snd_wqe_local_inv_t));
		nds = 0;
		strong_order = 0x80;
		break;
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based
	 * on the values setup above (i.e. "sgl", "nds", and the "ds"
	 * pointer).  Start by checking for a valid number of SGL entries
	 */
	if (nds > qp->qp_sq_sgl) {
		status = IBT_QP_SGL_LEN_INVALID;
		goto done;
	}

	for (last_ds = num_ds, i = 0; i < nds; i++) {
		if (sgl[i].ds_len != 0)
			last_ds++;	/* real last ds of wqe to fill */
	}
	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
	for (i = nds; --i >= 0; ) {
		if (sgl[i].ds_len == 0) {
			continue;
		}
		rlen += sgl[i].ds_len;
		if (print_rdma & 0x2)
			IBTF_DPRINTF_L2("rdma", "post: [%d]: laddr %llx "
			    "llen %x", i, sgl[i].ds_va, sgl[i].ds_len);

		/*
		 * Fill in the Data Segment(s) for the current WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		last_ds--;
		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
	}

	/* ensure RDMA READ does not exceed HCA limit */
	if ((wr->wr_opcode == IBT_WRC_RDMAR) && (desc_sz >
	    state->hs_ibtfinfo.hca_attr->hca_conn_rdma_read_sgl_sz + 2)) {
		status = IBT_QP_SGL_LEN_INVALID;
		goto done;
	}

	if (print_rdma & 0x1) {
		IBTF_DPRINTF_L2("rdma", "post: indx %x rkey %x raddr %llx "
		    "total len %x", tail, rkey, raddr, rlen);
	}

	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;

	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;

	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;

	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
	    solicited, signaled_dbd, 0, qp, strong_order, 0);

	wq->wq_wrid[tail] = wr->wr_id;

	tail = next_tail;

	/* Update some of the state in the QP */
	wq->wq_tail = tail;

	membar_producer();

	/* Now set the ownership bit of the first one in the chain. */
	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);

	posted_cnt++;
	if (--num_wr > 0) {
		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}

		wr++;
		goto post_next;
	}
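
	/*
	 * The WQEs for this request have all been built (or an error
	 * stopped the loop early).  A single UAR doorbell write below
	 * hands the whole batch to the hardware, after which the headroom
	 * beyond the new tail is re-stamped.
	 */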
done:
	if (posted_cnt != 0) {
		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

		membar_producer();

		/* the FMA retry loop starts for Hermon doorbell register. */
		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		/* Ring the doorbell */
		HERMON_UAR_DOORBELL(state, uarhdl,
		    (uint64_t *)(void *)&state->hs_uar->send,
		    (uint64_t)qp->qp_ring);

		/* the FMA retry loop ends. */
		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}
	}

	/*
	 * Update the "num_posted" return value (if necessary).
	 * Then drop the locks and return success.
	 */
	if (num_posted != NULL) {
		*num_posted = posted_cnt;
	}

	mutex_exit(&qp->qp_sq_lock);
	return (status);

pio_error:
	mutex_exit(&qp->qp_sq_lock);
	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
	return (ibc_get_ci_failure(0));
}

/*
 * hermon_post_send()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	ibt_send_wr_t			*curr_wr;
	hermon_workq_hdr_t		*wq;
	hermon_ahhdl_t			ah;
	uint64_t			*desc, *prev;
	uint32_t			desc_sz;
	uint32_t			signaled_dbd, solicited;
	uint32_t			head, tail, next_tail, qsize_msk;
	uint32_t			hdrmwqes;
	uint_t				currindx, wrindx, numremain;
	uint_t				chainlen;
	uint_t				posted_cnt, maxstat;
	uint_t				total_posted;
	int				status;
	uint32_t			nopcode, fence, immed_data = 0;
	uint32_t			prev_nopcode;
	uint_t				qp_state;

	/* initialize the FMA retry loop */
	hermon_pio_init(fm_loop_cnt, fm_status, fm_test);

	/*
	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
	 * clients to post to QP memory that is accessible directly by the
	 * user.  If the QP memory is user accessible, then return an error.
	 */
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
		return (IBT_QP_HDL_INVALID);
	}

	mutex_enter(&qp->qp_sq_lock);

	/*
	 * Check QP state.  Can not post Send requests from the "Reset",
	 * "Init", or "RTR" states
	 */
	qp_state = qp->qp_state_for_post_send;
	if ((qp_state == HERMON_QP_RESET) ||
	    (qp_state == HERMON_QP_INIT) ||
	    (qp_state == HERMON_QP_RTR)) {
		mutex_exit(&qp->qp_sq_lock);
		return (IBT_QP_STATE_INVALID);
	}

	if (qp->qp_is_special)
		goto post_many;

	/* Use these optimized functions most of the time */
	if (qp->qp_type == IBT_UD_RQP) {
		return (hermon_post_send_ud(state, qp, wr, num_wr,
		    num_posted));
	}

	if (qp->qp_serv_type == HERMON_QP_RC) {
		return (hermon_post_send_rc(state, qp, wr, num_wr,
		    num_posted));
	}

	if (qp->qp_serv_type == HERMON_QP_UC)
		goto post_many;

	mutex_exit(&qp->qp_sq_lock);
	return (IBT_QP_SRV_TYPE_INVALID);

post_many:
	/* general loop for non-optimized posting */

	/* Save away some initial QP state */
	wq = qp->qp_sq_wqhdr;
	qsize_msk = wq->wq_mask;
	tail = wq->wq_tail;
	head = wq->wq_head;
	hdrmwqes = qp->qp_sq_hdrmwqes;		/* in WQEs */

	/* Initialize posted_cnt */
	posted_cnt = 0;
	total_posted = 0;
	/*
	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
	 * request and build a Send WQE.  NOTE:  Because we are potentially
	 * building a chain of WQEs to post, we want to build them all first,
	 * and set the valid (HW Ownership) bit on all but the first.
	 * However, we do not want to validate the first one until the
	 * entire chain of WQEs has been built.  Then, in the final step,
	 * we set the valid bit in the first, flush if needed, and ring
	 * the appropriate doorbell.  NOTE: the doorbell ring may NOT be
	 * needed if the HCA is already processing, but the doorbell ring
	 * will be done regardless.  NOTE ALSO:  It is possible for more
	 * Work Requests to be posted than the HW will support at one shot.
	 * If this happens, we need to be able to post and ring several
	 * chains here until the entire request is complete.
	 * NOTE ALSO:  the term "chain" is used to differentiate it from
	 * the Work Request List passed in; and because that's the
	 * terminology from the previous generations of HCA - but the WQEs
	 * are not, in fact, chained together for Hermon.
	 */
	wrindx = 0;
	numremain = num_wr;
	status = DDI_SUCCESS;
	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
		/*
		 * For the first WQE on a new chain we need "prev" to point
		 * to the current descriptor.
		 */
		prev = HERMON_QP_SQ_ENTRY(qp, tail);

		/*
		 * Break the request up into lists that are less than or
		 * equal to the maximum number of WQEs that can be posted
		 * per doorbell ring - 256 currently
		 */
		chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
		    HERMON_QP_MAXDESC_PER_DB : numremain;
		numremain -= chainlen;

		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
			/*
			 * Check for "queue full" condition.  If the queue
			 * is already full, then no more WQEs can be posted.
			 * So break out, ring a doorbell (if necessary) and
			 * return an error
			 */
			if (wq->wq_full != 0) {
				status = IBT_QP_FULL;
				break;
			}

			/*
			 * Increment the "tail index".  Check for "queue
			 * full" condition incl. headroom.  If we detect that
			 * the current work request is going to fill the work
			 * queue, then we mark this condition and continue.
			 * Don't need >=, because going one-by-one we have to
			 * hit it exactly sooner or later
			 */
			next_tail = (tail + 1) & qsize_msk;
			if (((tail + hdrmwqes) & qsize_msk) == head) {
				wq->wq_full = 1;
			}

			/*
			 * Get the address of the location where the next
			 * Send WQE should be built
			 */
			desc = HERMON_QP_SQ_ENTRY(qp, tail);

			/*
			 * Call hermon_wqe_send_build() to build the WQE
			 * at the given address.  This routine uses the
			 * information in the ibt_send_wr_t list (wr[]) and
			 * returns the size of the WQE when it returns.
			 */
			status = hermon_wqe_send_build(state, qp,
			    &wr[wrindx], desc, &desc_sz);
			if (status != DDI_SUCCESS) {
				break;
			}

			/*
			 * Now, build the Ctrl Segment based on
			 * what was just done
			 */
			curr_wr = &wr[wrindx];

			switch (curr_wr->wr_opcode) {
			case IBT_WRC_RDMAW:
				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
					nopcode =
					    HERMON_WQE_SEND_NOPCODE_RDMAWI;
					immed_data =
					    hermon_wr_get_immediate(curr_wr);
				} else {
					nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
				}
				break;

			case IBT_WRC_SEND:
				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
					nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
					immed_data =
					    hermon_wr_get_immediate(curr_wr);
				} else {
					nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
				}
				break;

			case IBT_WRC_SEND_LSO:
				nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
				break;

			case IBT_WRC_RDMAR:
				nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
				break;

			case IBT_WRC_CSWAP:
				nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
				break;

			case IBT_WRC_FADD:
				nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
				break;

			case IBT_WRC_BIND:
				nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
				break;
			}
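
			/*
			 * Translate the work request flags into control
			 * segment fields: fence, signaled/doorbell, and
			 * solicited event.
			 */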
			fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ?
			    1 : 0;

			/*
			 * now, build up the control segment, leaving the
			 * owner bit as it is
			 */
			if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
			    (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
				signaled_dbd = 0xC;
			} else {
				signaled_dbd = 0;
			}
			if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
				solicited = 0x2;
			else
				solicited = 0;

			if (qp->qp_is_special) {
				/* Ensure correctness, set the ReRead bit */
				nopcode |= (1 << 6);
				ah = (hermon_ahhdl_t)
				    curr_wr->wr.ud.udwr_dest->ud_ah;
				mutex_enter(&ah->ah_lock);
				maxstat = ah->ah_udav->max_stat_rate;
				HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
				    signaled_dbd, maxstat, ah->ah_udav->rlid,
				    qp, ah->ah_udav->sl);
				mutex_exit(&ah->ah_lock);
			} else {
				HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
				    fence, immed_data, solicited,
				    signaled_dbd, 0, qp, 0, 0);
			}
			wq->wq_wrid[tail] = curr_wr->wr_id;

			/*
			 * If this is not the first descriptor on the current
			 * chain, then set the ownership bit.
			 */
			if (currindx != 0) {		/* not the first */
				membar_producer();
				HERMON_SET_SEND_WQE_OWNER(qp,
				    (uint32_t *)desc, nopcode);
			} else
				prev_nopcode = nopcode;

			/*
			 * Update the current "tail index" and increment
			 * "posted_cnt"
			 */
			tail = next_tail;
			posted_cnt++;
		}

		/*
		 * If we reach here and there are one or more WQEs which have
		 * been successfully built as a chain, we have to finish up
		 * and prepare them for writing to the HW.
		 * The steps are:
		 *	1. do the headroom fixup
		 *	2. add in the size of the headroom for the sync
		 *	3. write the owner bit for the first WQE
		 *	4. sync them
		 *	5. fix up the structures
		 *	6. hit the doorbell in UAR
		 */
		if (posted_cnt != 0) {
			ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

			/* do the invalidate of the headroom */
			hermon_wqe_headroom(tail, qp);

			/* Update some of the state in the QP */
			wq->wq_tail = tail;
			total_posted += posted_cnt;
			posted_cnt = 0;

			membar_producer();

			/*
			 * Now set the ownership bit of the first
			 * one in the chain
			 */
			HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
			    prev_nopcode);

			/* the FMA retry loop starts for Hermon doorbell. */
			hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
			    fm_status, fm_test);

			HERMON_UAR_DOORBELL(state, uarhdl,
			    (uint64_t *)(void *)&state->hs_uar->send,
			    (uint64_t)qp->qp_ring);

			/* the FMA retry loop ends. */
			hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
			    fm_status, fm_test);
		}
	}

	/*
	 * Update the "num_posted" return value (if necessary).
	 * Then drop the locks and return success.
	 */
	if (num_posted != NULL) {
		*num_posted = total_posted;
	}

	mutex_exit(&qp->qp_sq_lock);
	return (status);

pio_error:
	mutex_exit(&qp->qp_sq_lock);
	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
	return (ibc_get_ci_failure(0));
}

/*
 * hermon_post_recv()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	uint64_t			*desc;
	hermon_workq_hdr_t		*wq;
	uint32_t			head, tail, next_tail, qsize_msk;
	uint_t				wrindx;
	uint_t				posted_cnt;
	int				status;

	/*
	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
	 * clients to post to QP memory that is accessible directly by the
	 * user.  If the QP memory is user accessible, then return an error.
	 */
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
		return (IBT_QP_HDL_INVALID);
	}

	/* Initialize posted_cnt */
	posted_cnt = 0;

	mutex_enter(&qp->qp_lock);

	/*
	 * Check if QP is associated with an SRQ
	 */
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_lock);
		return (IBT_SRQ_IN_USE);
	}
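
	/*
	 * Unlike the send queue, the receive queue has no UAR doorbell:
	 * after the posting loop below, the hardware is informed of new
	 * WQEs by an update to the doorbell record with the current WQE
	 * counter.
	 */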
	/*
	 * Check QP state.  Can not post Recv requests from the "Reset" state
	 */
	if (qp->qp_state == HERMON_QP_RESET) {
		mutex_exit(&qp->qp_lock);
		return (IBT_QP_STATE_INVALID);
	}

	/* Check that work request transport type is valid */
	if ((qp->qp_type != IBT_UD_RQP) &&
	    (qp->qp_serv_type != HERMON_QP_RC) &&
	    (qp->qp_serv_type != HERMON_QP_UC)) {
		mutex_exit(&qp->qp_lock);
		return (IBT_QP_SRV_TYPE_INVALID);
	}

	/*
	 * Grab the lock for the WRID list, i.e., membar_consumer().
	 * This is not needed because the mutex_enter() above has
	 * the same effect.
	 */

	/* Save away some initial QP state */
	wq = qp->qp_rq_wqhdr;
	qsize_msk = wq->wq_mask;
	tail = wq->wq_tail;
	head = wq->wq_head;

	wrindx = 0;
	status = DDI_SUCCESS;

	for (wrindx = 0; wrindx < num_wr; wrindx++) {
		if (wq->wq_full != 0) {
			status = IBT_QP_FULL;
			break;
		}
		next_tail = (tail + 1) & qsize_msk;
		if (next_tail == head) {
			wq->wq_full = 1;
		}
		desc = HERMON_QP_RQ_ENTRY(qp, tail);
		status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
		if (status != DDI_SUCCESS) {
			break;
		}

		wq->wq_wrid[tail] = wr[wrindx].wr_id;
		qp->qp_rq_wqecntr++;

		tail = next_tail;
		posted_cnt++;
	}

	if (posted_cnt != 0) {

		wq->wq_tail = tail;

		membar_producer();	/* ensure wrids are visible */

		/* Update the doorbell record w/ wqecntr */
		HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
		    qp->qp_rq_wqecntr & 0xFFFF);
	}

	if (num_posted != NULL) {
		*num_posted = posted_cnt;
	}

	mutex_exit(&qp->qp_lock);
	return (status);
}

/*
 * hermon_post_srq()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	uint64_t			*desc;
	hermon_workq_hdr_t		*wq;
	uint_t				indx, wrindx;
	uint_t				posted_cnt;
	int				status;

	mutex_enter(&srq->srq_lock);

	/*
	 * Check for user-mappable SRQ memory.  Note:  We do not allow kernel
	 * clients to post to SRQ memory that is accessible directly by the
	 * user.  If the SRQ memory is user accessible, then return an error.
	 */
	if (srq->srq_is_umap) {
		mutex_exit(&srq->srq_lock);
		return (IBT_SRQ_HDL_INVALID);
	}

	/*
	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
	 */
	if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
		mutex_exit(&srq->srq_lock);
		return (IBT_QP_STATE_INVALID);
	}

	status = DDI_SUCCESS;
	posted_cnt = 0;
	wq = srq->srq_wq_wqhdr;
	indx = wq->wq_head;

	for (wrindx = 0; wrindx < num_wr; wrindx++) {

		if (indx == wq->wq_tail) {
			status = IBT_QP_FULL;
			break;
		}
		desc = HERMON_SRQ_WQE_ADDR(srq, indx);

		wq->wq_wrid[indx] = wr[wrindx].wr_id;

		status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
		if (status != DDI_SUCCESS) {
			break;
		}

		posted_cnt++;
		indx = htons(((uint16_t *)desc)[1]);
		wq->wq_head = indx;
	}

	if (posted_cnt != 0) {

		srq->srq_wq_wqecntr += posted_cnt;

		membar_producer();	/* ensure wrids are visible */

		/* Ring the doorbell w/ wqecntr */
		HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
		    srq->srq_wq_wqecntr & 0xFFFF);
	}

	if (num_posted != NULL) {
		*num_posted = posted_cnt;
	}

	mutex_exit(&srq->srq_lock);
	return (status);
}
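
/*
 * Note on the SRQ work queue: free WQEs are kept on a list threaded
 * through the "next" field of each WQE.  hermon_post_srq() above follows
 * that list from wq_head (reading the on-WQE index of the next free
 * entry), while hermon_wrid_get_entry() below returns completed entries
 * to the list at wq_tail.
 */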
/*
 * hermon_wqe_send_build()
 *    Context: Can be called from interrupt or base context.
 */
static int
hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
	hermon_hw_snd_wqe_ud_t		*ud;
	hermon_hw_snd_wqe_remaddr_t	*rc;
	hermon_hw_snd_wqe_atomic_t	*at;
	hermon_hw_snd_wqe_remaddr_t	*uc;
	hermon_hw_snd_wqe_bind_t	*bn;
	hermon_hw_wqe_sgl_t		*ds, *old_ds;
	ibt_ud_dest_t			*dest;
	ibt_wr_ds_t			*sgl;
	hermon_ahhdl_t			ah;
	uint32_t			nds;
	int				i, j, last_ds, num_ds, status;
	int				tmpsize;

	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));

	/* Initialize the information for the Data Segments */
	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
	    sizeof (hermon_hw_snd_wqe_ctrl_t));
	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;
	i = 0;

	/*
	 * How we build a Send WQE depends first and foremost on the
	 * transport type of the Work Request (i.e. UD, RC, or UC)
	 */
	switch (wr->wr_trans) {
	case IBT_UD_SRV:
		/* Ensure that work request transport type matches QP type */
		if (qp->qp_serv_type != HERMON_QP_UD) {
			return (IBT_QP_SRV_TYPE_INVALID);
		}

		/*
		 * Validate the operation type.  For UD requests, only the
		 * "Send" and "Send LSO" operations are valid.
		 */
		if (wr->wr_opcode != IBT_WRC_SEND &&
		    wr->wr_opcode != IBT_WRC_SEND_LSO) {
			return (IBT_QP_OP_TYPE_INVALID);
		}

		/*
		 * If this is a Special QP (QP0 or QP1), then we need to
		 * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
		 * and return whatever status it returns
		 */
		if (qp->qp_is_special) {
			if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
				return (IBT_QP_OP_TYPE_INVALID);
			}
			status = hermon_wqe_mlx_build(state, qp,
			    wr, desc, size);
			return (status);
		}

		/*
		 * Otherwise, if this is a normal UD Send request, then fill
		 * all the fields in the Hermon UD header for the WQE.  Note:
		 * to do this we'll need to extract some information from the
		 * Address Handle passed with the work request.
		 */
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		if (wr->wr_opcode == IBT_WRC_SEND) {
			dest = wr->wr.ud.udwr_dest;
		} else {
			dest = wr->wr.ud_lso.lso_ud_dest;
		}
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			return (IBT_AH_HDL_INVALID);
		}

		/*
		 * Build the Unreliable Datagram Segment for the WQE, using
		 * the information from the address handle and the work
		 * request.
		 */
		/* mutex_enter(&ah->ah_lock); */
		if (wr->wr_opcode == IBT_WRC_SEND) {
			HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
		} else {	/* IBT_WRC_SEND_LSO */
			HERMON_WQE_BUILD_UD(qp, ud, ah,
			    wr->wr.ud_lso.lso_ud_dest);
		}
		/* mutex_exit(&ah->ah_lock); */

		/* Update "ds" for filling in Data Segments (below) */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));

		if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
			int total_len;

			total_len =
			    (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
			if ((uintptr_t)ds + total_len + (nds * 16) >
			    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
				return (IBT_QP_SGL_LEN_INVALID);

			bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
			    wr->wr.ud_lso.lso_hdr_sz);
			old_ds = ds;
			ds = (hermon_hw_wqe_sgl_t *)
			    ((uintptr_t)ds + total_len);
			/*
			 * Fill in the first (non-empty) data segment here;
			 * the remaining SGL entries are handled by the
			 * common code below, which picks up at index "i".
			 */
			for (; i < nds; i++) {
				if (sgl[i].ds_len == 0)
					continue;
				HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
				    &sgl[i]);
				num_ds++;
				i++;
				break;
			}
			membar_producer();
			HERMON_WQE_BUILD_LSO(qp, old_ds,
			    wr->wr.ud_lso.lso_mss, wr->wr.ud_lso.lso_hdr_sz);
		}

		break;
	case IBT_RC_SRV:
		/* Ensure that work request transport type matches QP type */
		if (qp->qp_serv_type != HERMON_QP_RC) {
			return (IBT_QP_SRV_TYPE_INVALID);
		}

		/*
		 * Validate the operation type.  For RC requests, we allow
		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
		 * operations, and memory window "Bind"
		 */
		if ((wr->wr_opcode != IBT_WRC_SEND) &&
		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
		    (wr->wr_opcode != IBT_WRC_FADD) &&
		    (wr->wr_opcode != IBT_WRC_BIND)) {
			return (IBT_QP_OP_TYPE_INVALID);
		}

		/*
		 * If this is a Send request, then all we need to do is break
		 * out here and begin the Data Segment processing below
		 */
		if (wr->wr_opcode == IBT_WRC_SEND) {
			break;
		}

		/*
		 * If this is an RDMA Read or RDMA Write request, then fill
		 * in the "Remote Address" header fields.
		 */
		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
			    sizeof (hermon_hw_snd_wqe_ctrl_t));

			/*
			 * Build the Remote Address Segment for the WQE, using
			 * the information from the RC work request.
			 */
			HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);

			/* Update "ds" for filling in Data Segments (below) */
			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
			    sizeof (hermon_hw_snd_wqe_remaddr_t));
			break;
		}

		/*
		 * If this is one of the Atomic type operations (i.e.
		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
		 * Address" header fields and the "Atomic" header fields.
		 */
		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
		    (wr->wr_opcode == IBT_WRC_FADD)) {
			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
			    sizeof (hermon_hw_snd_wqe_ctrl_t));
			at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
			    sizeof (hermon_hw_snd_wqe_remaddr_t));

			/*
			 * Build the Remote Address and Atomic Segments for
			 * the WQE, using the information from the RC Atomic
			 * work request.
			 */
			HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
			HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);

			/* Update "ds" for filling in Data Segments (below) */
			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
			    sizeof (hermon_hw_snd_wqe_atomic_t));

			/*
			 * Update "nds" and "sgl" because Atomic requests have
			 * only a single Data Segment (and they are encoded
			 * somewhat differently in the work request).
			 */
			nds = 1;
			sgl = wr->wr_sgl;
			break;
		}

		/*
		 * If this is memory window Bind operation, then we call the
		 * hermon_wr_bind_check() routine to validate the request and
		 * to generate the updated RKey.  If this is successful, then
		 * we fill in the WQE's "Bind" header fields.
		 */
		if (wr->wr_opcode == IBT_WRC_BIND) {
			status = hermon_wr_bind_check(state, wr);
			if (status != DDI_SUCCESS) {
				return (status);
			}
			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
			    sizeof (hermon_hw_snd_wqe_ctrl_t));

			/*
			 * Build the Bind Memory Window Segments for the WQE,
			 * using the information from the RC Bind memory
			 * window work request.
			 */
			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);

			/*
			 * Update the "ds" pointer.  Even though the "bind"
			 * operation requires no SGLs, this is necessary to
			 * facilitate the correct descriptor size calculations
			 * (below).
			 */
			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
			    sizeof (hermon_hw_snd_wqe_bind_t));
			nds = 0;
		}
		break;
	case IBT_UC_SRV:
		/* Ensure that work request transport type matches QP type */
		if (qp->qp_serv_type != HERMON_QP_UC) {
			return (IBT_QP_SRV_TYPE_INVALID);
		}

		/*
		 * Validate the operation type.  For UC requests, we only
		 * allow "Send", "RDMA Write", and memory window "Bind".
		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
		 * operations
		 */
		if ((wr->wr_opcode != IBT_WRC_SEND) &&
		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
		    (wr->wr_opcode != IBT_WRC_BIND)) {
			return (IBT_QP_OP_TYPE_INVALID);
		}

		/*
		 * If this is a Send request, then all we need to do is break
		 * out here and begin the Data Segment processing below
		 */
		if (wr->wr_opcode == IBT_WRC_SEND) {
			break;
		}

		/*
		 * If this is an RDMA Write request, then fill in the "Remote
		 * Address" header fields.
		 */
		if (wr->wr_opcode == IBT_WRC_RDMAW) {
			uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
			    sizeof (hermon_hw_snd_wqe_ctrl_t));

			/*
			 * Build the Remote Address Segment for the WQE, using
			 * the information from the UC work request.
			 */
			HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);

			/* Update "ds" for filling in Data Segments (below) */
			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
			    sizeof (hermon_hw_snd_wqe_remaddr_t));
			break;
		}

		/*
		 * If this is memory window Bind operation, then we call the
		 * hermon_wr_bind_check() routine to validate the request and
		 * to generate the updated RKey.  If this is successful, then
		 * we fill in the WQE's "Bind" header fields.
		 */
		if (wr->wr_opcode == IBT_WRC_BIND) {
			status = hermon_wr_bind_check(state, wr);
			if (status != DDI_SUCCESS) {
				return (status);
			}
			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
			    sizeof (hermon_hw_snd_wqe_ctrl_t));

			/*
			 * Build the Bind Memory Window Segments for the WQE,
			 * using the information from the UC Bind memory
			 * window work request.
			 */
			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);

			/*
			 * Update the "ds" pointer.  Even though the "bind"
			 * operation requires no SGLs, this is necessary to
			 * facilitate the correct descriptor size calculations
			 * (below).
			 */
			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
			    sizeof (hermon_hw_snd_wqe_bind_t));
			nds = 0;
		}
		break;

	default:
		return (IBT_QP_SRV_TYPE_INVALID);
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based on
	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries
	 */
	if (nds > qp->qp_sq_sgl) {
		return (IBT_QP_SGL_LEN_INVALID);
	}

	/*
	 * For each SGL in the Send Work Request, fill in the Send WQE's data
	 * segments.  Note: We skip any SGL with zero size because Hermon
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.
	 */
	for (last_ds = num_ds, j = i; j < nds; j++) {
		if (sgl[j].ds_len != 0)
			last_ds++;	/* real last ds of wqe to fill */
	}

	/*
	 * Return the size of descriptor (in 16-byte chunks)
	 * For Hermon, we want them (for now) to be on stride size
	 * boundaries, which was implicit in Tavor/Arbel
	 */
	tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
	*size = tmpsize >> 0x4;

	for (j = nds; --j >= i; ) {
		if (sgl[j].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the current WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		last_ds--;
		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
	}

	return (DDI_SUCCESS);
}
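
/*
 * Special QP (QP0/QP1) sends bypass the normal transport: the driver
 * itself builds the raw LRH/GRH/BTH/DETH packet headers inline in the
 * WQE, which is what hermon_wqe_mlx_build() below does.
 */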
/*
 * hermon_wqe_mlx_build()
 *    Context: Can be called from interrupt or base context.
 */
static int
hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
	hermon_ahhdl_t		ah;
	hermon_hw_udav_t	*udav;
	ib_lrh_hdr_t		*lrh;
	ib_grh_t		*grh;
	ib_bth_hdr_t		*bth;
	ib_deth_hdr_t		*deth;
	hermon_hw_wqe_sgl_t	*ds;
	ibt_wr_ds_t		*sgl;
	uint8_t			*mgmtclass, *hpoint, *hcount;
	uint32_t		nds, offset, pktlen;
	uint32_t		desc_sz;
	int			i, num_ds;
	int			tmpsize;

	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));

	/* Initialize the information for the Data Segments */
	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
	    sizeof (hermon_hw_mlx_wqe_nextctrl_t));

	/*
	 * Pull the address handle from the work request.  The UDAV will
	 * be used to answer some questions about the request.
	 */
	ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
	if (ah == NULL) {
		return (IBT_AH_HDL_INVALID);
	}
	mutex_enter(&ah->ah_lock);
	udav = ah->ah_udav;

	/*
	 * If the request is for QP1 and the destination LID is equal to
	 * the Permissive LID, then return an error.  This combination is
	 * not allowed
	 */
	if ((udav->rlid == IB_LID_PERMISSIVE) &&
	    (qp->qp_is_special == HERMON_QP_GSI)) {
		mutex_exit(&ah->ah_lock);
		return (IBT_AH_HDL_INVALID);
	}

	/*
	 * Calculate the size of the packet headers, including the GRH
	 * (if necessary)
	 */
	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
	    sizeof (ib_deth_hdr_t);
	if (udav->grh) {
		desc_sz += sizeof (ib_grh_t);
	}

	/*
	 * Begin to build the first "inline" data segment for the packet
	 * headers.  Note:  By specifying "inline" we can build the contents
	 * of the MAD packet headers directly into the work queue (as part
	 * of the descriptor).  This has the advantage of both speeding
	 * things up and of not requiring the driver to allocate/register
	 * any additional memory for the packet headers.
	 */
	HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
	desc_sz += 4;

	/*
	 * Build Local Route Header (LRH)
	 *    We start here by building the LRH into a temporary location.
	 *    When we have finished we copy the LRH data into the descriptor.
	 *
	 *    Notice that the VL values are hardcoded.  This is not a problem
	 *    because VL15 is decided later based on the value in the MLX
	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
	 *    values.  This rule does not hold for loopback packets however
	 *    (all of which bypass the SL-to-VL tables) and it is the reason
	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
	 *
	 *    Notice also that Source LID is hardcoded to the Permissive LID
	 *    (0xFFFF).  This is also not a problem because if the Destination
	 *    LID is not the Permissive LID, then the "slr" value in the MLX
	 *    transport "next/ctrl" header will be set to zero and the
	 *    hardware will pull the LID from value in the port.
	 */
	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
	pktlen = (desc_sz + 0x100) >> 2;
	HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
	/*
	 * Build Global Route Header (GRH)
	 *    This is only built if necessary as defined by the "grh" bit in
	 *    the address vector.  Note:  We also calculate the offset to the
	 *    next header (BTH) based on whether or not the "grh" bit is set.
	 */
	if (udav->grh) {
		/*
		 * If the request is for QP0, then return an error.  The
		 * combination of global routing (GRH) and QP0 is not allowed.
		 */
		if (qp->qp_is_special == HERMON_QP_SMI) {
			mutex_exit(&ah->ah_lock);
			return (IBT_AH_HDL_INVALID);
		}
		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
		HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);

		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
	} else {
		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
	}
	mutex_exit(&ah->ah_lock);

	/*
	 * Build Base Transport Header (BTH)
	 *    Notice that the M, PadCnt, and TVer fields are all set
	 *    to zero implicitly.  This is true for all Management
	 *    Datagrams (MADs), whether GSI or SMI.
	 */
	HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);

	/*
	 * Build Datagram Extended Transport Header (DETH)
	 */
	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
	HERMON_WQE_BUILD_MLX_DETH(deth, qp);

	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
	ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;

	/*
	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries
	 */
	if (nds > qp->qp_sq_sgl) {
		return (IBT_QP_SGL_LEN_INVALID);
	}

	/*
	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
	 * segments.  Note: We skip any SGL with zero size because Hermon
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	mgmtclass = hpoint = hcount = NULL;
	offset = 0;
	for (i = 0; i < nds; i++) {
		if (sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the MLX send WQE, using
		 * the information contained in the scatter-gather list of
		 * the work request.
		 */
		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);

		/*
		 * Search through the contents of all MADs posted to QP0 to
		 * initialize pointers to the places where Directed Route "hop
		 * pointer", "hop count", and "mgmtclass" would be.  Hermon
		 * needs these updated (i.e. incremented or decremented, as
		 * necessary) by software.
		 */
		if (qp->qp_is_special == HERMON_QP_SMI) {

			HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
			    offset, sgl[i].ds_va, sgl[i].ds_len);

			HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
			    offset, sgl[i].ds_va, sgl[i].ds_len);

			HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
			    offset, sgl[i].ds_va, sgl[i].ds_len);

			offset += sgl[i].ds_len;
		}
		num_ds++;
	}

	/*
	 * Hermon's Directed Route MADs need to have the "hop pointer"
	 * incremented/decremented (as necessary) depending on whether it is
	 * currently less than or greater than the "hop count" (i.e. whether
	 * the MAD is a request or a response.)
	 */
	if (qp->qp_is_special == HERMON_QP_SMI) {
		HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
		    *hpoint, *hcount);
	}
	/*
	 * Now fill in the ICRC Data Segment.  This data segment is inlined
	 * just like the packet headers above, but it is only four bytes and
	 * set to zero (to indicate that we wish the hardware to generate
	 * ICRC).
	 */
	HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
	num_ds++;

	/*
	 * Return the size of descriptor (in 16-byte chunks)
	 * For Hermon, we want them (for now) to be on stride size
	 * boundaries, which was implicit in Tavor/Arbel
	 */
	tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
	*size = tmpsize >> 0x04;

	return (DDI_SUCCESS);
}

/*
 * hermon_wqe_recv_build()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
static int
hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc)
{
	hermon_hw_wqe_sgl_t	*ds;
	int			i, num_ds;

	ASSERT(MUTEX_HELD(&qp->qp_lock));

	/*
	 * Fill in the Data Segments (SGL) for the Recv WQE.  We don't
	 * need to reserve space for a ctrl segment - there is none on
	 * the recv queue for hermon - but we will need to put an invalid
	 * (null) scatter pointer, per the PRM
	 */
	ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > qp->qp_rq_sgl) {
		return (IBT_QP_SGL_LEN_INVALID);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Hermon
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/* put the null sgl pointer as well if needed */
	if (num_ds < qp->qp_rq_sgl) {
		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
	}

	return (DDI_SUCCESS);
}

/*
 * hermon_wqe_srq_build()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
static int
hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint64_t *desc)
{
	hermon_hw_wqe_sgl_t	*ds;
	int			i, num_ds;

	ASSERT(MUTEX_HELD(&srq->srq_lock));

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
	    sizeof (hermon_hw_srq_wqe_next_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > srq->srq_wq_sgl) {
		return (IBT_QP_SGL_LEN_INVALID);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Hermon
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/*
	 * put in the null sgl pointer as well, if needed
	 */
	if (num_ds < srq->srq_wq_sgl) {
		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
	}

	return (DDI_SUCCESS);
}
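
/*
 * Note that the receive and SRQ builders above differ only in their
 * starting offset: an ordinary receive WQE has no leading control
 * segment, while each SRQ WQE begins with a "next" segment used for
 * the SRQ free list.
 */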
/*
 * hermon_wr_get_immediate()
 *    Context: Can be called from interrupt or base context.
 */
static uint32_t
hermon_wr_get_immediate(ibt_send_wr_t *wr)
{
	/*
	 * This routine extracts the "immediate data" from the appropriate
	 * location in the IBTF work request.  Because of the way the
	 * work request structure is defined, the location for this data
	 * depends on the actual work request operation type.
	 */

	/* For RDMA Write, test if RC or UC */
	if (wr->wr_opcode == IBT_WRC_RDMAW) {
		if (wr->wr_trans == IBT_RC_SRV) {
			return (wr->wr.rc.rcwr.rdma.rdma_immed);
		} else {	/* IBT_UC_SRV */
			return (wr->wr.uc.ucwr.rdma.rdma_immed);
		}
	}

	/* For Send, test if RC, UD, or UC */
	if (wr->wr_opcode == IBT_WRC_SEND) {
		if (wr->wr_trans == IBT_RC_SRV) {
			return (wr->wr.rc.rcwr.send_immed);
		} else if (wr->wr_trans == IBT_UD_SRV) {
			return (wr->wr.ud.udwr_immed);
		} else {	/* IBT_UC_SRV */
			return (wr->wr.uc.ucwr.send_immed);
		}
	}

	/*
	 * If any other type of request, then immediate is undefined
	 */
	return (0);
}

/*
 * hermon_wqe_headroom()
 *	Context: can be called from interrupt or base, currently only from
 *	base context.
 *	Routine that fills in the headroom for the Send Queue
 */
static void
hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
{
	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize;
	int		hdrmwqes, wqesizebytes, sectperwqe;
	uint32_t	invalue;
	int		i, j;

	qsize = qp->qp_sq_bufsz;
	wqesizebytes = 1 << qp->qp_sq_log_wqesz;
	sectperwqe = wqesizebytes >> 6;		/* 64 bytes/section */
	hdrmwqes = qp->qp_sq_hdrmwqes;
	wqe_base = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
	wqe_top = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
	wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);

	for (i = 0; i < hdrmwqes; i++) {
		for (j = 0; j < sectperwqe; j++) {
			if (j == 0) {		/* 1st section of wqe */
				/* preserve ownership bit */
				invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
				    wqe_start) | 0x7FFFFFFF;
			} else {
				/* or just invalidate it */
				invalue = 0xFFFFFFFF;
			}
			ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
			wqe_start += 16;	/* move 64 bytes */
		}
		if (wqe_start == wqe_top)	/* hit the end of the queue */
			wqe_start = wqe_base;	/* wrap to start */
	}
}
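
/*
 * The headroom stamping above writes 0xFFFFFFFF into every 64-byte
 * section of the WQEs ahead of the tail (preserving only the hardware
 * ownership bit in the first section of each WQE).  This keeps the
 * hardware, as it reads ahead, from treating stale queue memory beyond
 * the posted WQEs as valid descriptors.
 */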
/*
 * hermon_wr_bind_check()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
static int
hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
{
	ibt_bind_flags_t	bind_flags;
	uint64_t		vaddr, len;
	uint64_t		reg_start_addr, reg_end_addr;
	hermon_mwhdl_t		mw;
	hermon_mrhdl_t		mr;
	hermon_rsrc_t		*mpt;
	uint32_t		new_rkey;

	/* Check for a valid Memory Window handle in the WR */
	mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
	if (mw == NULL) {
		return (IBT_MW_HDL_INVALID);
	}

	/* Check for a valid Memory Region handle in the WR */
	mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
	if (mr == NULL) {
		return (IBT_MR_HDL_INVALID);
	}

	mutex_enter(&mr->mr_lock);
	mutex_enter(&mw->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		mutex_exit(&mw->mr_lock);
		return (IBT_MR_HDL_INVALID);
	}

	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
		mutex_exit(&mr->mr_lock);
		mutex_exit(&mw->mr_lock);
		return (IBT_MR_RKEY_INVALID);
	}

	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
		mutex_exit(&mr->mr_lock);
		mutex_exit(&mw->mr_lock);
		return (IBT_MR_LKEY_INVALID);
	}

	/*
	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
	 */
	len = wr->wr.rc.rcwr.bind->bind_len;
	if (len != 0) {
		vaddr = wr->wr.rc.rcwr.bind->bind_va;
		reg_start_addr = mr->mr_bindinfo.bi_addr;
		reg_end_addr = mr->mr_bindinfo.bi_addr +
		    (mr->mr_bindinfo.bi_len - 1);
		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
			mutex_exit(&mr->mr_lock);
			mutex_exit(&mw->mr_lock);
			return (IBT_MR_VA_INVALID);
		}
		vaddr = (vaddr + len) - 1;
		if (vaddr > reg_end_addr) {
			mutex_exit(&mr->mr_lock);
			mutex_exit(&mw->mr_lock);
			return (IBT_MR_LEN_INVALID);
		}
	}

	/*
	 * Validate the bind access flags.  Remote Write and Atomic access for
	 * the Memory Window require that Local Write access be set in the
	 * corresponding Memory Region.
	 */
	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
	if (((bind_flags & IBT_WR_BIND_WRITE) ||
	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
		mutex_exit(&mr->mr_lock);
		mutex_exit(&mw->mr_lock);
		return (IBT_MR_ACCESS_REQ_INVALID);
	}

	/* Calculate the new RKey for the Memory Window */
	mpt = mw->mr_mptrsrcp;
	new_rkey = hermon_mr_keycalc(mpt->hr_indx);
	new_rkey = hermon_mr_key_swap(new_rkey);

	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
	mw->mr_rkey = new_rkey;

	mutex_exit(&mr->mr_lock);
	mutex_exit(&mw->mr_lock);
	return (DDI_SUCCESS);
}

/*
 * hermon_wrid_from_reset_handling()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
	hermon_workq_hdr_t	*swq, *rwq;

	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
		return (DDI_SUCCESS);

#ifdef __lock_lint
	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#else
	/* grab the cq lock(s) to modify the wqavl tree */
	if (qp->qp_rq_cqhdl)
		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#endif

	/* Chain the newly allocated work queue header to the CQ's list */
	if (qp->qp_sq_cqhdl)
		hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);

	swq = qp->qp_sq_wqhdr;
	swq->wq_head = 0;
	swq->wq_tail = 0;
	swq->wq_full = 0;

	/*
	 * Now we repeat all the above operations for the receive work queue,
	 * or shared receive work queue.
	 *
	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
	 */

#ifdef __lock_lint
	mutex_enter(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_enter(&qp->qp_srqhdl->srq_lock);
	} else {
		rwq = qp->qp_rq_wqhdr;
		rwq->wq_head = 0;
		rwq->wq_tail = 0;
		rwq->wq_full = 0;
		qp->qp_rq_wqecntr = 0;
	}
#endif
	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);

#ifdef __lock_lint
	mutex_exit(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_srqhdl->srq_lock);
	}
#endif

#ifdef __lock_lint
	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#else
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl)
		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#endif
	return (DDI_SUCCESS);
}
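
/*
 * Each QP (or SRQ) hangs one hermon_workq_avl_t per direction on the
 * AVL tree of the CQ it completes into; hermon_wrid_get_entry() below
 * uses that tree to map a completion's (QPN, send/recv) pair back to
 * the work queue header holding the posted WRIDs.
 */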
/*
 * hermon_wrid_to_reset_handling()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
		return (DDI_SUCCESS);

	/*
	 * If there are unpolled entries in these CQs, they are
	 * polled/flushed.
	 * Grab the CQ lock(s) before manipulating the lists.
	 */
#ifdef __lock_lint
	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#else
	/* grab the cq lock(s) to modify the wqavl tree */
	if (qp->qp_rq_cqhdl)
		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#endif

#ifdef __lock_lint
	mutex_enter(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_enter(&qp->qp_srqhdl->srq_lock);
	}
#endif

	/*
	 * Flush the entries on the CQ for this QP's QPN.
	 */
	hermon_cq_entries_flush(state, qp);

#ifdef __lock_lint
	mutex_exit(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_srqhdl->srq_lock);
	}
#endif

	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
	if (qp->qp_sq_cqhdl != NULL)
		hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);

#ifdef __lock_lint
	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#else
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl)
		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#endif
	return (IBT_SUCCESS);
}

/*
 * hermon_wrid_get_entry()
 *    Context: Can be called from interrupt or base context.
 */
uint64_t
hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
{
	hermon_workq_avl_t	*wqa;
	hermon_workq_hdr_t	*wq;
	uint64_t		wrid;
	uint_t			send_or_recv, qpnum;
	uint32_t		indx;

	/*
	 * Determine whether this CQE is a send or receive completion.
	 */
	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);

	/* Find the work queue for this QP number (send or receive side) */
	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
	wq = wqa->wqa_wq;

	/*
	 * Regardless of whether the completion is the result of a "success"
	 * or a "failure", we lock the list of "containers" and attempt to
	 * search for the first matching completion (i.e. the first WR
	 * with a matching WQE addr and size).  Once we find it, we pull out
	 * the "wrid" field and return it (see below).  XXX Note: One possible
	 * future enhancement would be to enable this routine to skip over
	 * any "unsignaled" completions to go directly to the next "signaled"
	 * entry on success.
	 */
	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
	wrid = wq->wq_wrid[indx];
	if (wqa->wqa_srq_en) {
		struct hermon_sw_srq_s	*srq;
		uint64_t		*desc;

		/* put wqe back on the srq free list */
		srq = wqa->wqa_srq;
		mutex_enter(&srq->srq_lock);
		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
		((uint16_t *)desc)[1] = htons(indx);
		wq->wq_tail = indx;
		mutex_exit(&srq->srq_lock);
	} else {
		wq->wq_head = (indx + 1) & wq->wq_mask;
		wq->wq_full = 0;
	}

	return (wrid);
}

int
hermon_wrid_workq_compare(const void *p1, const void *p2)
{
	hermon_workq_compare_t	*cmpp;
	hermon_workq_avl_t	*curr;

	cmpp = (hermon_workq_compare_t *)p1;
	curr = (hermon_workq_avl_t *)p2;

	if (cmpp->cmp_qpn < curr->wqa_qpn)
		return (-1);
	else if (cmpp->cmp_qpn > curr->wqa_qpn)
		return (+1);
	else if (cmpp->cmp_type < curr->wqa_type)
		return (-1);
	else if (cmpp->cmp_type > curr->wqa_type)
		return (+1);
	else
		return (0);
}
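/*
 * Illustrative sketch (values hypothetical): the WRID lookup in
 * hermon_wrid_get_entry() above depends on the work queue depth being a
 * power of two, so that masking the WQE counter pulled from the CQE
 * reduces it modulo the queue size:
 *
 *	wq->wq_size = 256;			// power of two
 *	wq->wq_mask = 255;			// wq_size - 1
 *	indx = 0x1234 & wq->wq_mask;		// 0x34 == 0x1234 % 256
 *	wrid = wq->wq_wrid[indx];
 *
 * The compare routine above then gives each CQ's AVL tree a total order
 * on (QPN, work queue type) pairs, QPN major and type minor, so the send
 * and receive wqavl entries for the same QP are distinct nodes.
 */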
/*
 * hermon_wrid_wqavl_find()
 *    Context: Can be called from interrupt or base context.
 */
static hermon_workq_avl_t *
hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
{
	hermon_workq_avl_t	*curr;
	hermon_workq_compare_t	cmp;

	/*
	 * Walk the CQ's work queue list, trying to find a send or recv queue
	 * with the same QP number.  We do this even if we are going to later
	 * create a new entry because it helps us easily find the end of the
	 * list.
	 */
	cmp.cmp_qpn = qpn;
	cmp.cmp_type = wq_type;
#ifdef __lock_lint
	hermon_wrid_workq_compare(NULL, NULL);
#endif
	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);

	return (curr);
}

/*
 * hermon_wrid_wqhdr_create()
 *    Context: Can be called from base context.
 */
/* ARGSUSED */
hermon_workq_hdr_t *
hermon_wrid_wqhdr_create(int bufsz)
{
	hermon_workq_hdr_t	*wqhdr;

	/*
	 * Allocate space for the wqhdr, and an array to record all the wrids.
	 */
	wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
	if (wqhdr == NULL) {
		return (NULL);
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
	wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
	if (wqhdr->wq_wrid == NULL) {
		kmem_free(wqhdr, sizeof (*wqhdr));
		return (NULL);
	}
	wqhdr->wq_size = bufsz;
	wqhdr->wq_mask = bufsz - 1;

	return (wqhdr);
}

void
hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
{
	kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
	kmem_free(wqhdr, sizeof (*wqhdr));
}

/*
 * hermon_cq_workq_add()
 *    Context: Can be called from interrupt or base context.
 */
static void
hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
{
	hermon_workq_compare_t	cmp;
	avl_index_t		where;

	cmp.cmp_qpn = wqavl->wqa_qpn;
	cmp.cmp_type = wqavl->wqa_type;
#ifdef __lock_lint
	hermon_wrid_workq_compare(NULL, NULL);
#endif
	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
}

/*
 * hermon_cq_workq_remove()
 *    Context: Can be called from interrupt or base context.
 */
static void
hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
{
#ifdef __lock_lint
	hermon_wrid_workq_compare(NULL, NULL);
#endif
	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
}
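/*
 * Usage note (a sketch, not an actual call site): since wq_mask is
 * derived as bufsz - 1, hermon_wrid_wqhdr_create() assumes "bufsz" is a
 * power of two, and because both allocations use KM_NOSLEEP it can
 * return NULL.  A caller would check for failure and later pair it with
 * the destroy routine:
 *
 *	hermon_workq_hdr_t *wqhdr;
 *
 *	wqhdr = hermon_wrid_wqhdr_create(256);	// 256 is a power of two
 *	if (wqhdr == NULL)
 *		return (IBT_INSUFF_RESOURCE);	// allocation failed
 *	...
 *	hermon_wrid_wqhdr_destroy(wqhdr);
 */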