1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/kmem.h>
28 #include <sys/conf.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/ksynch.h>
32 
33 #include <sys/ib/clients/eoib/eib_impl.h>
34 
35 /*
36  * Declarations private to this file
37  */
38 static int eib_rsrc_setup_txbufs(eib_t *, int *);
39 static int eib_rsrc_setup_rxbufs(eib_t *, int *);
40 static int eib_rsrc_setup_lsobufs(eib_t *, int *);
41 static void eib_rsrc_init_wqe_pool(eib_t *, eib_wqe_pool_t **,
42     ib_memlen_t, int);
43 static void eib_rsrc_fini_wqe_pool(eib_t *, eib_wqe_pool_t **);
44 static boolean_t eib_rsrc_ok_to_free_pool(eib_t *, eib_wqe_pool_t *, boolean_t);
45 static int eib_rsrc_grab_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, uint_t,
46     uint_t *, int);
47 static void eib_rsrc_return_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **,
48     uint_t);
49 
50 static void eib_rb_rsrc_setup_txbufs(eib_t *, boolean_t);
51 static void eib_rb_rsrc_setup_rxbufs(eib_t *, boolean_t);
52 static void eib_rb_rsrc_setup_lsobufs(eib_t *, boolean_t);
53 
54 /*
55  * Definitions private to this file
56  */
57 static uint_t eib_lso_num_bufs = EIB_LSO_NUM_BUFS;	/* tunable? */
58 
59 int
eib_rsrc_setup_bufs(eib_t * ss,int * err)60 eib_rsrc_setup_bufs(eib_t *ss, int *err)
61 {
62 	if (eib_rsrc_setup_txbufs(ss, err) != EIB_E_SUCCESS)
63 		return (EIB_E_FAILURE);
64 
65 	if (ss->ei_caps->cp_lso_maxlen && ss->ei_caps->cp_cksum_flags &&
66 	    ss->ei_caps->cp_resv_lkey_capab) {
67 		if (eib_rsrc_setup_lsobufs(ss, err) != EIB_E_SUCCESS) {
68 			eib_rb_rsrc_setup_txbufs(ss, B_FALSE);
69 			return (EIB_E_FAILURE);
70 		}
71 	}
72 
73 	if (eib_rsrc_setup_rxbufs(ss, err) != EIB_E_SUCCESS) {
74 		eib_rb_rsrc_setup_lsobufs(ss, B_FALSE);
75 		eib_rb_rsrc_setup_txbufs(ss, B_FALSE);
76 		return (EIB_E_FAILURE);
77 	}
78 
79 	return (EIB_E_SUCCESS);
80 }
81 
82 int
eib_rsrc_grab_swqes(eib_t * ss,eib_wqe_t ** wqes,uint_t n_req,uint_t * actual,int pri)83 eib_rsrc_grab_swqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual,
84     int pri)
85 {
86 	eib_wqe_t *wqe;
87 	uint32_t *encap_hdr;
88 	int ret;
89 	int i;
90 
91 	ASSERT(ss->ei_tx != NULL);
92 
93 	ret = eib_rsrc_grab_wqes(ss, ss->ei_tx, wqes, n_req, actual, pri);
94 	if (ret != EIB_E_SUCCESS)
95 		return (EIB_E_FAILURE);
96 
97 	/*
98 	 * See note for eib_rsrc_grab_swqe()
99 	 */
100 	for (i = 0; i < (*actual); i++) {
101 		wqe = wqes[i];
102 		wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS;
103 		wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest;
104 		wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND;
105 		wqe->qe_wr.send.wr_nds = 1;
106 		wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl;
107 		wqe->qe_nxt_post = NULL;
108 		wqe->qe_iov_hdl = NULL;
109 
110 		encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr;
111 		*encap_hdr = htonl(EIB_TX_ENCAP_HDR);
112 	}
113 
114 	return (EIB_E_SUCCESS);
115 }
116 
117 int
eib_rsrc_grab_rwqes(eib_t * ss,eib_wqe_t ** wqes,uint_t n_req,uint_t * actual,int pri)118 eib_rsrc_grab_rwqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual,
119     int pri)
120 {
121 	ASSERT(ss->ei_rx != NULL);
122 
123 	return (eib_rsrc_grab_wqes(ss, ss->ei_rx, wqes, n_req, actual, pri));
124 }
125 
126 int
eib_rsrc_grab_lsobufs(eib_t * ss,uint_t req_sz,ibt_wr_ds_t * sgl,uint32_t * nds)127 eib_rsrc_grab_lsobufs(eib_t *ss, uint_t req_sz, ibt_wr_ds_t *sgl, uint32_t *nds)
128 {
129 	eib_lsobkt_t *bkt = ss->ei_lso;
130 	eib_lsobuf_t *elem;
131 	eib_lsobuf_t *nxt;
132 	uint_t frag_sz;
133 	uint_t num_needed;
134 	int i;
135 
136 	ASSERT(req_sz != 0);
137 	ASSERT(sgl != NULL);
138 	ASSERT(nds != NULL);
139 
140 	/*
141 	 * Determine how many bufs we'd need for the size requested
142 	 */
143 	num_needed = req_sz / EIB_LSO_BUFSZ;
144 	if ((frag_sz = req_sz % EIB_LSO_BUFSZ) != 0)
145 		num_needed++;
146 
147 	if (bkt == NULL)
148 		return (EIB_E_FAILURE);
149 
150 	/*
151 	 * If we don't have enough lso bufs, return failure
152 	 */
153 	mutex_enter(&bkt->bk_lock);
154 	if (bkt->bk_nfree < num_needed) {
155 		mutex_exit(&bkt->bk_lock);
156 		return (EIB_E_FAILURE);
157 	}
158 
159 	/*
160 	 * Pick the first "num_needed" bufs from the free list
161 	 */
162 	elem = bkt->bk_free_head;
163 	for (i = 0; i < num_needed; i++) {
164 		ASSERT(elem->lb_isfree != 0);
165 		ASSERT(elem->lb_buf != NULL);
166 
167 		nxt = elem->lb_next;
168 
169 		sgl[i].ds_va = (ib_vaddr_t)(uintptr_t)elem->lb_buf;
170 		sgl[i].ds_key = bkt->bk_lkey;
171 		sgl[i].ds_len = EIB_LSO_BUFSZ;
172 
173 		elem->lb_isfree = 0;
174 		elem->lb_next = NULL;
175 
176 		elem = nxt;
177 	}
178 	bkt->bk_free_head = elem;
179 
180 	/*
181 	 * If the requested size is not a multiple of EIB_LSO_BUFSZ, we need
182 	 * to adjust the last sgl entry's length. Since we know we need atleast
183 	 * one, the i-1 use below is ok.
184 	 */
185 	if (frag_sz) {
186 		sgl[i-1].ds_len = frag_sz;
187 	}
188 
189 	/*
190 	 * Update nfree count and return
191 	 */
192 	bkt->bk_nfree -= num_needed;
193 
194 	mutex_exit(&bkt->bk_lock);
195 
196 	*nds = num_needed;
197 
198 	return (EIB_E_SUCCESS);
199 }
200 
201 eib_wqe_t *
eib_rsrc_grab_swqe(eib_t * ss,int pri)202 eib_rsrc_grab_swqe(eib_t *ss, int pri)
203 {
204 	eib_wqe_t *wqe = NULL;
205 	uint32_t *encap_hdr;
206 
207 	ASSERT(ss->ei_tx != NULL);
208 	(void) eib_rsrc_grab_wqes(ss, ss->ei_tx, &wqe, 1, NULL, pri);
209 
210 	/*
211 	 * Let's reset the swqe basic wr parameters to default. We need
212 	 * to do this because this swqe could've previously been used
213 	 * for a checksum offload (when the flags would've been set)
214 	 * or for an LSO send (in which case the opcode would've been set
215 	 * to a different value), or been iov mapped (in which case the
216 	 * sgl/nds could've been set to different values).  We'll make
217 	 * it easy and initialize it here, so simple transactions can
218 	 * go through without any special effort by the caller.
219 	 *
220 	 * Note that even though the wqe structure is common for both
221 	 * send and recv, they're in two independent pools and the wqe
222 	 * type remains the same throughout its lifetime. So we don't
223 	 * have to worry about resetting any other field.
224 	 */
225 	if (wqe) {
226 		wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS;
227 		wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest;
228 		wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND;
229 		wqe->qe_wr.send.wr_nds = 1;
230 		wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl;
231 		wqe->qe_nxt_post = NULL;
232 		wqe->qe_iov_hdl = NULL;
233 
234 		encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr;
235 		*encap_hdr = htonl(EIB_TX_ENCAP_HDR);
236 	}
237 
238 	return (wqe);
239 }
240 
241 eib_wqe_t *
eib_rsrc_grab_rwqe(eib_t * ss,int pri)242 eib_rsrc_grab_rwqe(eib_t *ss, int pri)
243 {
244 	eib_wqe_t *wqe = NULL;
245 
246 	ASSERT(ss->ei_rx != NULL);
247 	(void) eib_rsrc_grab_wqes(ss, ss->ei_rx, &wqe, 1, NULL, pri);
248 
249 	return (wqe);
250 }
251 
252 void
eib_rsrc_return_swqe(eib_t * ss,eib_wqe_t * wqe,eib_chan_t * chan)253 eib_rsrc_return_swqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan)
254 {
255 	ASSERT(ss->ei_tx != NULL);
256 
257 	eib_rsrc_return_wqes(ss, ss->ei_tx, &wqe, 1);
258 	if (chan) {
259 		eib_rsrc_decr_posted_swqe(ss, chan);
260 	}
261 }
262 
263 
264 void
eib_rsrc_return_rwqe(eib_t * ss,eib_wqe_t * wqe,eib_chan_t * chan)265 eib_rsrc_return_rwqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan)
266 {
267 	ASSERT(ss->ei_rx != NULL);
268 
269 	eib_rsrc_return_wqes(ss, ss->ei_rx, &wqe, 1);
270 	if (chan) {
271 		eib_rsrc_decr_posted_rwqe(ss, chan);
272 	}
273 }
274 
275 void
eib_rsrc_return_lsobufs(eib_t * ss,ibt_wr_ds_t * sgl_p,uint32_t nds)276 eib_rsrc_return_lsobufs(eib_t *ss, ibt_wr_ds_t *sgl_p, uint32_t nds)
277 {
278 	eib_lsobkt_t *bkt = ss->ei_lso;
279 	eib_lsobuf_t *elem;
280 	uint8_t *va;
281 	ptrdiff_t ndx;
282 	int i;
283 
284 	/*
285 	 * Nowhere to return the buffers to ??
286 	 */
287 	if (bkt == NULL)
288 		return;
289 
290 	mutex_enter(&bkt->bk_lock);
291 
292 	for (i = 0; i < nds; i++) {
293 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
294 
295 		ASSERT(va >= bkt->bk_mem);
296 		ASSERT(va < (bkt->bk_mem + bkt->bk_nelem * EIB_LSO_BUFSZ));
297 
298 		/*
299 		 * Figure out the buflist element this sgl buffer corresponds
300 		 * to and put it back at the head
301 		 */
302 		ndx = ((uintptr_t)va - (uintptr_t)bkt->bk_mem) / EIB_LSO_BUFSZ;
303 		elem = bkt->bk_bufl + ndx;
304 
305 		ASSERT(elem->lb_isfree == 0);
306 		ASSERT(elem->lb_buf == va);
307 
308 		elem->lb_isfree = 1;
309 		elem->lb_next = bkt->bk_free_head;
310 		bkt->bk_free_head = elem;
311 	}
312 	bkt->bk_nfree += nds;
313 
314 	/*
315 	 * If the number of available lso buffers just crossed the
316 	 * threshold, wakeup anyone who may be sleeping on the event.
317 	 */
318 	if (((bkt->bk_nfree - nds) < EIB_LSO_FREE_BUFS_THRESH) &&
319 	    (bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH)) {
320 		cv_broadcast(&bkt->bk_cv);
321 	}
322 
323 	mutex_exit(&bkt->bk_lock);
324 }
325 
326 /*ARGSUSED*/
327 void
eib_rsrc_decr_posted_swqe(eib_t * ss,eib_chan_t * chan)328 eib_rsrc_decr_posted_swqe(eib_t *ss, eib_chan_t *chan)
329 {
330 	ASSERT(chan != NULL);
331 
332 	mutex_enter(&chan->ch_tx_lock);
333 
334 	chan->ch_tx_posted--;
335 	if ((chan->ch_tear_down) && (chan->ch_tx_posted == 0)) {
336 		cv_signal(&chan->ch_tx_cv);
337 	}
338 
339 	mutex_exit(&chan->ch_tx_lock);
340 }
341 
342 void
eib_rsrc_decr_posted_rwqe(eib_t * ss,eib_chan_t * chan)343 eib_rsrc_decr_posted_rwqe(eib_t *ss, eib_chan_t *chan)
344 {
345 	eib_chan_t *tail;
346 	boolean_t queue_for_refill = B_FALSE;
347 
348 	ASSERT(chan != NULL);
349 
350 	/*
351 	 * Decrement the ch_rx_posted count. If we are tearing this channel
352 	 * down, signal the waiter when the count reaches 0.  If we aren't
353 	 * tearing the channel down, see if the count has gone below the low
354 	 * water mark.  If it has, and if this channel isn't already being
355 	 * refilled, queue the channel up with the service thread for a
356 	 * rwqe refill.
357 	 */
358 	mutex_enter(&chan->ch_rx_lock);
359 	chan->ch_rx_posted--;
360 	if (chan->ch_tear_down) {
361 		if (chan->ch_rx_posted == 0)
362 			cv_signal(&chan->ch_rx_cv);
363 	} else if (chan->ch_rx_posted < chan->ch_lwm_rwqes) {
364 		if (chan->ch_rx_refilling == B_FALSE) {
365 			chan->ch_rx_refilling = B_TRUE;
366 			queue_for_refill = B_TRUE;
367 		}
368 	}
369 	mutex_exit(&chan->ch_rx_lock);
370 
371 	if (queue_for_refill) {
372 		mutex_enter(&ss->ei_rxpost_lock);
373 
374 		chan->ch_rxpost_next = NULL;
375 		for (tail = ss->ei_rxpost; tail; tail = tail->ch_rxpost_next) {
376 			if (tail->ch_rxpost_next == NULL)
377 				break;
378 		}
379 		if (tail) {
380 			tail->ch_rxpost_next = chan;
381 		} else {
382 			ss->ei_rxpost = chan;
383 		}
384 
385 		cv_signal(&ss->ei_rxpost_cv);
386 		mutex_exit(&ss->ei_rxpost_lock);
387 	}
388 }
389 
390 void
eib_rsrc_txwqes_needed(eib_t * ss)391 eib_rsrc_txwqes_needed(eib_t *ss)
392 {
393 	eib_wqe_pool_t *wp = ss->ei_tx;
394 
395 	EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf);
396 
397 	mutex_enter(&wp->wp_lock);
398 	if ((wp->wp_status & EIB_TXWQE_SHORT) == 0) {
399 		wp->wp_status |= EIB_TXWQE_SHORT;
400 		cv_broadcast(&wp->wp_cv);
401 	}
402 	mutex_exit(&wp->wp_lock);
403 }
404 
405 void
eib_rsrc_lsobufs_needed(eib_t * ss)406 eib_rsrc_lsobufs_needed(eib_t *ss)
407 {
408 	eib_lsobkt_t *bkt = ss->ei_lso;
409 
410 	EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf);
411 
412 	if (bkt == NULL) {
413 		EIB_DPRINTF_WARN(ss->ei_instance,
414 		    "eib_rsrc_lsobufs_needed: "
415 		    "lso bufs seem to be needed even though "
416 		    "LSO support was not advertised");
417 		return;
418 	}
419 
420 	mutex_enter(&bkt->bk_lock);
421 	if ((bkt->bk_status & EIB_LBUF_SHORT) == 0) {
422 		bkt->bk_status |= EIB_LBUF_SHORT;
423 		cv_broadcast(&bkt->bk_cv);
424 	}
425 	mutex_exit(&bkt->bk_lock);
426 }
427 
428 boolean_t
eib_rsrc_rxpool_low(eib_wqe_t * wqe)429 eib_rsrc_rxpool_low(eib_wqe_t *wqe)
430 {
431 	eib_wqe_pool_t *wp = wqe->qe_pool;
432 	boolean_t ret = B_FALSE;
433 
434 	/*
435 	 * Set the EIB_RXWQE_SHORT flag when the number of free wqes
436 	 * in the rx pool falls below the low threshold for rwqes and
437 	 * clear it only when the number of free wqes gets back above
438 	 * the high water mark.
439 	 */
440 	mutex_enter(&wp->wp_lock);
441 
442 	if (wp->wp_nfree <= EIB_NFREE_RWQES_LOW) {
443 		wp->wp_status |= (EIB_RXWQE_SHORT);
444 	} else if (wp->wp_nfree >= EIB_NFREE_RWQES_HWM) {
445 		wp->wp_status &= (~EIB_RXWQE_SHORT);
446 	}
447 
448 	if ((wp->wp_status & EIB_RXWQE_SHORT) == EIB_RXWQE_SHORT)
449 		ret = B_TRUE;
450 
451 	mutex_exit(&wp->wp_lock);
452 
453 	return (ret);
454 }
455 
456 void
eib_rb_rsrc_setup_bufs(eib_t * ss,boolean_t force)457 eib_rb_rsrc_setup_bufs(eib_t *ss, boolean_t force)
458 {
459 	eib_rb_rsrc_setup_rxbufs(ss, force);
460 	eib_rb_rsrc_setup_lsobufs(ss, force);
461 	eib_rb_rsrc_setup_txbufs(ss, force);
462 }
463 
464 static int
eib_rsrc_setup_txbufs(eib_t * ss,int * err)465 eib_rsrc_setup_txbufs(eib_t *ss, int *err)
466 {
467 	eib_wqe_pool_t *tx;
468 	eib_wqe_t *wqe;
469 	ibt_ud_dest_hdl_t dest;
470 	ibt_mr_attr_t attr;
471 	ibt_mr_desc_t desc;
472 	ibt_status_t ret;
473 	kthread_t *kt;
474 	uint32_t *encap_hdr;
475 	uint8_t	*buf;
476 	uint_t mtu = ss->ei_props->ep_mtu;
477 	uint_t tx_bufsz;
478 	uint_t blk;
479 	uint_t ndx;
480 	uint_t i;
481 	int lso_enabled;
482 
483 	/*
484 	 * Try to allocate and initialize the tx wqe pool
485 	 */
486 	if (ss->ei_tx != NULL)
487 		return (EIB_E_SUCCESS);
488 
489 	/*
490 	 * If we keep the tx buffers as mtu-sized, then potentially every
491 	 * LSO request that cannot be satisfactorily mapped, will use up
492 	 * the 8K large (default size) lso buffers. This may be inadvisable
493 	 * given that lso buffers are a scarce resource.  Instead, we'll
494 	 * slightly raise the size of the copy buffers in the send wqes
495 	 * (say to EIB_TX_COPY_THRESH) so that requests that cannot be
496 	 * mapped could still avoid using the 8K LSO buffers if they're
497 	 * less than the copy threshold size.
498 	 */
499 	lso_enabled = ss->ei_caps->cp_lso_maxlen &&
500 	    ss->ei_caps->cp_cksum_flags && ss->ei_caps->cp_resv_lkey_capab;
501 	tx_bufsz = ((lso_enabled) && (EIB_TX_COPY_THRESH > mtu)) ?
502 	    EIB_TX_COPY_THRESH : mtu;
503 
504 	eib_rsrc_init_wqe_pool(ss, &ss->ei_tx, tx_bufsz, EIB_WP_TYPE_TX);
505 	tx = ss->ei_tx;
506 
507 	/*
508 	 * Register the TX memory region with IBTF for use
509 	 */
510 	attr.mr_vaddr = tx->wp_vaddr;
511 	attr.mr_len = tx->wp_memsz;
512 	attr.mr_as = NULL;
513 	attr.mr_flags = IBT_MR_SLEEP;
514 
515 	ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr,
516 	    &tx->wp_mr, &desc);
517 	if (ret != IBT_SUCCESS) {
518 		EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_txbufs: "
519 		    "ibt_register_mr() failed for tx "
520 		    "region (0x%llx, 0x%llx) with ret=%d",
521 		    attr.mr_vaddr, attr.mr_len, ret);
522 
523 		*err = EINVAL;
524 		goto rsrc_setup_txbufs_fail;
525 	}
526 	tx->wp_lkey = desc.md_lkey;
527 
528 	/*
529 	 * Now setup the tx wqes
530 	 */
531 	buf = (uint8_t *)(uintptr_t)(tx->wp_vaddr);
532 	for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) {
533 		for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) {
534 			wqe = &tx->wp_wqe[i];
535 			/*
536 			 * Allocate a UD destination handle
537 			 */
538 			ret = ibt_alloc_ud_dest(ss->ei_hca_hdl,
539 			    IBT_UD_DEST_NO_FLAGS, ss->ei_pd_hdl, &dest);
540 			if (ret != IBT_SUCCESS) {
541 				EIB_DPRINTF_ERR(ss->ei_instance,
542 				    "eib_rsrc_setup_txbufs: "
543 				    "ibt_alloc_ud_dest(hca_hdl=0x%llx) "
544 				    "failed, ret=%d", ss->ei_hca_hdl, ret);
545 
546 				*err = ENOMEM;
547 				goto rsrc_setup_txbufs_fail;
548 			}
549 
550 			/*
551 			 * These parameters should remain fixed throughout the
552 			 * lifetime of this wqe.
553 			 */
554 			wqe->qe_pool = tx;
555 			wqe->qe_cpbuf = buf;
556 			wqe->qe_bufsz = tx_bufsz;
557 
558 			/*
559 			 * The qe_dest and qe_payload_hdr are specific to tx
560 			 * only, but remain unchanged throughout the lifetime
561 			 * of the wqe.
562 			 *
563 			 * The payload header is normally used when we have an
564 			 * LSO packet to send.  Since the EoIB encapsulation
565 			 * header won't be part of the message we get from the
566 			 * network layer, we'll need to copy the lso header into
567 			 * a new buffer every time before we hand over the LSO
568 			 * send request to the hca driver.
569 			 */
570 			wqe->qe_dest = dest;
571 			wqe->qe_payload_hdr =
572 			    kmem_zalloc(EIB_MAX_PAYLOAD_HDR_SZ, KM_SLEEP);
573 
574 			/*
575 			 * The encapsulation header is at the start of the
576 			 * payload header and is initialized to the default
577 			 * encapsulation header we use (no multiple segments,
578 			 * no FCS). This part of the header is not expected
579 			 * to change.
580 			 */
581 			encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr;
582 			*encap_hdr = htonl(EIB_TX_ENCAP_HDR);
583 
584 			/*
585 			 * The parameter set below are used in tx and rx paths.
586 			 * These parameters (except ds_key) are reset to these
587 			 * default values in eib_rsrc_return_wqes().
588 			 */
589 			wqe->qe_sgl.ds_key = tx->wp_lkey;
590 			wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf;
591 			wqe->qe_sgl.ds_len = wqe->qe_bufsz;
592 			wqe->qe_mp = NULL;
593 			wqe->qe_info =
594 			    ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) |
595 			    ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) |
596 			    ((uint_t)EIB_WQE_TX << EIB_WQETYP_SHIFT);
597 
598 			/*
599 			 * These tx-specific parameters (except wr_id and
600 			 * wr_trans) are reset in eib_rsrc_grab_swqes() to make
601 			 * sure any freshly acquired swqe from the pool has
602 			 * these default settings for the caller.
603 			 */
604 			wqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)wqe;
605 			wqe->qe_wr.send.wr_trans = IBT_UD_SRV;
606 			wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS;
607 			wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest;
608 			wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND;
609 			wqe->qe_wr.send.wr_nds = 1;
610 			wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl;
611 			wqe->qe_nxt_post = NULL;
612 			wqe->qe_iov_hdl = NULL;
613 
614 			buf += wqe->qe_bufsz;
615 		}
616 	}
617 
618 	/*
619 	 * Before returning, create a kernel thread to monitor the status
620 	 * of wqes in the tx wqe pool.  Note that this thread cannot be
621 	 * created from eib_state_init() during attach(), since the thread
622 	 * expects the wqe pool to be allocated and ready when it starts,
623 	 * and the tx bufs initialization only happens during eib_m_start().
624 	 */
625 	kt = thread_create(NULL, 0, eib_monitor_tx_wqes, ss, 0,
626 	    &p0, TS_RUN, minclsyspri);
627 	ss->ei_txwqe_monitor = kt->t_did;
628 
629 	return (EIB_E_SUCCESS);
630 
631 rsrc_setup_txbufs_fail:
632 	eib_rb_rsrc_setup_txbufs(ss, B_FALSE);
633 	return (EIB_E_FAILURE);
634 }
635 
636 static int
eib_rsrc_setup_rxbufs(eib_t * ss,int * err)637 eib_rsrc_setup_rxbufs(eib_t *ss, int *err)
638 {
639 	eib_wqe_pool_t *rx;
640 	eib_wqe_t *wqe;
641 	ibt_mr_attr_t attr;
642 	ibt_mr_desc_t desc;
643 	ibt_status_t ret;
644 	uint8_t	*buf;
645 	uint_t mtu = ss->ei_props->ep_mtu;
646 	uint_t blk;
647 	uint_t ndx;
648 	uint_t i;
649 
650 	/*
651 	 * Try to allocate and initialize the wqe pool. When this is called
652 	 * during a plumb via the mac m_start callback, we need to make
653 	 * sure there is a need to allocate a wqe pool afresh.  If during a
654 	 * previous unplumb we didn't free the wqe pool because the nw layer
655 	 * was holding on to some rx buffers, we don't need to allocate new
656 	 * pool and set up the buffers again; we'll just start re-using the
657 	 * previous one.
658 	 */
659 	if (ss->ei_rx != NULL)
660 		return (EIB_E_SUCCESS);
661 
662 	/*
663 	 * The receive buffer has to work for all channels, specifically the
664 	 * data qp of the vnics.  This means that the buffer must be large
665 	 * enough to hold MTU sized IB payload (including the EoIB and ethernet
666 	 * headers) plus the GRH. In addition, because the ethernet header is
667 	 * either 14 or 18 bytes (tagless or vlan tagged), we should have the
668 	 * buffer filled in such a way that the IP header starts at atleast a
669 	 * 4-byte aligned address.  In order to do this, we need to have some
670 	 * additional room.
671 	 */
672 	eib_rsrc_init_wqe_pool(ss, &ss->ei_rx,
673 	    mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM, EIB_WP_TYPE_RX);
674 	rx = ss->ei_rx;
675 
676 	/*
677 	 * Register the RX memory region with IBTF for use
678 	 */
679 	attr.mr_vaddr = rx->wp_vaddr;
680 	attr.mr_len = rx->wp_memsz;
681 	attr.mr_as = NULL;
682 	attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
683 
684 	ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr,
685 	    &rx->wp_mr, &desc);
686 	if (ret != IBT_SUCCESS) {
687 		EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_rxbufs: "
688 		    "ibt_register_mr() failed for rx "
689 		    "region (0x%llx, 0x%llx) with ret=%d",
690 		    attr.mr_vaddr, attr.mr_len, ret);
691 
692 		*err = EINVAL;
693 		goto rsrc_setup_rxbufs_fail;
694 	}
695 	rx->wp_lkey = desc.md_lkey;
696 
697 	/*
698 	 * Now setup the rx wqes
699 	 */
700 	buf = (uint8_t *)(uintptr_t)(rx->wp_vaddr);
701 	for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) {
702 		for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) {
703 			wqe = &rx->wp_wqe[i];
704 
705 			/*
706 			 * These parameters should remain fixed throughout the
707 			 * lifetime of this recv wqe. The qe_frp will only be
708 			 * used by the data channel of vnics and will remain
709 			 * unused by other channels.
710 			 */
711 			wqe->qe_pool = rx;
712 			wqe->qe_cpbuf = buf;
713 			wqe->qe_bufsz = mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM;
714 			wqe->qe_wr.recv.wr_id = (ibt_wrid_t)(uintptr_t)wqe;
715 			wqe->qe_wr.recv.wr_nds = 1;
716 			wqe->qe_wr.recv.wr_sgl = &wqe->qe_sgl;
717 			wqe->qe_frp.free_func = eib_data_rx_recycle;
718 			wqe->qe_frp.free_arg = (caddr_t)wqe;
719 
720 			/*
721 			 * The parameter set below are used in tx and rx paths.
722 			 * These parameters (except ds_key) are reset to these
723 			 * default values in eib_rsrc_return_wqes().
724 			 */
725 			wqe->qe_sgl.ds_key = rx->wp_lkey;
726 			wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf;
727 			wqe->qe_sgl.ds_len = wqe->qe_bufsz;
728 			wqe->qe_mp = NULL;
729 			wqe->qe_info =
730 			    ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) |
731 			    ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) |
732 			    ((uint_t)EIB_WQE_RX << EIB_WQETYP_SHIFT);
733 
734 			/*
735 			 * These rx-specific parameters are also reset to
736 			 * these default values in eib_rsrc_return_wqes().
737 			 */
738 			wqe->qe_chan = NULL;
739 			wqe->qe_vnic_inst = -1;
740 
741 			buf += (mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM);
742 		}
743 	}
744 
745 	return (EIB_E_SUCCESS);
746 
747 rsrc_setup_rxbufs_fail:
748 	eib_rb_rsrc_setup_rxbufs(ss, B_FALSE);
749 	return (EIB_E_FAILURE);
750 }
751 
752 static int
eib_rsrc_setup_lsobufs(eib_t * ss,int * err)753 eib_rsrc_setup_lsobufs(eib_t *ss, int *err)
754 {
755 	eib_lsobkt_t *bkt;
756 	eib_lsobuf_t *elem;
757 	eib_lsobuf_t *tail;
758 	ibt_mr_attr_t attr;
759 	ibt_mr_desc_t desc;
760 	kthread_t *kt;
761 
762 	uint8_t *lsomem;
763 	uint8_t *memp;
764 	ibt_status_t ret;
765 	int i;
766 
767 	/*
768 	 * Allocate the lso bucket and space for buffers
769 	 */
770 	bkt = kmem_zalloc(sizeof (eib_lsobkt_t), KM_SLEEP);
771 	lsomem = kmem_zalloc(eib_lso_num_bufs * EIB_LSO_BUFSZ, KM_SLEEP);
772 
773 	/*
774 	 * Register lso memory and save the lkey
775 	 */
776 	attr.mr_vaddr = (uint64_t)(uintptr_t)lsomem;
777 	attr.mr_len = eib_lso_num_bufs * EIB_LSO_BUFSZ;
778 	attr.mr_as = NULL;
779 	attr.mr_flags = IBT_MR_SLEEP;
780 
781 	ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr,
782 	    &bkt->bk_mr_hdl, &desc);
783 	if (ret != IBT_SUCCESS) {
784 		*err = EINVAL;
785 		EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_lsobufs: "
786 		    "ibt_register_mr() failed for LSO "
787 		    "region (0x%llx, 0x%llx) with ret=%d",
788 		    attr.mr_vaddr, attr.mr_len, ret);
789 
790 		kmem_free(lsomem, eib_lso_num_bufs * EIB_LSO_BUFSZ);
791 		kmem_free(bkt, sizeof (eib_lsobkt_t));
792 
793 		return (EIB_E_FAILURE);
794 	}
795 	bkt->bk_lkey = desc.md_lkey;
796 
797 	/*
798 	 * Now allocate the buflist.  Note that the elements in the buflist and
799 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
800 	 * can always derive the address of a buflist entry from the address of
801 	 * an lso buffer.
802 	 */
803 	bkt->bk_bufl = kmem_zalloc(eib_lso_num_bufs * sizeof (eib_lsobuf_t),
804 	    KM_SLEEP);
805 
806 	/*
807 	 * Set up the lso buf chain
808 	 */
809 	memp = lsomem;
810 	elem = bkt->bk_bufl;
811 	for (i = 0; i < eib_lso_num_bufs; i++) {
812 		elem->lb_isfree = 1;
813 		elem->lb_buf = memp;
814 		elem->lb_next = elem + 1;
815 
816 		tail = elem;
817 
818 		memp += EIB_LSO_BUFSZ;
819 		elem++;
820 	}
821 	tail->lb_next = NULL;
822 
823 	/*
824 	 * Set up the LSO buffer information in eib state
825 	 */
826 	bkt->bk_free_head = bkt->bk_bufl;
827 	bkt->bk_mem = lsomem;
828 	bkt->bk_nelem = eib_lso_num_bufs;
829 	bkt->bk_nfree = bkt->bk_nelem;
830 
831 	mutex_init(&bkt->bk_lock, NULL, MUTEX_DRIVER, NULL);
832 	cv_init(&bkt->bk_cv, NULL, CV_DEFAULT, NULL);
833 
834 	ss->ei_lso = bkt;
835 
836 	/*
837 	 * Before returning, create a kernel thread to monitor the status
838 	 * of lso bufs
839 	 */
840 	kt = thread_create(NULL, 0, eib_monitor_lso_bufs, ss, 0,
841 	    &p0, TS_RUN, minclsyspri);
842 	ss->ei_lsobufs_monitor = kt->t_did;
843 
844 	return (EIB_E_SUCCESS);
845 }
846 
847 static void
eib_rsrc_init_wqe_pool(eib_t * ss,eib_wqe_pool_t ** wpp,ib_memlen_t bufsz,int wp_type)848 eib_rsrc_init_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp, ib_memlen_t bufsz,
849     int wp_type)
850 {
851 	eib_wqe_pool_t *wp;
852 	uint_t wp_wqesz;
853 	int i;
854 
855 	ASSERT(wpp != NULL);
856 	ASSERT(*wpp == NULL);
857 
858 	/*
859 	 * Allocate the wqe pool, wqes and bufs
860 	 */
861 	wp = kmem_zalloc(sizeof (eib_wqe_pool_t), KM_SLEEP);
862 	wp_wqesz = EIB_WQES_PER_POOL * sizeof (eib_wqe_t);
863 	wp->wp_wqe = (eib_wqe_t *)kmem_zalloc(wp_wqesz, KM_SLEEP);
864 	wp->wp_memsz = EIB_WQES_PER_POOL * bufsz;
865 	wp->wp_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(wp->wp_memsz,
866 	    KM_SLEEP);
867 	wp->wp_ss = ss;
868 	wp->wp_type = wp_type;
869 	wp->wp_nfree_lwm = (wp_type == EIB_WP_TYPE_TX) ?
870 	    EIB_NFREE_SWQES_LWM : EIB_NFREE_RWQES_LWM;
871 
872 	/*
873 	 * Initialize the lock and bitmaps: everything is available at first,
874 	 * but note that if the number of blocks per pool is less than 64, we
875 	 * need to initialize those extra bits as "unavailable" - these will
876 	 * remain unavailable throughout.
877 	 */
878 	mutex_init(&wp->wp_lock, NULL, MUTEX_DRIVER, NULL);
879 	cv_init(&wp->wp_cv, NULL, CV_DEFAULT, NULL);
880 
881 	wp->wp_nfree = EIB_WQES_PER_POOL;
882 	wp->wp_free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) :
883 	    (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1);
884 	for (i = 0; i < EIB_BLKS_PER_POOL; i++)
885 		wp->wp_free_wqes[i] = ~0;
886 
887 	*wpp = wp;
888 }
889 
890 /*ARGSUSED*/
891 static void
eib_rsrc_fini_wqe_pool(eib_t * ss,eib_wqe_pool_t ** wpp)892 eib_rsrc_fini_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp)
893 {
894 	eib_wqe_pool_t *wp;
895 
896 	ASSERT(wpp != NULL);
897 
898 	wp = *wpp;
899 	ASSERT(*wpp != NULL);
900 
901 	cv_destroy(&wp->wp_cv);
902 	mutex_destroy(&wp->wp_lock);
903 
904 	kmem_free((void *)(uintptr_t)(wp->wp_vaddr), wp->wp_memsz);
905 	kmem_free(wp->wp_wqe, EIB_WQES_PER_POOL * sizeof (eib_wqe_t));
906 	kmem_free(wp, sizeof (eib_wqe_pool_t));
907 
908 	*wpp = NULL;
909 }
910 
911 /*ARGSUSED*/
912 static boolean_t
eib_rsrc_ok_to_free_pool(eib_t * ss,eib_wqe_pool_t * wp,boolean_t force)913 eib_rsrc_ok_to_free_pool(eib_t *ss, eib_wqe_pool_t *wp, boolean_t force)
914 {
915 	uint64_t free_blks;
916 	int i;
917 
918 	/*
919 	 * See if we can release all memory allocated for buffers, wqes and
920 	 * the pool.  Note that in the case of data channel rx buffers, some
921 	 * of the buffers may not be free if the nw layer is holding on to
922 	 * them still.  If this is the case, we cannot free the wqe pool now
923 	 * or a subsequent access by the nw layer to the buffers will cause
924 	 * a panic.
925 	 */
926 	ASSERT(wp != NULL);
927 
928 	/*
929 	 * If force-free flag is set, we can always release the memory.
930 	 * Note that this flag is unused currently, and should be removed.
931 	 */
932 	if (force == B_TRUE)
933 		return (B_TRUE);
934 
935 	mutex_enter(&wp->wp_lock);
936 
937 	/*
938 	 * If a whole block remains allocated, obviously we cannot free
939 	 * the pool
940 	 */
941 	free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) :
942 	    (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1);
943 	if (wp->wp_free_blks != free_blks) {
944 		mutex_exit(&wp->wp_lock);
945 		return (B_FALSE);
946 	}
947 
948 	/*
949 	 * If even a single wqe within any one block remains in-use, we
950 	 * cannot free the pool
951 	 */
952 	for (i = 0; i < EIB_BLKS_PER_POOL; i++) {
953 		if (wp->wp_free_wqes[i] != (~0)) {
954 			mutex_exit(&wp->wp_lock);
955 			return (B_FALSE);
956 		}
957 	}
958 
959 	mutex_exit(&wp->wp_lock);
960 
961 	return (B_TRUE);
962 }
963 
964 /*ARGSUSED*/
965 static int
eib_rsrc_grab_wqes(eib_t * ss,eib_wqe_pool_t * wp,eib_wqe_t ** wqes,uint_t n_req,uint_t * actual,int pri)966 eib_rsrc_grab_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes,
967     uint_t n_req, uint_t *actual, int pri)
968 {
969 	uint_t n_allocd = 0;
970 	int blk;
971 	int ndx;
972 	int wqe_ndx;
973 
974 	ASSERT(wp != NULL);
975 	ASSERT(wqes != NULL);
976 
977 	mutex_enter(&wp->wp_lock);
978 
979 	/*
980 	 * If this is a low priority request, adjust the number requested
981 	 * so we don't allocate beyond the low-water-mark
982 	 */
983 	if (pri == EIB_WPRI_LO) {
984 		if (wp->wp_nfree <= wp->wp_nfree_lwm)
985 			n_req = 0;
986 		else if ((wp->wp_nfree - n_req) < wp->wp_nfree_lwm)
987 			n_req = wp->wp_nfree - wp->wp_nfree_lwm;
988 	}
989 
990 	for (n_allocd = 0;  n_allocd < n_req; n_allocd++) {
991 		/*
992 		 * If the entire pool is unavailable, quit
993 		 */
994 		if (wp->wp_free_blks == 0)
995 			break;
996 
997 		/*
998 		 * Find the first wqe that's available
999 		 */
1000 		blk = EIB_FIND_LSB_SET(wp->wp_free_blks);
1001 		ASSERT(blk != -1);
1002 		ndx = EIB_FIND_LSB_SET(wp->wp_free_wqes[blk]);
1003 		ASSERT(ndx != -1);
1004 
1005 		/*
1006 		 * Mark the wqe as allocated
1007 		 */
1008 		wp->wp_free_wqes[blk] &= (~((uint64_t)1 << ndx));
1009 
1010 		/*
1011 		 * If this was the last free wqe in this block, mark
1012 		 * the block itself as unavailable
1013 		 */
1014 		if (wp->wp_free_wqes[blk] == 0)
1015 			wp->wp_free_blks &= (~((uint64_t)1 << blk));
1016 
1017 		/*
1018 		 * Return this wqe to the caller
1019 		 */
1020 		wqe_ndx = blk * EIB_WQES_PER_BLK + ndx;
1021 		wqes[n_allocd] = &(wp->wp_wqe[wqe_ndx]);
1022 	}
1023 
1024 	wp->wp_nfree -= n_allocd;
1025 
1026 	mutex_exit(&wp->wp_lock);
1027 
1028 	if (n_allocd == 0)
1029 		return (EIB_E_FAILURE);
1030 
1031 	if (actual) {
1032 		*actual = n_allocd;
1033 	}
1034 
1035 	return (EIB_E_SUCCESS);
1036 }
1037 
1038 /*ARGSUSED*/
1039 static void
eib_rsrc_return_wqes(eib_t * ss,eib_wqe_pool_t * wp,eib_wqe_t ** wqes,uint_t n_wqes)1040 eib_rsrc_return_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes,
1041     uint_t n_wqes)
1042 {
1043 	eib_wqe_t *wqe;
1044 	uint_t n_freed = 0;
1045 	uint_t blk;
1046 	uint_t ndx;
1047 
1048 	ASSERT(wp != NULL);
1049 	ASSERT(wqes != NULL);
1050 
1051 	mutex_enter(&wp->wp_lock);
1052 	for (n_freed = 0;  n_freed < n_wqes; n_freed++) {
1053 		wqe = wqes[n_freed];
1054 
1055 		/*
1056 		 * This wqe is being returned back to the pool, so clear
1057 		 * any wqe flags and reset buffer address and size in the
1058 		 * single segment sgl back to what they were initially.
1059 		 * Also erase any mblk pointer and callback function ptrs.
1060 		 */
1061 		wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)wqe->qe_cpbuf;
1062 		wqe->qe_sgl.ds_len = wqe->qe_bufsz;
1063 		wqe->qe_mp = NULL;
1064 		wqe->qe_chan = NULL;
1065 		wqe->qe_vnic_inst = -1;
1066 		wqe->qe_info &= (~EIB_WQEFLGS_MASK);
1067 
1068 		/*
1069 		 * Mark the wqe free in its block
1070 		 */
1071 		blk = EIB_WQE_BLK(wqe->qe_info);
1072 		ndx = EIB_WQE_NDX(wqe->qe_info);
1073 
1074 		wp->wp_free_wqes[blk] |= ((uint64_t)1 << ndx);
1075 
1076 		/*
1077 		 * This block now has atleast one wqe free, so mark
1078 		 * the block itself as available and move on to the
1079 		 * next wqe to free
1080 		 */
1081 		wp->wp_free_blks |= ((uint64_t)1 << blk);
1082 	}
1083 
1084 	wp->wp_nfree += n_freed;
1085 
1086 	/*
1087 	 * If the number of available wqes in the pool has just crossed
1088 	 * the high-water-mark, wakeup anyone who may be sleeping on it.
1089 	 */
1090 	if ((wp->wp_type == EIB_WP_TYPE_TX) &&
1091 	    ((wp->wp_nfree - n_freed) < EIB_NFREE_SWQES_HWM) &&
1092 	    (wp->wp_nfree >= EIB_NFREE_SWQES_HWM)) {
1093 		cv_broadcast(&wp->wp_cv);
1094 	}
1095 
1096 	mutex_exit(&wp->wp_lock);
1097 }
1098 
1099 static void
eib_rb_rsrc_setup_txbufs(eib_t * ss,boolean_t force)1100 eib_rb_rsrc_setup_txbufs(eib_t *ss, boolean_t force)
1101 {
1102 	eib_wqe_pool_t *wp = ss->ei_tx;
1103 	eib_wqe_t *wqe;
1104 	ibt_ud_dest_hdl_t dest;
1105 	ibt_status_t ret;
1106 	uint8_t *plhdr;
1107 	int i;
1108 
1109 	if (wp == NULL)
1110 		return;
1111 
1112 	/*
1113 	 * Check if it's ok to free the tx wqe pool (i.e. all buffers have
1114 	 * been reclaimed) and if so, stop the txwqe monitor thread (and wait
1115 	 * for it to die), release the UD destination handles, deregister
1116 	 * memory and fini the wqe pool.
1117 	 */
1118 	if (eib_rsrc_ok_to_free_pool(ss, wp, force)) {
1119 		eib_stop_monitor_tx_wqes(ss);
1120 
1121 		for (i = 0; i < EIB_WQES_PER_POOL; i++) {
1122 			wqe = &wp->wp_wqe[i];
1123 			if ((plhdr = wqe->qe_payload_hdr) != NULL) {
1124 				kmem_free(plhdr, EIB_MAX_PAYLOAD_HDR_SZ);
1125 			}
1126 			if ((dest = wqe->qe_dest) != NULL) {
1127 				ret = ibt_free_ud_dest(dest);
1128 				if (ret != IBT_SUCCESS) {
1129 					EIB_DPRINTF_WARN(ss->ei_instance,
1130 					    "eib_rb_rsrc_setup_txbufs: "
1131 					    "ibt_free_ud_dest() failed, ret=%d",
1132 					    ret);
1133 				}
1134 			}
1135 		}
1136 		if (wp->wp_mr) {
1137 			if ((ret = ibt_deregister_mr(ss->ei_hca_hdl,
1138 			    wp->wp_mr)) != IBT_SUCCESS) {
1139 				EIB_DPRINTF_WARN(ss->ei_instance,
1140 				    "eib_rb_rsrc_setup_txbufs: "
1141 				    "ibt_deregister_mr() failed, ret=%d", ret);
1142 			}
1143 			wp->wp_mr = NULL;
1144 		}
1145 		eib_rsrc_fini_wqe_pool(ss, &ss->ei_tx);
1146 	}
1147 }
1148 
1149 void
eib_rb_rsrc_setup_rxbufs(eib_t * ss,boolean_t force)1150 eib_rb_rsrc_setup_rxbufs(eib_t *ss, boolean_t force)
1151 {
1152 	eib_wqe_pool_t *rx = ss->ei_rx;
1153 	ibt_status_t ret;
1154 
1155 	if (rx == NULL)
1156 		return;
1157 
1158 	/*
1159 	 * Check if it's ok to free the rx wqe pool (i.e. all buffers have
1160 	 * been reclaimed) and if so, deregister memory and fini the wqe pool.
1161 	 */
1162 	if (eib_rsrc_ok_to_free_pool(ss, rx, force)) {
1163 		if (rx->wp_mr) {
1164 			if ((ret = ibt_deregister_mr(ss->ei_hca_hdl,
1165 			    rx->wp_mr)) != IBT_SUCCESS) {
1166 				EIB_DPRINTF_WARN(ss->ei_instance,
1167 				    "eib_rb_rsrc_setup_rxbufs: "
1168 				    "ibt_deregister_mr() failed, ret=%d", ret);
1169 			}
1170 			rx->wp_mr = NULL;
1171 		}
1172 
1173 		eib_rsrc_fini_wqe_pool(ss, &ss->ei_rx);
1174 	}
1175 }
1176 
1177 static void
eib_rb_rsrc_setup_lsobufs(eib_t * ss,boolean_t force)1178 eib_rb_rsrc_setup_lsobufs(eib_t *ss, boolean_t force)
1179 {
1180 	eib_lsobkt_t *bkt;
1181 	ibt_status_t ret;
1182 
1183 	/*
1184 	 * Remove the lso bucket from the state
1185 	 */
1186 	if ((bkt = ss->ei_lso) == NULL)
1187 		return;
1188 
1189 	/*
1190 	 * Try to stop the lso bufs monitor thread. If we fail, we simply
1191 	 * return.  We'll have another shot at it later from detach() with
1192 	 * the force flag set.
1193 	 */
1194 	if (eib_stop_monitor_lso_bufs(ss, force) != EIB_E_SUCCESS)
1195 		return;
1196 
1197 	/*
1198 	 * Free the buflist
1199 	 */
1200 	if (bkt->bk_bufl) {
1201 		kmem_free(bkt->bk_bufl, bkt->bk_nelem * sizeof (eib_lsobuf_t));
1202 		bkt->bk_bufl = NULL;
1203 	}
1204 
1205 	/*
1206 	 * Deregister LSO memory and free it
1207 	 */
1208 	if (bkt->bk_mr_hdl) {
1209 		if ((ret = ibt_deregister_mr(ss->ei_hca_hdl,
1210 		    bkt->bk_mr_hdl)) != IBT_SUCCESS) {
1211 			EIB_DPRINTF_WARN(ss->ei_instance,
1212 			    "eib_rb_rsrc_setup_lsobufs: "
1213 			    "ibt_deregister_mr() failed, ret=%d", ret);
1214 		}
1215 		bkt->bk_mr_hdl = NULL;
1216 	}
1217 	if (bkt->bk_mem) {
1218 		kmem_free(bkt->bk_mem, bkt->bk_nelem * EIB_LSO_BUFSZ);
1219 		bkt->bk_mem = NULL;
1220 	}
1221 
1222 	/*
1223 	 * Destroy the mutex and condvar
1224 	 */
1225 	cv_destroy(&bkt->bk_cv);
1226 	mutex_destroy(&bkt->bk_lock);
1227 
1228 	/*
1229 	 * Finally, free the lso bucket itself
1230 	 */
1231 	kmem_free(bkt, sizeof (eib_lsobkt_t));
1232 	ss->ei_lso = NULL;
1233 }
1234