xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_ring.c (revision 0207f820281e2416190c7ed5f1cb4d11188c082b)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/atomic.h>
27 #include <sys/cpuvar.h>
28 #include <sys/sdt.h>
29 
30 #include <sys/pattr.h>
31 #include <sys/dlpi.h>
32 
33 #include <sys/mac_provider.h>
34 
35 #include <sys/random.h>
36 
37 #include <mlxcx.h>
38 
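/*
 * Lock ordering, as far as it can be observed in this file: the port
 * mutex is taken before the group mutex; the EQ mutex is taken before
 * the CQ mutex, which in turn is taken before the WQ mutex. The
 * interrupt handlers take the EQ lock first, so the teardown paths
 * below drop and re-take the CQ lock where needed to match them.
 */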
39 boolean_t
40 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
41 {
42 	ddi_device_acc_attr_t acc;
43 	ddi_dma_attr_t attr;
44 	boolean_t ret;
45 	size_t sz;
46 
47 	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
48 
49 	/* Receive and send queue entries might be different sizes. */
50 	switch (mlwq->mlwq_type) {
51 	case MLXCX_WQ_TYPE_SENDQ:
52 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
53 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
54 		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
55 		break;
56 	case MLXCX_WQ_TYPE_RECVQ:
57 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
58 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
59 		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
60 		break;
61 	default:
62 		VERIFY(0);
63 		return (B_FALSE);
64 	}
65 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
66 
67 	mlxcx_dma_acc_attr(mlxp, &acc);
68 	mlxcx_dma_queue_attr(mlxp, &attr);
69 
70 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
71 	    B_TRUE, sz, B_TRUE);
72 	if (!ret) {
73 		mlxcx_warn(mlxp, "failed to allocate WQ memory");
74 		return (B_FALSE);
75 	}
76 
77 	/*
78 	 * Just set the first pointer in the union. Yes, this is a strict
79 	 * aliasing violation. No, I don't care.
80 	 */
81 	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
82 
83 	mlxcx_dma_acc_attr(mlxp, &acc);
84 	mlxcx_dma_qdbell_attr(mlxp, &attr);
85 	sz = sizeof (mlxcx_workq_doorbell_t);
86 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
87 	    B_TRUE, sz, B_TRUE);
88 	if (!ret) {
89 		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
90 		mlxcx_dma_free(&mlwq->mlwq_dma);
91 		mlwq->mlwq_send_ent = NULL;
92 		return (B_FALSE);
93 	}
94 
95 	mlwq->mlwq_doorbell =
96 	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;
97 
98 	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;
99 
100 	return (B_TRUE);
101 }
102 
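/*
 * Release the ring and doorbell DMA memory allocated by
 * mlxcx_wq_alloc_dma(). The queue must either never have been created
 * in hardware, or have been destroyed already.
 */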
103 void
104 mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
105 {
106 	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
107 	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
108 		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);
109 
110 	mlxcx_dma_free(&mlwq->mlwq_dma);
111 	mlwq->mlwq_send_ent = NULL;
112 	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
113 	mlwq->mlwq_doorbell = NULL;
114 
115 	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
116 }
117 
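/*
 * Allocate DMA memory for a completion queue ring and its doorbell.
 * Each CQE is pre-initialised with an invalid opcode and the initial
 * owner bit, so software can distinguish entries hardware has written
 * from ones it has not yet touched.
 */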
118 static boolean_t
119 mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
120     uint_t ent_shift)
121 {
122 	ddi_device_acc_attr_t acc;
123 	ddi_dma_attr_t attr;
124 	boolean_t ret;
125 	size_t sz, i;
126 
127 	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
128 
129 	mlcq->mlcq_entshift = ent_shift;
130 	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
131 	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
132 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
133 
134 	mlxcx_dma_acc_attr(mlxp, &acc);
135 	mlxcx_dma_queue_attr(mlxp, &attr);
136 
137 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
138 	    B_TRUE, sz, B_TRUE);
139 	if (!ret) {
140 		mlxcx_warn(mlxp, "failed to allocate CQ memory");
141 		return (B_FALSE);
142 	}
143 
144 	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;
145 
146 	for (i = 0; i < mlcq->mlcq_nents; ++i) {
147 		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
148 		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
149 	}
150 
151 	mlxcx_dma_acc_attr(mlxp, &acc);
152 	mlxcx_dma_qdbell_attr(mlxp, &attr);
153 	sz = sizeof (mlxcx_completionq_doorbell_t);
154 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
155 	    B_TRUE, sz, B_TRUE);
156 	if (!ret) {
157 		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
158 		mlxcx_dma_free(&mlcq->mlcq_dma);
159 		mlcq->mlcq_ent = NULL;
160 		return (B_FALSE);
161 	}
162 
163 	mlcq->mlcq_doorbell =
164 	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;
165 
166 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ALLOC);
167 
168 	return (B_TRUE);
169 }
170 
171 static void
172 mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
173 {
174 	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
175 	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
176 		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
177 
178 	mlxcx_dma_free(&mlcq->mlcq_dma);
179 	mlcq->mlcq_ent = NULL;
180 	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
181 	mlcq->mlcq_doorbell = NULL;
182 
183 	atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ALLOC);
184 }
185 
186 void
187 mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
188 {
189 	mlxcx_completion_queue_t *mlcq;
190 
191 	/*
192 	 * If something is holding the lock for a long operation, like a
193 	 * refill, setting this flag asks it to exit early if possible.
194 	 */
195 	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);
196 
197 	mutex_enter(&mlwq->mlwq_mtx);
198 
199 	list_remove(&mlxp->mlx_wqs, mlwq);
200 
201 	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
202 	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
203 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
204 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
205 		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
206 			mlxcx_warn(mlxp, "failed to stop "
207 			    "recv queue num %x", mlwq->mlwq_num);
208 		}
209 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
210 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
211 		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
212 			mlxcx_warn(mlxp, "failed to stop "
213 			    "send queue num %x", mlwq->mlwq_num);
214 		}
215 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
216 		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
217 			mlxcx_warn(mlxp, "failed to destroy "
218 			    "recv queue num %x", mlwq->mlwq_num);
219 		}
220 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
221 		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
222 			mlxcx_warn(mlxp, "failed to destroy "
223 			    "send queue num %x", mlwq->mlwq_num);
224 		}
225 	}
226 	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
227 		mlxcx_wq_rele_dma(mlxp, mlwq);
228 	}
229 	mlcq = mlwq->mlwq_cq;
230 
231 	/* These will be released by mlxcx_teardown_bufs() */
232 	mlwq->mlwq_bufs = NULL;
233 	mlwq->mlwq_foreign_bufs = NULL;
234 
235 	mutex_exit(&mlwq->mlwq_mtx);
236 
237 	mutex_enter(&mlcq->mlcq_mtx);
238 	mutex_enter(&mlwq->mlwq_mtx);
239 	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
240 	mlcq->mlcq_wq = NULL;
241 	mutex_exit(&mlwq->mlwq_mtx);
242 	mutex_exit(&mlcq->mlcq_mtx);
243 
244 	mutex_destroy(&mlwq->mlwq_mtx);
245 }
246 
247 void
248 mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
249 {
250 	mlxcx_event_queue_t *mleq;
251 	mlxcx_buffer_t *b;
252 
253 	/*
254 	 * If something is holding the lock for a long operation, like
255 	 * polling (which we're going to abort anyway), this flag asks it
256 	 * to exit early if possible.
257 	 */
258 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);
259 
260 	mutex_enter(&mlcq->mlcq_mtx);
261 
262 	list_remove(&mlxp->mlx_cqs, mlcq);
263 
264 	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
265 	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
266 		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
267 			mlxcx_warn(mlxp, "failed to destroy "
268 			    "completion queue num %u",
269 			    mlcq->mlcq_num);
270 		}
271 	}
272 	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
273 		mlxcx_cq_rele_dma(mlxp, mlcq);
274 	}
275 	/*
276 	 * If we're on an EQ AVL tree, then we need to grab
277 	 * the EQ's mutex to take it off. The ISR always takes
278 	 * EQ mutex before CQ mutex, so we have to let go of
279 	 * the CQ mutex then come back again.
280 	 *
281 	 * The ISR will bail out if it tries to touch this CQ now since
282 	 * we added the CQ_DESTROYED flag above.
283 	 */
284 	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
285 		mleq = mlcq->mlcq_eq;
286 	} else {
287 		mleq = NULL;
288 	}
289 
290 	/* Return any outstanding buffers to the free pool. */
291 	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
292 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
293 	}
294 	mutex_enter(&mlcq->mlcq_bufbmtx);
295 	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
296 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
297 	}
298 	mutex_exit(&mlcq->mlcq_bufbmtx);
299 
300 	/*
301 	 * Since the interrupt handlers take the EQ lock before the CQ one,
302 	 * we must do the same here. That means letting go of the lock
303 	 * for a brief window here (we'll double-check the state when we
304 	 * get back in).
305 	 */
306 	mutex_exit(&mlcq->mlcq_mtx);
307 
308 	if (mleq != NULL) {
309 		mutex_enter(&mleq->mleq_mtx);
310 		mutex_enter(&mlcq->mlcq_mtx);
311 		/*
312 		 * Double-check the state, since we let go of the
313 		 * mutex briefly.
314 		 */
315 		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
316 			avl_remove(&mleq->mleq_cqs, mlcq);
317 			atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_EQAVL);
318 		}
319 		mutex_exit(&mlcq->mlcq_mtx);
320 		mutex_exit(&mleq->mleq_mtx);
321 	}
322 
323 	mutex_enter(&mlcq->mlcq_mtx);
324 	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
325 	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
326 	mutex_exit(&mlcq->mlcq_mtx);
327 
328 	mutex_destroy(&mlcq->mlcq_mtx);
329 	mutex_destroy(&mlcq->mlcq_arm_mtx);
330 	mutex_destroy(&mlcq->mlcq_bufbmtx);
331 	list_destroy(&mlcq->mlcq_buffers);
332 	list_destroy(&mlcq->mlcq_buffers_b);
333 	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
334 }
335 
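/*
 * Allocate and create a completion queue on the given event queue, add
 * it to the EQ's AVL tree of CQs, and arm it.
 */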
336 static boolean_t
337 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
338     mlxcx_completion_queue_t **cqp, uint_t ent_shift)
339 {
340 	mlxcx_completion_queue_t *cq;
341 
342 	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
343 	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
344 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
345 	mutex_init(&cq->mlcq_arm_mtx, NULL, MUTEX_DRIVER,
346 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
347 	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
348 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
349 	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
350 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
351 	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
352 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
353 
354 	cq->mlcq_mlx = mlxp;
355 	list_insert_tail(&mlxp->mlx_cqs, cq);
356 
357 	mutex_enter(&cq->mlcq_mtx);
358 
359 	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
360 		mutex_exit(&cq->mlcq_mtx);
361 		return (B_FALSE);
362 	}
363 
364 	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
365 	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;
366 
367 	cq->mlcq_uar = &mlxp->mlx_uar;
368 	cq->mlcq_eq = eq;
369 
370 	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
371 	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;
372 
373 	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
374 		mutex_exit(&cq->mlcq_mtx);
375 		return (B_FALSE);
376 	}
377 
378 	mutex_exit(&cq->mlcq_mtx);
379 
380 	mutex_enter(&eq->mleq_mtx);
381 	mutex_enter(&cq->mlcq_mtx);
382 	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
383 	avl_add(&eq->mleq_cqs, cq);
384 	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_EQAVL);
385 	mlxcx_arm_cq(mlxp, cq);
386 	mutex_exit(&cq->mlcq_mtx);
387 	mutex_exit(&eq->mleq_mtx);
388 
389 	*cqp = cq;
390 	return (B_TRUE);
391 }
392 
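/*
 * Set up a receive work queue on the given CQ: allocate its buffer
 * shard and DMA memory, then create the RQ in hardware.
 */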
393 static boolean_t
394 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
395     mlxcx_work_queue_t *wq)
396 {
397 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
398 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
399 
400 	list_insert_tail(&mlxp->mlx_wqs, wq);
401 
402 	mutex_enter(&wq->mlwq_mtx);
403 
404 	wq->mlwq_mlx = mlxp;
405 	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
406 	wq->mlwq_cq = cq;
407 	wq->mlwq_pd = &mlxp->mlx_pd;
408 	wq->mlwq_uar = &mlxp->mlx_uar;
409 
410 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
411 
412 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
413 		mutex_exit(&wq->mlwq_mtx);
414 		return (B_FALSE);
415 	}
416 
417 	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
418 		mutex_exit(&wq->mlwq_mtx);
419 		return (B_FALSE);
420 	}
421 
422 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
423 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
424 
425 	mutex_exit(&wq->mlwq_mtx);
426 
427 	mutex_enter(&cq->mlcq_mtx);
428 	mutex_enter(&wq->mlwq_mtx);
429 	ASSERT3P(cq->mlcq_wq, ==, NULL);
430 	cq->mlcq_wq = wq;
431 	mutex_exit(&wq->mlwq_mtx);
432 	mutex_exit(&cq->mlcq_mtx);
433 
434 	return (B_TRUE);
435 }
436 
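/*
 * Set up a send work queue on the given CQ and TIS. Send queues keep a
 * second shard of "foreign" buffers as well (see
 * mlxcx_buf_create_foreign() in mlxcx_tx_ring_start()).
 */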
437 static boolean_t
438 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
439     mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
440 {
441 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
442 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
443 
444 	list_insert_tail(&mlxp->mlx_wqs, wq);
445 
446 	mutex_enter(&wq->mlwq_mtx);
447 
448 	wq->mlwq_mlx = mlxp;
449 	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
450 	wq->mlwq_cq = cq;
451 	wq->mlwq_pd = &mlxp->mlx_pd;
452 	wq->mlwq_uar = &mlxp->mlx_uar;
453 	wq->mlwq_tis = tis;
454 
455 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
456 	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);
457 
458 	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
459 	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;
460 
461 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
462 		mutex_exit(&wq->mlwq_mtx);
463 		return (B_FALSE);
464 	}
465 
466 	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
467 		mutex_exit(&wq->mlwq_mtx);
468 		return (B_FALSE);
469 	}
470 
471 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
472 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
473 
474 	mutex_exit(&wq->mlwq_mtx);
475 
476 	mutex_enter(&cq->mlcq_mtx);
477 	mutex_enter(&wq->mlwq_mtx);
478 	ASSERT3P(cq->mlcq_wq, ==, NULL);
479 	cq->mlcq_wq = wq;
480 	mutex_exit(&wq->mlwq_mtx);
481 	mutex_exit(&cq->mlcq_mtx);
482 
483 	return (B_TRUE);
484 }
485 
486 /*
487  * Before we tear down the queues associated with the rx group,
488  * flag each cq as being torn down and wake up any tasks.
489  */
490 static void
491 mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
492 {
493 	mlxcx_work_queue_t *wq;
494 	mlxcx_completion_queue_t *cq;
495 	mlxcx_buf_shard_t *s;
496 	uint_t i;
497 
498 	mutex_enter(&g->mlg_mtx);
499 
500 	for (i = 0; i < g->mlg_nwqs; ++i) {
501 		wq = &g->mlg_wqs[i];
502 		cq = wq->mlwq_cq;
503 		if (cq != NULL) {
504 			s = wq->mlwq_bufs;
505 			mutex_enter(&s->mlbs_mtx);
506 			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
507 			cv_broadcast(&s->mlbs_free_nonempty);
508 			mutex_exit(&s->mlbs_mtx);
509 		}
510 	}
511 
512 	mutex_exit(&g->mlg_mtx);
513 }
514 
515 void
516 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
517 {
518 	mlxcx_work_queue_t *wq;
519 	mlxcx_completion_queue_t *cq;
520 	mlxcx_flow_entry_t *fe;
521 	mlxcx_flow_group_t *fg;
522 	mlxcx_flow_table_t *ft;
523 	uint_t i;
524 
525 	mutex_enter(&g->mlg_port->mlp_mtx);
526 	mutex_enter(&g->mlg_mtx);
527 
528 	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
529 		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);
530 
531 		if (g->mlg_rx_vlan_ft != NULL)
532 			mlxcx_remove_all_vlan_entries(mlxp, g);
533 
534 		if (g == &mlxp->mlx_rx_groups[0]) {
535 			ft = g->mlg_port->mlp_rx_flow;
536 			mutex_enter(&ft->mlft_mtx);
537 
538 			fg = g->mlg_port->mlp_bcast;
539 			fe = list_head(&fg->mlfg_entries);
540 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
541 				(void) mlxcx_cmd_delete_flow_table_entry(
542 				    mlxp, fe);
543 			}
544 
545 			fg = g->mlg_port->mlp_promisc;
546 			fe = list_head(&fg->mlfg_entries);
547 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
548 				(void) mlxcx_cmd_delete_flow_table_entry(
549 				    mlxp, fe);
550 			}
551 
552 			mutex_exit(&ft->mlft_mtx);
553 		}
554 
555 		if (g->mlg_rx_vlan_ft != NULL) {
556 			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
557 			ASSERT(list_is_empty(&g->mlg_rx_vlans));
558 			fg = g->mlg_rx_vlan_def_fg;
559 			if (fg != NULL) {
560 				fe = list_head(&fg->mlfg_entries);
561 				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
562 					(void)
563 					    mlxcx_cmd_delete_flow_table_entry(
564 					    mlxp, fe);
565 				}
566 			}
567 			fg = g->mlg_rx_vlan_promisc_fg;
568 			if (fg != NULL) {
569 				fe = list_head(&fg->mlfg_entries);
570 				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
571 					(void)
572 					    mlxcx_cmd_delete_flow_table_entry(
573 					    mlxp, fe);
574 				}
575 			}
576 			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
577 			list_destroy(&g->mlg_rx_vlans);
578 
579 			g->mlg_rx_vlan_ft = NULL;
580 		}
581 
582 		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
583 		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
584 		g->mlg_rx_hash_ft = NULL;
585 
586 		avl_destroy(&g->mlg_rx_macs);
587 		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
588 	}
589 
590 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
591 		for (i = 0; i < g->mlg_nwqs; ++i) {
592 			wq = &g->mlg_wqs[i];
593 			mutex_enter(&wq->mlwq_mtx);
594 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
595 			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
596 				mlxcx_warn(mlxp, "failed to stop rq %x",
597 				    wq->mlwq_num);
598 			}
599 			mutex_exit(&wq->mlwq_mtx);
600 		}
601 		taskq_destroy(g->mlg_refill_tq);
602 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
603 	}
604 
605 	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
606 		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
607 			mlxcx_tir_t *tir = &g->mlg_tir[i];
608 			if (tir->mltir_state & MLXCX_TIR_CREATED &&
609 			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
610 				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
611 					mlxcx_warn(mlxp,
612 					    "failed to destroy tir %u "
613 					    "for rx ring", tir->mltir_num);
614 				}
615 			}
616 		}
617 		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
618 	}
619 
620 	if (g->mlg_state & MLXCX_GROUP_RQT) {
621 		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
622 		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
623 			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
624 				mlxcx_warn(mlxp, "failed to destroy rqt %u "
625 				    "for rx ring", g->mlg_rqt->mlrqt_num);
626 			}
627 			kmem_free(g->mlg_rqt->mlrqt_rq,
628 			    g->mlg_rqt->mlrqt_rq_size);
629 			g->mlg_rqt->mlrqt_rq = NULL;
630 			kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
631 			g->mlg_rqt = NULL;
632 		}
633 		g->mlg_state &= ~MLXCX_GROUP_RQT;
634 	}
635 
636 	for (i = 0; i < g->mlg_nwqs; ++i) {
637 		wq = &g->mlg_wqs[i];
638 		cq = wq->mlwq_cq;
639 		mlxcx_wq_teardown(mlxp, wq);
640 		if (cq != NULL)
641 			mlxcx_cq_teardown(mlxp, cq);
642 	}
643 	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
644 	g->mlg_wqs = NULL;
645 	g->mlg_state &= ~MLXCX_GROUP_WQS;
646 
647 	mutex_exit(&g->mlg_mtx);
648 	mutex_exit(&g->mlg_port->mlp_mtx);
649 
650 	mutex_destroy(&g->mlg_mtx);
651 
652 	g->mlg_state &= ~MLXCX_GROUP_INIT;
653 	ASSERT3S(g->mlg_state, ==, 0);
654 }
655 
656 void
657 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
658 {
659 	mlxcx_work_queue_t *wq;
660 	mlxcx_completion_queue_t *cq;
661 	uint_t i;
662 
663 	mutex_enter(&g->mlg_mtx);
664 
665 	if (g->mlg_state & MLXCX_GROUP_WQS) {
666 		for (i = 0; i < g->mlg_nwqs; ++i) {
667 			wq = &g->mlg_wqs[i];
668 			mutex_enter(&wq->mlwq_mtx);
669 			cq = wq->mlwq_cq;
670 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
671 			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
672 				mlxcx_warn(mlxp, "failed to stop sq %x",
673 				    wq->mlwq_num);
674 			}
675 			mutex_exit(&wq->mlwq_mtx);
676 			mlxcx_wq_teardown(mlxp, wq);
677 			if (cq != NULL)
678 				mlxcx_cq_teardown(mlxp, cq);
679 		}
680 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
681 		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
682 		g->mlg_wqs = NULL;
683 		g->mlg_state &= ~MLXCX_GROUP_WQS;
684 	}
685 
686 	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
687 	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
688 	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
689 		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
690 			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
691 			    g->mlg_tis.mltis_num);
692 		}
693 	}
694 	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
695 
696 	mutex_exit(&g->mlg_mtx);
697 	mutex_destroy(&g->mlg_mtx);
698 	g->mlg_state &= ~MLXCX_GROUP_INIT;
699 	ASSERT3S(g->mlg_state, ==, 0);
700 }
701 
702 void
703 mlxcx_teardown_groups(mlxcx_t *mlxp)
704 {
705 	mlxcx_ring_group_t *g;
706 	uint_t i;
707 
708 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
709 		g = &mlxp->mlx_rx_groups[i];
710 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
711 			continue;
712 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
713 		mlxcx_quiesce_rx_cqs(mlxp, g);
714 	}
715 
716 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
717 		g = &mlxp->mlx_rx_groups[i];
718 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
719 			continue;
720 		mlxcx_teardown_rx_group(mlxp, g);
721 	}
722 
723 	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
724 	mlxp->mlx_rx_groups = NULL;
725 
726 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
727 		g = &mlxp->mlx_tx_groups[i];
728 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
729 			continue;
730 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
731 		mlxcx_teardown_tx_group(mlxp, g);
732 	}
733 
734 	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
735 	mlxp->mlx_tx_groups = NULL;
736 }
737 
738 boolean_t
739 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
740 {
741 	mlxcx_event_queue_t *eq;
742 	mlxcx_completion_queue_t *cq;
743 	mlxcx_work_queue_t *rq;
744 	mlxcx_flow_table_t *ft;
745 	mlxcx_flow_group_t *fg;
746 	mlxcx_flow_entry_t *fe;
747 	uint_t ent_shift;
748 	uint_t i, j;
749 
750 	ASSERT3S(g->mlg_state, ==, 0);
751 
752 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
753 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
754 	mutex_enter(&g->mlg_mtx);
755 	g->mlg_mlx = mlxp;
756 	g->mlg_type = MLXCX_GROUP_RX;
757 	g->mlg_port = &mlxp->mlx_ports[0];
758 	g->mlg_state |= MLXCX_GROUP_INIT;
759 
760 	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
761 	i = g - &mlxp->mlx_rx_groups[0];
762 	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
763 		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;
764 
765 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
766 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
767 	g->mlg_state |= MLXCX_GROUP_WQS;
768 
769 	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
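	/*
	 * Size the RQ table to the smallest power of two (minimum 2) that
	 * can hold all of the group's rings.
	 */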
770 	g->mlg_rqt->mlrqt_max = 2;
771 	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
772 		g->mlg_rqt->mlrqt_max <<= 1;
773 	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
774 	    sizeof (mlxcx_work_queue_t *);
775 	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
776 	g->mlg_state |= MLXCX_GROUP_RQT;
777 
778 	for (i = 0; i < g->mlg_nwqs; ++i) {
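		/*
		 * Round-robin over the completion interrupt EQs, skipping
		 * any which are not available for RX use.
		 */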
779 		eq = NULL;
780 		while (eq == NULL) {
781 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
782 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
783 				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
784 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
785 			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
786 				/* Try the next one */
787 				eq = NULL;
788 			}
789 		}
790 
791 		/*
792 		 * A single completion is indicated for each rq entry as
793 		 * it is used, so the number of cq entries never needs
794 		 * to be larger than the number of rq entries.
795 		 */
796 		ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
797 		    mlxp->mlx_props.mldp_rq_size_shift);
798 		if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
799 			g->mlg_nwqs = i;
800 			break;
801 		}
802 
803 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
804 
805 		rq = &g->mlg_wqs[i];
806 		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
807 			g->mlg_nwqs = i;
808 			break;
809 		}
810 		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
811 		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
812 		rq->mlwq_group = g;
813 	}
814 	if (g->mlg_nwqs == 0) {
815 		mutex_exit(&g->mlg_mtx);
816 		return (B_FALSE);
817 	}
818 
819 	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
820 		mutex_exit(&g->mlg_mtx);
821 		return (B_FALSE);
822 	}
823 
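	/*
	 * One TIR per role: a direct TIR for "other" traffic, which always
	 * lands on the first ring, and indirect (RSS) TIRs which hash
	 * TCP/UDP/IP flows across the group's RQ table.
	 */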
824 	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
825 		mlxcx_tir_t *tir = &g->mlg_tir[i];
826 		tir->mltir_tdom = &mlxp->mlx_tdom;
827 		switch (i) {
828 		case MLXCX_TIR_ROLE_OTHER:
829 			tir->mltir_type = MLXCX_TIR_DIRECT;
830 			tir->mltir_rq = &g->mlg_wqs[0];
831 			break;
832 		case MLXCX_TIR_ROLE_IPv4:
833 		case MLXCX_TIR_ROLE_IPv6:
834 		case MLXCX_TIR_ROLE_TCPv4:
835 		case MLXCX_TIR_ROLE_TCPv6:
836 		case MLXCX_TIR_ROLE_UDPv4:
837 		case MLXCX_TIR_ROLE_UDPv6:
838 			tir->mltir_type = MLXCX_TIR_INDIRECT;
839 			tir->mltir_rqtable = g->mlg_rqt;
840 			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
841 			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
842 			    sizeof (tir->mltir_toeplitz_key));
843 			break;
844 		}
845 		switch (i) {
846 		case MLXCX_TIR_ROLE_OTHER:
847 			break;
848 		case MLXCX_TIR_ROLE_IPv4:
849 		case MLXCX_TIR_ROLE_TCPv4:
850 		case MLXCX_TIR_ROLE_UDPv4:
851 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
852 			tir->mltir_hash_fields =
853 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
854 			break;
855 		case MLXCX_TIR_ROLE_IPv6:
856 		case MLXCX_TIR_ROLE_TCPv6:
857 		case MLXCX_TIR_ROLE_UDPv6:
858 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
859 			tir->mltir_hash_fields =
860 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
861 			break;
862 		}
863 		switch (i) {
864 		case MLXCX_TIR_ROLE_OTHER:
865 		case MLXCX_TIR_ROLE_IPv4:
866 		case MLXCX_TIR_ROLE_IPv6:
867 			break;
868 		case MLXCX_TIR_ROLE_TCPv4:
869 		case MLXCX_TIR_ROLE_TCPv6:
870 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
871 			tir->mltir_hash_fields |=
872 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
873 			break;
874 		case MLXCX_TIR_ROLE_UDPv4:
875 		case MLXCX_TIR_ROLE_UDPv6:
876 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
877 			tir->mltir_hash_fields |=
878 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
879 			break;
880 		}
881 
882 		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
883 			mutex_exit(&g->mlg_mtx);
884 			return (B_FALSE);
885 		}
886 
887 		g->mlg_state |= MLXCX_GROUP_TIRTIS;
888 	}
889 
890 	/*
891 	 * Flow table: our RX hashing breakout table for RSS
892 	 */
893 
894 	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
895 	    KM_SLEEP));
896 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
897 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
898 	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
899 	    sizeof (mlxcx_group_mac_t),
900 	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
901 	g->mlg_state |= MLXCX_GROUP_FLOWS;
902 
903 	mutex_enter(&ft->mlft_mtx);
904 
905 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
906 	ft->mlft_level = 2;
907 	ft->mlft_port = g->mlg_port;
908 	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
909 	ft->mlft_nents = (1 << ft->mlft_entshift);
910 	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
911 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
912 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
913 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
914 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
915 
916 	for (j = 0; j < ft->mlft_nents; ++j) {
917 		ft->mlft_ent[j].mlfe_table = ft;
918 		ft->mlft_ent[j].mlfe_index = j;
919 	}
920 
921 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
922 		mutex_exit(&ft->mlft_mtx);
923 		mutex_exit(&g->mlg_mtx);
924 		return (B_FALSE);
925 	}
926 
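	/*
	 * One single-entry flow group per TIR role, matching on IP version
	 * and L4 protocol where relevant, ending in a catch-all entry that
	 * forwards to the "other" TIR.
	 */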
927 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
928 	list_insert_tail(&ft->mlft_groups, fg);
929 	fg->mlfg_table = ft;
930 	fg->mlfg_size = 1;
931 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
932 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
933 		mutex_exit(&ft->mlft_mtx);
934 		mutex_exit(&g->mlg_mtx);
935 		return (B_FALSE);
936 	}
937 	fe = list_head(&fg->mlfg_entries);
938 	fe->mlfe_ip_version = 6;
939 	fe->mlfe_ip_proto = IPPROTO_UDP;
940 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
941 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
942 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
943 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
944 		mutex_exit(&ft->mlft_mtx);
945 		mutex_exit(&g->mlg_mtx);
946 		return (B_FALSE);
947 	}
948 
949 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
950 	list_insert_tail(&ft->mlft_groups, fg);
951 	fg->mlfg_table = ft;
952 	fg->mlfg_size = 1;
953 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
954 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
955 		mutex_exit(&ft->mlft_mtx);
956 		mutex_exit(&g->mlg_mtx);
957 		return (B_FALSE);
958 	}
959 	fe = list_head(&fg->mlfg_entries);
960 	fe->mlfe_ip_version = 4;
961 	fe->mlfe_ip_proto = IPPROTO_UDP;
962 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
963 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
964 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
965 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
966 		mutex_exit(&ft->mlft_mtx);
967 		mutex_exit(&g->mlg_mtx);
968 		return (B_FALSE);
969 	}
970 
971 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
972 	list_insert_tail(&ft->mlft_groups, fg);
973 	fg->mlfg_table = ft;
974 	fg->mlfg_size = 1;
975 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
976 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
977 		mutex_exit(&ft->mlft_mtx);
978 		mutex_exit(&g->mlg_mtx);
979 		return (B_FALSE);
980 	}
981 	fe = list_head(&fg->mlfg_entries);
982 	fe->mlfe_ip_version = 6;
983 	fe->mlfe_ip_proto = IPPROTO_TCP;
984 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
985 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
986 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
987 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
988 		mutex_exit(&ft->mlft_mtx);
989 		mutex_exit(&g->mlg_mtx);
990 		return (B_FALSE);
991 	}
992 
993 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
994 	list_insert_tail(&ft->mlft_groups, fg);
995 	fg->mlfg_table = ft;
996 	fg->mlfg_size = 1;
997 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
998 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
999 		mutex_exit(&ft->mlft_mtx);
1000 		mutex_exit(&g->mlg_mtx);
1001 		return (B_FALSE);
1002 	}
1003 	fe = list_head(&fg->mlfg_entries);
1004 	fe->mlfe_ip_version = 4;
1005 	fe->mlfe_ip_proto = IPPROTO_TCP;
1006 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1007 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1008 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
1009 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1010 		mutex_exit(&ft->mlft_mtx);
1011 		mutex_exit(&g->mlg_mtx);
1012 		return (B_FALSE);
1013 	}
1014 
1015 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1016 	list_insert_tail(&ft->mlft_groups, fg);
1017 	fg->mlfg_table = ft;
1018 	fg->mlfg_size = 1;
1019 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1020 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1021 		mutex_exit(&ft->mlft_mtx);
1022 		mutex_exit(&g->mlg_mtx);
1023 		return (B_FALSE);
1024 	}
1025 	fe = list_head(&fg->mlfg_entries);
1026 	fe->mlfe_ip_version = 6;
1027 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1028 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1029 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
1030 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1031 		mutex_exit(&ft->mlft_mtx);
1032 		mutex_exit(&g->mlg_mtx);
1033 		return (B_FALSE);
1034 	}
1035 
1036 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1037 	list_insert_tail(&ft->mlft_groups, fg);
1038 	fg->mlfg_table = ft;
1039 	fg->mlfg_size = 1;
1040 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1041 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1042 		mutex_exit(&ft->mlft_mtx);
1043 		mutex_exit(&g->mlg_mtx);
1044 		return (B_FALSE);
1045 	}
1046 	fe = list_head(&fg->mlfg_entries);
1047 	fe->mlfe_ip_version = 4;
1048 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1049 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1050 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
1051 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1052 		mutex_exit(&ft->mlft_mtx);
1053 		mutex_exit(&g->mlg_mtx);
1054 		return (B_FALSE);
1055 	}
1056 
1057 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1058 	list_insert_tail(&ft->mlft_groups, fg);
1059 	fg->mlfg_table = ft;
1060 	fg->mlfg_size = 1;
1061 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1062 		mutex_exit(&ft->mlft_mtx);
1063 		mutex_exit(&g->mlg_mtx);
1064 		return (B_FALSE);
1065 	}
1066 	fe = list_head(&fg->mlfg_entries);
1067 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1068 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1069 	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
1070 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1071 		mutex_exit(&ft->mlft_mtx);
1072 		mutex_exit(&g->mlg_mtx);
1073 		return (B_FALSE);
1074 	}
1075 
1076 	mutex_exit(&ft->mlft_mtx);
1077 
1078 	/*
1079 	 * Flow table: the VLAN breakout table for doing VLAN filtering after
1080 	 * we've matched a MAC address.
1081 	 */
1082 
1083 	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1084 	    KM_SLEEP));
1085 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1086 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1087 	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
1088 	    offsetof(mlxcx_group_vlan_t, mlgv_entry));
1089 
1090 	mutex_enter(&ft->mlft_mtx);
1091 
1092 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1093 	ft->mlft_level = 1;
1094 	ft->mlft_port = g->mlg_port;
1095 	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
1096 	ft->mlft_nents = (1 << ft->mlft_entshift);
1097 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1098 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1099 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1100 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
1101 
1102 	for (j = 0; j < ft->mlft_nents; ++j) {
1103 		fe = &ft->mlft_ent[j];
1104 		fe->mlfe_table = ft;
1105 		fe->mlfe_index = j;
1106 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1107 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1108 	}
1109 
1110 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1111 		mutex_exit(&ft->mlft_mtx);
1112 		mutex_exit(&g->mlg_mtx);
1113 		return (B_FALSE);
1114 	}
1115 
1116 	/* First group is all actual matched VLANs */
1117 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1118 	g->mlg_rx_vlan_fg = fg;
1119 	list_insert_tail(&ft->mlft_groups, fg);
1120 	fg->mlfg_table = ft;
1121 	fg->mlfg_size = ft->mlft_nents - 2;
1122 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
1123 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
1124 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1125 		mutex_exit(&ft->mlft_mtx);
1126 		mutex_exit(&g->mlg_mtx);
1127 		return (B_FALSE);
1128 	}
1129 
1130 	/*
1131 	 * Then the "default" entry which we enable when we have no VLAN IDs
1132 	 * added to the group (we start with this enabled).
1133 	 */
1134 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1135 	g->mlg_rx_vlan_def_fg = fg;
1136 	list_insert_tail(&ft->mlft_groups, fg);
1137 	fg->mlfg_table = ft;
1138 	fg->mlfg_size = 1;
1139 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1140 		mutex_exit(&ft->mlft_mtx);
1141 		mutex_exit(&g->mlg_mtx);
1142 		return (B_FALSE);
1143 	}
1144 	fe = list_head(&fg->mlfg_entries);
1145 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1146 		mutex_exit(&ft->mlft_mtx);
1147 		mutex_exit(&g->mlg_mtx);
1148 		return (B_FALSE);
1149 	}
1150 
1151 	/*
1152 	 * Finally, the promisc entry which points at the *hash ft* from the
1153 	 * default group. We only enable this when we have promisc on.
1154 	 */
1155 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1156 	g->mlg_rx_vlan_promisc_fg = fg;
1157 	list_insert_tail(&ft->mlft_groups, fg);
1158 	fg->mlfg_table = ft;
1159 	fg->mlfg_size = 1;
1160 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1161 		mutex_exit(&ft->mlft_mtx);
1162 		mutex_exit(&g->mlg_mtx);
1163 		return (B_FALSE);
1164 	}
1165 	fe = list_head(&fg->mlfg_entries);
1166 	fe->mlfe_ndest = 1;
1167 	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;
1168 
1169 	mutex_exit(&ft->mlft_mtx);
1170 
1171 	mutex_exit(&g->mlg_mtx);
1172 
1173 	return (B_TRUE);
1174 }
1175 
1176 boolean_t
1177 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1178     mlxcx_work_queue_t *rq)
1179 {
1180 	uint_t j;
1181 	mlxcx_buffer_t *b;
1182 	mlxcx_completion_queue_t *cq;
1183 
1184 	mutex_enter(&g->mlg_mtx);
1185 	/*
1186 	 * Sadly, even though MAC has the mgi_start callback, it is not always
1187 	 * called -- in particular when we are being managed under an aggr, the
1188 	 * mgi_start callback will only ever be called on the default group.
1189 	 *
1190 	 * So instead of asserting about the group state here, we have to
1191 	 * check it and call group start if needed.
1192 	 */
1193 	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
1194 		mutex_exit(&g->mlg_mtx);
1195 		if (!mlxcx_rx_group_start(mlxp, g))
1196 			return (B_FALSE);
1197 		mutex_enter(&g->mlg_mtx);
1198 	}
1199 	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);
1200 
1201 	cq = rq->mlwq_cq;
1202 	ASSERT(cq != NULL);
1203 
1204 	mutex_enter(&cq->mlcq_mtx);
1205 	mutex_enter(&rq->mlwq_mtx);
1206 
1207 	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1208 		mutex_exit(&rq->mlwq_mtx);
1209 		mutex_exit(&cq->mlcq_mtx);
1210 		mutex_exit(&g->mlg_mtx);
1211 		return (B_TRUE);
1212 	}
1213 
1214 	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
1215 		mutex_exit(&rq->mlwq_mtx);
1216 		mutex_exit(&cq->mlcq_mtx);
1217 		mutex_exit(&g->mlg_mtx);
1218 		return (B_FALSE);
1219 	}
1220 	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);
1221 
1222 	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
1223 	rq->mlwq_state |= MLXCX_WQ_BUFFERS;
1224 
1225 	mlxcx_shard_ready(rq->mlwq_bufs);
1226 
1227 	for (j = 0; j < rq->mlwq_nents; ++j) {
1228 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1229 			break;
1230 		mlxcx_buf_return(mlxp, b);
1231 	}
1232 	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
1233 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1234 			break;
1235 		mlxcx_buf_return(mlxp, b);
1236 	}
1237 
1238 	mlxcx_rq_refill(mlxp, rq);
1239 
1240 	mutex_exit(&rq->mlwq_mtx);
1241 	mutex_exit(&cq->mlcq_mtx);
1242 	mutex_exit(&g->mlg_mtx);
1243 
1244 	return (B_TRUE);
1245 }
1246 
1247 boolean_t
1248 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1249 {
1250 	mlxcx_flow_table_t *ft;
1251 	mlxcx_flow_group_t *fg;
1252 	mlxcx_flow_entry_t *fe;
1253 	char tq_name[TASKQ_NAMELEN];
1254 
1255 	mutex_enter(&g->mlg_mtx);
1256 
1257 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
1258 		mutex_exit(&g->mlg_mtx);
1259 		return (B_TRUE);
1260 	}
1261 
1262 	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);
1263 
1264 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1265 
1266 	(void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
1267 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
1268 	    g - &mlxp->mlx_rx_groups[0]);
1269 
1270 	/*
1271 	 * Create one refill taskq per group with one thread per work queue.
1272 	 * The refill task may block waiting for resources, so by effectively
1273 	 * having one thread per work queue we avoid work queues blocking each
1274 	 * other.
1275 	 */
1276 	if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
1277 	    g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
1278 		mlxcx_warn(mlxp, "failed to create rq refill task queue");
1279 		mutex_exit(&g->mlg_mtx);
1280 		return (B_FALSE);
1281 	}
1282 
1283 	if (g == &mlxp->mlx_rx_groups[0]) {
1284 		ft = g->mlg_port->mlp_rx_flow;
1285 		mutex_enter(&ft->mlft_mtx);
1286 
1287 		/*
1288 		 * Broadcast and promisc entries go directly to group 0's
1289 		 * RSS hash fanout flow table. They bypass VLAN filtering.
1290 		 */
1291 		fg = g->mlg_port->mlp_bcast;
1292 		fe = list_head(&fg->mlfg_entries);
1293 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1294 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1295 			mutex_exit(&ft->mlft_mtx);
1296 			g->mlg_state &= ~MLXCX_GROUP_RUNNING;
1297 			taskq_destroy(g->mlg_refill_tq);
1298 			mutex_exit(&g->mlg_mtx);
1299 			return (B_FALSE);
1300 		}
1301 
1302 		fg = g->mlg_port->mlp_promisc;
1303 		fe = list_head(&fg->mlfg_entries);
1304 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1305 		/*
1306 		 * Don't actually set the promisc entry until promisc is
1307 		 * enabled.
1308 		 */
1309 
1310 		mutex_exit(&ft->mlft_mtx);
1311 	}
1312 
1313 	mutex_exit(&g->mlg_mtx);
1314 
1315 	return (B_TRUE);
1316 }
1317 
1318 boolean_t
1319 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1320 {
1321 	mlxcx_event_queue_t *eq;
1322 	mlxcx_completion_queue_t *cq;
1323 	mlxcx_work_queue_t *sq;
1324 	uint_t i;
1325 
1326 	ASSERT3S(g->mlg_state, ==, 0);
1327 
1328 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
1329 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1330 	g->mlg_state |= MLXCX_GROUP_INIT;
1331 	mutex_enter(&g->mlg_mtx);
1332 
1333 	g->mlg_mlx = mlxp;
1334 	g->mlg_type = MLXCX_GROUP_TX;
1335 	g->mlg_port = &mlxp->mlx_ports[0];
1336 
1337 	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
1338 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
1339 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
1340 	g->mlg_state |= MLXCX_GROUP_WQS;
1341 
1342 	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;
1343 
1344 	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
1345 		mutex_exit(&g->mlg_mtx);
1346 		return (B_FALSE);
1347 	}
1348 
1349 	g->mlg_state |= MLXCX_GROUP_TIRTIS;
1350 
1351 	for (i = 0; i < g->mlg_nwqs; ++i) {
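		/*
		 * Round-robin over the completion interrupt EQs, skipping
		 * any which are not available for TX use.
		 */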
1352 		eq = NULL;
1353 		while (eq == NULL) {
1354 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
1355 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
1356 				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
1357 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
1358 			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
1359 				/* Try the next one */
1360 				eq = NULL;
1361 			}
1362 		}
1363 
1364 		if (!mlxcx_cq_setup(mlxp, eq, &cq,
1365 		    mlxp->mlx_props.mldp_cq_size_shift))
1366 			return (B_FALSE);
1367 
1368 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
1369 
1370 		sq = &g->mlg_wqs[i];
1371 		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
1372 			mutex_exit(&g->mlg_mtx);
1373 			return (B_FALSE);
1374 		}
1375 		sq->mlwq_group = g;
1376 	}
1377 
1378 	mutex_exit(&g->mlg_mtx);
1379 
1380 	return (B_TRUE);
1381 }
1382 
1383 boolean_t
1384 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1385     mlxcx_work_queue_t *sq)
1386 {
1387 	uint_t i;
1388 	mlxcx_buffer_t *b;
1389 	mlxcx_completion_queue_t *cq;
1390 
1391 	mutex_enter(&g->mlg_mtx);
1392 
1393 	cq = sq->mlwq_cq;
1394 	ASSERT(cq != NULL);
1395 
1396 	mutex_enter(&cq->mlcq_mtx);
1397 	mutex_enter(&sq->mlwq_mtx);
1398 	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1399 		mutex_exit(&sq->mlwq_mtx);
1400 		mutex_exit(&cq->mlcq_mtx);
1401 		mutex_exit(&g->mlg_mtx);
1402 		return (B_TRUE);
1403 	}
1404 
1405 	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
1406 	for (i = 0; i < sq->mlwq_nents; ++i) {
1407 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1408 			break;
1409 		mlxcx_buf_return(mlxp, b);
1410 	}
1411 	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
1412 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1413 			break;
1414 		mlxcx_buf_return(mlxp, b);
1415 	}
1416 	for (i = 0; i < sq->mlwq_nents; ++i) {
1417 		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
1418 			break;
1419 		mlxcx_buf_return(mlxp, b);
1420 	}
1421 	sq->mlwq_state |= MLXCX_WQ_BUFFERS;
1422 
1423 	mlxcx_shard_ready(sq->mlwq_bufs);
1424 	mlxcx_shard_ready(sq->mlwq_foreign_bufs);
1425 
1426 	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
1427 		mutex_exit(&sq->mlwq_mtx);
1428 		mutex_exit(&cq->mlcq_mtx);
1429 		mutex_exit(&g->mlg_mtx);
1430 		return (B_FALSE);
1431 	}
1432 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1433 
1434 	(void) mlxcx_sq_add_nop(mlxp, sq);
1435 
1436 	mutex_exit(&sq->mlwq_mtx);
1437 	mutex_exit(&cq->mlcq_mtx);
1438 	mutex_exit(&g->mlg_mtx);
1439 
1440 	return (B_TRUE);
1441 }
1442 
1443 static boolean_t
1444 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
1445 {
1446 	uint_t idx;
1447 	mlxcx_bf_t *bf;
1448 	ddi_fm_error_t err;
1449 	uint_t try = 0;
1450 
1451 	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
1452 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1453 
1454 	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);
1455 
1456 	ASSERT(mlwq->mlwq_cq != NULL);
1457 	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
1458 	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
1459 	bf = &mlwq->mlwq_uar->mlu_bf[idx];
1460 
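	/*
	 * Flush out the doorbell record, then write the first qword of the
	 * WQE to the UAR doorbell register. Each step is retried a bounded
	 * number of times if FMA reports an error.
	 */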
1461 retry:
1462 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1463 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1464 	    DDI_FME_VERSION);
1465 	if (err.fme_status != DDI_FM_OK) {
1466 		if (try++ < mlxcx_doorbell_tries) {
1467 			ddi_fm_dma_err_clear(
1468 			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
1469 			    DDI_FME_VERSION);
1470 			goto retry;
1471 		} else {
1472 			goto err;
1473 		}
1474 	}
1475 
1476 	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
1477 	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
1478 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
1479 	    DDI_FME_VERSION);
1480 	if (err.fme_status == DDI_FM_OK)
1481 		return (B_TRUE);
1482 	if (try++ < mlxcx_doorbell_tries) {
1483 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
1484 		goto retry;
1485 	}
1486 
1487 err:
1488 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1489 	return (B_FALSE);
1490 }
1491 
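/*
 * Post a NOP entry to the send queue and ring the doorbell. This is
 * done when a send queue is first started (see mlxcx_tx_ring_start()).
 */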
1492 boolean_t
1493 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1494 {
1495 	uint_t index, start_pc;
1496 	mlxcx_sendq_ent_t *ent0;
1497 	ddi_fm_error_t err;
1498 
1499 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1500 
1501 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1502 	ent0 = &mlwq->mlwq_send_ent[index];
1503 	start_pc = mlwq->mlwq_pc;
1504 	++mlwq->mlwq_pc;
1505 	/*
1506 	 * This counter is manipulated in the interrupt handler, which
1507 	 * does not hold the mlwq_mtx, hence the atomic.
1508 	 */
1509 	atomic_inc_64(&mlwq->mlwq_wqebb_used);
1510 
1511 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1512 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
1513 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1514 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);
1515 
1516 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1517 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
1518 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1519 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1520 
1521 	ent0->mlsqe_control.mlcs_ds = 1;
1522 
1523 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1524 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1525 	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1526 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1527 	    DDI_FME_VERSION);
1528 	if (err.fme_status != DDI_FM_OK) {
1529 		return (B_FALSE);
1530 	}
1531 	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
1532 		return (B_FALSE);
1533 	}
1534 	return (B_TRUE);
1535 }
1536 
1537 boolean_t
1538 mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1539     uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
1540     mlxcx_buffer_t *b0)
1541 {
1542 	uint_t index, first, ents;
1543 	mlxcx_completion_queue_t *cq;
1544 	mlxcx_sendq_ent_t *ent0;
1545 	mlxcx_sendq_extra_ent_t *ent;
1546 	mlxcx_wqe_data_seg_t *seg;
1547 	uint_t ptri, nptr;
1548 	const ddi_dma_cookie_t *c;
1549 	size_t rem;
1550 	uint64_t wqebb_used;
1551 	mlxcx_buffer_t *b;
1552 	ddi_fm_error_t err;
1553 	boolean_t rv;
1554 
1555 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1556 	ASSERT3P(b0->mlb_tx_head, ==, b0);
1557 	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1558 	cq = mlwq->mlwq_cq;
1559 
1560 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1561 	ent0 = &mlwq->mlwq_send_ent[index];
1562 	b0->mlb_wqe_index = mlwq->mlwq_pc;
1563 	ents = 1;
1564 
1565 	first = index;
1566 
1567 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1568 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
1569 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1570 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);
1571 
1572 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1573 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
1574 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1575 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1576 
1577 	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
1578 	set_bits16(&ent0->mlsqe_eth.mles_szflags,
1579 	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
1580 	if (inlinelen > 0) {
1581 		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
1582 		    inlinelen);
1583 	}
1584 
1585 	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
1586 	    MLXCX_WQE_OCTOWORD;
1587 
1588 	if (chkflags & HCK_IPV4_HDRCKSUM) {
1589 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1590 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1591 		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
1592 	}
1593 	if (chkflags & HCK_FULLCKSUM) {
1594 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1595 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1596 		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
1597 	}
1598 
1599 	/*
1600 	 * mlwq_wqebb_used is only incremented whilst holding
1601 	 * the mlwq_mtx mutex, but it is decremented (atomically) in
1602 	 * the interrupt context *not* under mlwq_mtx mutex.
1603 	 * So take a snapshot now of the number of used WQEBBs; this will
1604 	 * be a consistent maximum we can use whilst iterating through
1605 	 * the buffers and DMA cookies.
1606 	 */
1607 	wqebb_used = mlwq->mlwq_wqebb_used;
1608 
1609 	b = b0;
1610 	ptri = 0;
1611 	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
1612 	seg = ent0->mlsqe_data;
1613 	while (b != NULL) {
1614 		rem = b->mlb_used;
1615 
1616 		c = NULL;
1617 		while (rem > 0 &&
1618 		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
1619 			if (ptri >= nptr) {
1620 				if ((ents + wqebb_used) >= mlwq->mlwq_nents)
1621 					return (B_FALSE);
1622 
1623 				index = (mlwq->mlwq_pc + ents) &
1624 				    (mlwq->mlwq_nents - 1);
1625 				ent = &mlwq->mlwq_send_extra_ent[index];
1626 				++ents;
1627 
1628 				seg = ent->mlsqe_data;
1629 				ptri = 0;
1630 				nptr = sizeof (ent->mlsqe_data) /
1631 				    sizeof (mlxcx_wqe_data_seg_t);
1632 			}
1633 
1634 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1635 			if (c->dmac_size > rem) {
1636 				seg->mlds_byte_count = to_be32(rem);
1637 				rem = 0;
1638 			} else {
1639 				seg->mlds_byte_count = to_be32(c->dmac_size);
1640 				rem -= c->dmac_size;
1641 			}
1642 			seg->mlds_address = to_be64(c->dmac_laddress);
1643 			++seg;
1644 			++ptri;
1645 			++ent0->mlsqe_control.mlcs_ds;
1646 
1647 			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
1648 			    MLXCX_SQE_MAX_DS);
1649 		}
1650 
1651 		if (b == b0) {
1652 			b = list_head(&b0->mlb_tx_chain);
1653 		} else {
1654 			b = list_next(&b0->mlb_tx_chain, b);
1655 		}
1656 	}
1657 
1658 	b0->mlb_wqebbs = ents;
1659 	mlwq->mlwq_pc += ents;
1660 	atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
1661 
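	/*
	 * Fill any unused data segments in the final WQE with the special
	 * null lkey so that hardware ignores them.
	 */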
1662 	for (; ptri < nptr; ++ptri, ++seg) {
1663 		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1664 		seg->mlds_byte_count = to_be32(0);
1665 		seg->mlds_address = to_be64(0);
1666 	}
1667 
1668 	/*
1669 	 * Make sure the workqueue entry is flushed out before updating
1670 	 * the doorbell.
1671 	 * If the ring has wrapped, we need to flush the front and back.
1672 	 */
1673 	if ((first + ents) > mlwq->mlwq_nents) {
1674 		uint_t sync_cnt = mlwq->mlwq_nents - first;
1675 
1676 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1677 		    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1678 		    sync_cnt * sizeof (mlxcx_sendq_ent_t),
1679 		    DDI_DMA_SYNC_FORDEV));
1680 
1681 		ent0 = &mlwq->mlwq_send_ent[0];
1682 		ents -= sync_cnt;
1683 	}
1684 
1685 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1686 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1687 	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1688 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1689 	    DDI_FME_VERSION);
1690 	if (err.fme_status != DDI_FM_OK) {
1691 		return (B_FALSE);
1692 	}
1693 
1694 	/*
1695 	 * Hold the bufmtx whilst ringing the doorbell, to prevent
1696 	 * the buffer from being moved to another list, so we can
1697 	 * safely remove it should the ring fail.
1698 	 */
1699 	mutex_enter(&cq->mlcq_bufbmtx);
1700 
1701 	list_insert_tail(&cq->mlcq_buffers_b, b0);
1702 	if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
1703 		atomic_inc_64(&cq->mlcq_bufcnt);
1704 	} else {
1705 		list_remove(&cq->mlcq_buffers_b, b0);
1706 	}
1707 
1708 	mutex_exit(&cq->mlcq_bufbmtx);
1709 
1710 	return (rv);
1711 }
1712 
1713 boolean_t
1714 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1715     mlxcx_buffer_t *buf)
1716 {
1717 	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
1718 }
1719 
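/*
 * Post a batch of buffers to a receive queue, then update the WQ and
 * CQ doorbells. Entries in bufs are NULLed out as they are consumed;
 * on failure, any which remain still belong to the caller.
 */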
1720 boolean_t
1721 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1722     mlxcx_buffer_t **bufs, size_t nbufs)
1723 {
1724 	uint_t index;
1725 	mlxcx_recvq_ent_t *ent;
1726 	mlxcx_completion_queue_t *cq;
1727 	mlxcx_wqe_data_seg_t *seg;
1728 	uint_t bi, ptri;
1729 	const ddi_dma_cookie_t *c;
1730 	mlxcx_buffer_t *buf;
1731 	ddi_fm_error_t err;
1732 
1733 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1734 	cq = mlwq->mlwq_cq;
1735 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1736 
1737 	for (bi = 0; bi < nbufs; ++bi) {
1738 		buf = bufs[bi];
1739 		bufs[bi] = NULL;
1740 		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1741 
1742 		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1743 		ent = &mlwq->mlwq_recv_ent[index];
1744 		buf->mlb_wqe_index = mlwq->mlwq_pc;
1745 		buf->mlb_wqebbs = 1;
1746 
1747 		++mlwq->mlwq_pc;
1748 		atomic_inc_64(&mlwq->mlwq_wqebb_used);
1749 
1750 		mutex_enter(&cq->mlcq_bufbmtx);
1751 		list_insert_tail(&cq->mlcq_buffers, buf);
1752 		atomic_inc_64(&cq->mlcq_bufcnt);
1753 		mutex_exit(&cq->mlcq_bufbmtx);
1754 
1755 		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
1756 		ptri = 0;
1757 		c = NULL;
1758 		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
1759 			seg = &ent->mlrqe_data[ptri++];
1760 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1761 			seg->mlds_byte_count = to_be32(c->dmac_size);
1762 			seg->mlds_address = to_be64(c->dmac_laddress);
1763 		}
1764 		/*
1765 		 * Fill any unused scatter pointers with the special null
1766 		 * value.
1767 		 */
1768 		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
1769 			seg = &ent->mlrqe_data[ptri];
1770 			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1771 			seg->mlds_byte_count = to_be32(0);
1772 			seg->mlds_address = to_be64(0);
1773 		}
1774 
1775 		/*
1776 		 * Make sure the workqueue entry is flushed out before updating
1777 		 * the doorbell.
1778 		 */
1779 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1780 		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
1781 		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
1782 		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1783 		    DDI_FME_VERSION);
1784 		if (err.fme_status != DDI_FM_OK) {
1785 			return (B_FALSE);
1786 		}
1787 	}
1788 
1789 	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
1790 	/*
1791 	 * Flush the CQ doorbell as well so that HW knows how many
1792 	 * completions we've consumed.
1793 	 */
1794 	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1795 	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
1796 	    DDI_FME_VERSION);
1797 	if (err.fme_status != DDI_FM_OK) {
1798 		return (B_FALSE);
1799 	}
1800 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1801 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1802 	    DDI_FME_VERSION);
1803 	if (err.fme_status != DDI_FM_OK) {
1804 		return (B_FALSE);
1805 	}
1806 	return (B_TRUE);
1807 }
1808 
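/*
 * Taskq worker which waits for free buffers and keeps refilling an RQ
 * until its completion queue holds a healthy number of buffers, or the
 * queue is torn down.
 */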
1809 static void
1810 mlxcx_rq_refill_task(void *arg)
1811 {
1812 	mlxcx_work_queue_t *wq = arg;
1813 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
1814 	mlxcx_t *mlxp = wq->mlwq_mlx;
1815 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
1816 	boolean_t refill, draining;
1817 
1818 	do {
1819 		/*
1820 		 * Wait here until one of three conditions holds:
1821 		 * 1. The shard is draining, or
1822 		 * 2. There are buffers on the free list, or
1823 		 * 3. The WQ is being shut down.
1824 		 */
1825 		mutex_enter(&s->mlbs_mtx);
1826 		while (s->mlbs_state != MLXCX_SHARD_DRAINING &&
1827 		    list_is_empty(&s->mlbs_free) &&
1828 		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) {
1829 			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
1830 		}
1831 
1832 		draining = (s->mlbs_state == MLXCX_SHARD_DRAINING);
1833 		mutex_exit(&s->mlbs_mtx);
1834 
1835 		mutex_enter(&cq->mlcq_mtx);
1836 		mutex_enter(&wq->mlwq_mtx);
1837 
1838 		if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
1839 			refill = B_FALSE;
1840 			wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1841 		} else {
1842 			mlxcx_rq_refill(mlxp, wq);
1843 
1844 			if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
1845 				refill = B_TRUE;
1846 			} else {
1847 				refill = B_FALSE;
1848 				wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1849 			}
1850 		}
1851 
1852 		mutex_exit(&wq->mlwq_mtx);
1853 		mutex_exit(&cq->mlcq_mtx);
1854 	} while (refill);
1855 }
1856 
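/*
 * Top the RQ back up towards its target occupancy of
 * (mlwq_nents - MLXCX_RQ_REFILL_STEP) buffers, posting them in batches
 * of MLXCX_RQ_REFILL_STEP. Does nothing if we are already within one
 * step of the target. Both the CQ and WQ mutexes must be held.
 */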
1857 void
1858 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1859 {
1860 	size_t target, current, want, done, n;
1861 	mlxcx_completion_queue_t *cq;
1862 	mlxcx_ring_group_t *g;
1863 	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
1864 	uint_t i;
1865 
1866 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1867 	cq = mlwq->mlwq_cq;
1868 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1869 
1870 	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);
1871 
1872 	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
1874 
1875 	if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
1876 		return;
1877 
1878 	if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
1879 		return;
1880 
1881 	current = cq->mlcq_bufcnt;
1882 
1883 	if (current >= target - MLXCX_RQ_REFILL_STEP)
1884 		return;
1885 
1886 	want = target - current;
1887 	done = 0;
1888 
1889 	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
1890 		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
1891 		if (n == 0) {
1892 			/*
1893 			 * We didn't get any buffers from the free queue.
1894 			 * This might not be an issue; if the completion
1895 			 * queue is running low, schedule a taskq entry
1896 			 * to wait for free buffers.
1897 			 */
1898 			if (current < MLXCX_RQ_REFILL_STEP &&
1899 			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
1900 				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
1901 				g = mlwq->mlwq_group;
1902 				taskq_dispatch_ent(g->mlg_refill_tq,
1903 				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
1904 				    &mlwq->mlwq_tqe);
1905 			}
1906 
1907 			return;
1908 		}
1909 
1910 		if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
1911 			for (i = 0; i < n; ++i)
1912 				mlxcx_buf_return(mlxp, b[i]);
1913 			return;
1914 		}
1915 		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
1916 			/*
1917 			 * mlxcx_rq_add_buffers() NULLs out the buffers as it
1918 			 * enqueues them, so any that are still non-NULL must
1919 			 * be freed here. The rest now belong to the WQ, even
1920 			 * though the call failed.
1921 			 */
1922 			for (i = 0; i < n; ++i) {
1923 				if (b[i] != NULL) {
1924 					mlxcx_buf_return(mlxp, b[i]);
1925 				}
1926 			}
1927 			return;
1928 		}
1929 		done += n;
1930 	}
1931 }
1932 
1933 static const char *
1934 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
1935 {
1936 	switch (sy) {
1937 	case MLXCX_CQ_ERR_LOCAL_LENGTH:
1938 		return ("LOCAL_LENGTH");
1939 	case MLXCX_CQ_ERR_LOCAL_QP_OP:
1940 		return ("LOCAL_QP_OP");
1941 	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
1942 		return ("LOCAL_PROTECTION");
1943 	case MLXCX_CQ_ERR_WR_FLUSHED:
1944 		return ("WR_FLUSHED");
1945 	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
1946 		return ("MEM_WINDOW_BIND");
1947 	case MLXCX_CQ_ERR_BAD_RESPONSE:
1948 		return ("BAD_RESPONSE");
1949 	case MLXCX_CQ_ERR_LOCAL_ACCESS:
1950 		return ("LOCAL_ACCESS");
1951 	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
1952 		return ("XPORT_RETRY_CTR");
1953 	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
1954 		return ("RNR_RETRY_CTR");
1955 	case MLXCX_CQ_ERR_ABORTED:
1956 		return ("ABORTED");
1957 	default:
1958 		return ("UNKNOWN");
1959 	}
1960 }
1961 
1962 static void
1963 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1964     mlxcx_completionq_error_ent_t *ent)
1965 {
1966 	uint64_t ena;
1967 	char buf[FM_MAX_CLASS];
1968 	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);
1969 
1970 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1971 		return;
1972 
1973 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1974 	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
1975 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1976 
1977 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1978 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1979 	    "syndrome", DATA_TYPE_STRING, name,
1980 	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
1981 	    "vendor_syndrome", DATA_TYPE_UINT8,
1982 	    ent->mlcqee_vendor_error_syndrome,
1983 	    "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
1984 	    "wq_type", DATA_TYPE_STRING,
1985 	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send" : "recv",
1986 	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
1987 	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
1988 	    NULL);
1989 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1990 }
1991 
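/*
 * Process a send-side completion entry. An error completion generates
 * an FMA ereport and a check of the SQ's state in hardware; anything
 * unexpected in a successful completion is logged. In every case the
 * buffer chain is handed back to its shard.
 */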
1992 void
1993 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1994     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1995 {
1996 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1997 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
1998 		mlxcx_completionq_error_ent_t *eent =
1999 		    (mlxcx_completionq_error_ent_t *)ent;
2000 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2001 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2002 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2003 		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
2004 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2005 		return;
2006 	}
2007 
2008 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
2009 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2010 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2011 		return;
2012 	}
2013 
2014 	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
2015 		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
2016 		    ent->mlcqe_send_wqe_opcode);
2017 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2018 		return;
2019 	}
2020 
2021 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2022 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2023 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2024 		return;
2025 	}
2026 
2027 	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2028 }
2029 
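/*
 * Process a receive-side completion entry. On success the buffer is
 * loaned out to MAC as an mblk, with checksum flags set when the
 * hardware verified them, and every 8th WQE index we also consider
 * refilling the RQ. On any error the buffer goes straight back to its
 * shard and NULL is returned.
 */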
2030 mblk_t *
2031 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
2032     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
2033 {
2034 	uint32_t chkflags = 0;
2035 	uint_t wqe_index;
2036 	ddi_fm_error_t err;
2037 
2038 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
2039 
2040 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
2041 		mlxcx_completionq_error_ent_t *eent =
2042 		    (mlxcx_completionq_error_ent_t *)ent;
2043 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2044 		mlxcx_buf_return(mlxp, buf);
2045 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2046 		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
2047 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2048 		return (NULL);
2049 	}
2050 
2051 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
2052 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2053 		mlxcx_buf_return(mlxp, buf);
2054 		return (NULL);
2055 	}
2056 
2057 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2058 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2059 		mlxcx_buf_return(mlxp, buf);
2060 		return (NULL);
2061 	}
2062 
2063 	if (ent->mlcqe_rx_drop_counter > 0) {
2064 		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
2065 		    ent->mlcqe_rx_drop_counter);
2066 	}
2067 
2068 	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
2069 	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
2070 	    DDI_FME_VERSION);
2071 	if (err.fme_status != DDI_FM_OK) {
2072 		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
2073 		    DDI_FME_VERSION);
2074 		mlxcx_buf_return(mlxp, buf);
2075 		return (NULL);
2076 	}
2077 
2078 	/*
2079 	 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
2080 	 * Save it now; it is used below to pace the refill checks.
2081 	 */
2082 	wqe_index = buf->mlb_wqe_index;
2083 
2084 	if (!mlxcx_buf_loan(mlxp, buf)) {
2085 		mlxcx_buf_return(mlxp, buf);
2086 		return (NULL);
2087 	}
2088 
2089 	buf->mlb_mp->b_next = NULL;
2090 	buf->mlb_mp->b_cont = NULL;
2091 	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
2092 	    from_be32(ent->mlcqe_byte_cnt);
2093 
2094 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
2095 		chkflags |= HCK_FULLCKSUM_OK;
2096 	}
2097 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
2098 		chkflags |= HCK_IPV4_HDRCKSUM_OK;
2099 	}
2100 	if (chkflags != 0) {
2101 		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
2102 		    from_be16(ent->mlcqe_checksum), chkflags);
2103 	}
2104 
2105 	/*
2106 	 * Don't check if a refill is needed on every single completion,
2107 	 * since checking involves taking the RQ lock.
2108 	 */
2109 	if ((wqe_index & 0x7) == 0) {
2110 		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
2111 		ASSERT(wq != NULL);
2112 		mutex_enter(&wq->mlwq_mtx);
2113 		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
2114 			mlxcx_rq_refill(mlxp, wq);
2115 		mutex_exit(&wq->mlwq_mtx);
2116 	}
2117 
2118 	return (buf->mlb_mp);
2119 }
2120 
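/*
 * Free function passed to desballoc(): runs when the mblk wrapping a
 * buffer is freed. If the buffer is out on loan, it can now go back to
 * its shard; otherwise (e.g. freeb() from mlxcx_buf_destroy()) the
 * buffer is already being dealt with and we only clear the pointer.
 */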
2121 static void
2122 mlxcx_buf_mp_return(caddr_t arg)
2123 {
2124 	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
2125 	mlxcx_t *mlxp = b->mlb_mlx;
2126 
2127 	/* The mblk has been used now, so NULL it out. */
2128 	b->mlb_mp = NULL;
2129 
2130 	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
2131 		mlxcx_buf_return(mlxp, b);
2132 }
2133 
2134 boolean_t
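/*
 * Create a normal, driver-owned buffer: DMA memory sized to the port
 * MTU is allocated up front and wrapped in an mblk with desballoc(),
 * so the buffer can be loaned to MAC and recycled without copying.
 */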
2135 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
2136 {
2137 	mlxcx_buffer_t *b;
2138 	ddi_device_acc_attr_t acc;
2139 	ddi_dma_attr_t attr;
2140 	boolean_t ret;
2141 
2142 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2143 	b->mlb_shard = shard;
2144 	b->mlb_foreign = B_FALSE;
2145 
2146 	mlxcx_dma_acc_attr(mlxp, &acc);
2147 	mlxcx_dma_buf_attr(mlxp, &attr);
2148 
2149 	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
2150 	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
2151 	if (!ret) {
2152 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2153 		return (B_FALSE);
2154 	}
2155 
2156 	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
2157 	b->mlb_frtn.free_arg = (caddr_t)b;
2158 	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2159 	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2160 
2161 	*bp = b;
2162 
2163 	return (B_TRUE);
2164 }
2165 
2166 boolean_t
2167 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
2168     mlxcx_buffer_t **bp)
2169 {
2170 	mlxcx_buffer_t *b;
2171 	ddi_dma_attr_t attr;
2172 	boolean_t ret;
2173 
2174 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2175 	b->mlb_shard = shard;
2176 	b->mlb_foreign = B_TRUE;
2177 
2178 	mlxcx_dma_buf_attr(mlxp, &attr);
2179 
2180 	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
2181 	if (!ret) {
2182 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2183 		return (B_FALSE);
2184 	}
2185 
2186 	*bp = b;
2187 
2188 	return (B_TRUE);
2189 }
2190 
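/*
 * Take a foreign buffer from the WQ's foreign shard. Foreign buffers
 * own no data memory of their own; they exist to be bound to
 * caller-supplied mblks for zero-copy TX.
 */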
2191 static mlxcx_buffer_t *
2192 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2193 {
2194 	mlxcx_buffer_t *b;
2195 	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
2196 
2197 	mutex_enter(&s->mlbs_mtx);
2198 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2199 		mutex_exit(&s->mlbs_mtx);
2200 		return (NULL);
2201 	}
2202 
2203 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2204 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2205 		ASSERT(b->mlb_foreign);
2206 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2207 		list_insert_tail(&s->mlbs_busy, b);
2208 	}
2209 	mutex_exit(&s->mlbs_mtx);
2210 
2211 	return (b);
2212 }
2213 
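/*
 * Copy sz bytes from rptr into a freshly taken buffer and sync it for
 * the device. If the sync reports an FM error, throw the buffer back
 * and retry with another, giving up after MLXCX_BUF_BIND_MAX_ATTEMTPS
 * tries.
 */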
2214 static mlxcx_buffer_t *
2215 mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
2216 {
2217 	ddi_fm_error_t err;
2218 	mlxcx_buffer_t *b;
2219 	uint_t attempts = 0;
2220 
2221 copyb:
2222 	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
2223 		return (NULL);
2224 
2225 	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
2226 	bcopy(rptr, b->mlb_dma.mxdb_va, sz);
2227 
2228 	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
2229 
2230 	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
2231 	    DDI_FME_VERSION);
2232 	if (err.fme_status != DDI_FM_OK) {
2233 		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
2234 		    DDI_FME_VERSION);
2235 		mlxcx_buf_return(mlxp, b);
2236 		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
2237 			return (NULL);
2238 		}
2239 		goto copyb;
2240 	}
2241 
2242 	return (b);
2243 }
2244 
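/*
 * Prepare a single mblk for transmit. Fragments below the
 * mldp_tx_bind_threshold tunable are copied into a driver buffer,
 * since copying is cheaper than a DMA bind at small sizes; larger
 * fragments are bound directly, falling back to a copy if the bind
 * fails.
 */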
2245 static mlxcx_buffer_t *
2246 mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2247     mblk_t *mp, size_t off)
2248 {
2249 	mlxcx_buffer_t *b;
2250 	uint8_t *rptr;
2251 	size_t sz;
2252 	boolean_t ret;
2253 
2254 	rptr = mp->b_rptr;
2255 	sz = MBLKL(mp);
2256 
2257 #ifdef DEBUG
2258 	if (off > 0) {
2259 		ASSERT3U(off, <, sz);
2260 	}
2261 #endif
2262 
2263 	rptr += off;
2264 	sz -= off;
2265 
2266 	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
2267 		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2268 	} else {
2269 		b = mlxcx_buf_take_foreign(mlxp, wq);
2270 		if (b == NULL)
2271 			return (NULL);
2272 
2273 		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
2274 		    B_FALSE);
2275 
2276 		if (!ret) {
2277 			mlxcx_buf_return(mlxp, b);
2278 
2279 			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2280 		}
2281 	}
2282 
2283 	return (b);
2284 }
2285 
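/*
 * Build a chain of TX buffers for an mblk chain, storing the head in
 * *bp and returning the number of buffers used (0 on failure). Each
 * mblk is either bound or copied via mlxcx_bind_or_copy_mblk(). If the
 * whole chain would need more cookies than one SQE can carry
 * (MLXCX_SQE_MAX_PTRS), the message is pulled up into a single mblk
 * and we try again with that.
 *
 * A TX path caller would use this along the lines of (a sketch only):
 *
 *	mlxcx_buffer_t *b;
 *	uint_t n = mlxcx_buf_bind_or_copy(mlxp, wq, mp, 0, &b);
 *	if (n == 0)
 *		... out of buffers, tell MAC to retry later ...
 */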
2286 uint_t
2287 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2288     mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
2289 {
2290 	mlxcx_buffer_t *b, *b0 = NULL;
2291 	boolean_t first = B_TRUE;
2292 	mblk_t *mp;
2293 	size_t offset = off;
2294 	size_t ncookies = 0;
2295 	uint_t count = 0;
2296 
2297 	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
2298 	    mp = mp->b_cont) {
2299 		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
2300 		if (b == NULL)
2301 			goto failed;
2302 
2303 		ncookies += b->mlb_dma.mxdb_ncookies;
2304 
2305 		if (first)
2306 			b0 = b;
2307 
2308 		if (!first)
2309 			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;
2310 
2311 		b->mlb_tx_mp = mp;
2312 		b->mlb_tx_head = b0;
2313 		b->mlb_used = MBLKL(mp) - offset;
2314 
2315 		if (!first)
2316 			list_insert_tail(&b0->mlb_tx_chain, b);
2317 		first = B_FALSE;
2318 		offset = 0;
2319 
2320 		count++;
2321 	}
2322 
2323 	/*
2324 	 * The chain of mblks has resulted in too many cookies for
2325 	 * a single message. This is unusual, so take the hit: tidy up
2326 	 * what we have built so far, pull the chain up into a single
2327 	 * mblk, and allocate the requisite buffer for that instead.
2328 	 */
2329 	if (ncookies > MLXCX_SQE_MAX_PTRS) {
2330 		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
2331 		    mblk_t *, mpb, size_t, ncookies);
2332 
2333 		if (b0 != NULL)
2334 			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2335 
2336 		if ((mp = msgpullup(mpb, -1)) == NULL)
2337 			return (0);
2338 
2339 		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
2340 		if (b0 == NULL) {
2341 			freemsg(mp);
2342 			return (0);
2343 		}
2344 		freemsg(mpb);
2345 
2346 		b0->mlb_tx_mp = mp;
2347 		b0->mlb_tx_head = b0;
2348 		b0->mlb_used = MBLKL(mp) - off;
2349 
2350 		count = 1;
2351 	}
2352 
2353 	*bp = b0;
2354 
2355 	return (count);
2356 
2357 failed:
2358 	if (b0 != NULL)
2359 		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2360 
2361 	return (0);
2362 }
2363 
2364 mlxcx_buffer_t *
2365 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2366 {
2367 	mlxcx_buffer_t *b;
2368 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
2369 
2370 	mutex_enter(&s->mlbs_mtx);
2371 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2372 		mutex_exit(&s->mlbs_mtx);
2373 		return (NULL);
2374 	}
2375 
2376 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2377 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2378 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2379 		list_insert_tail(&s->mlbs_busy, b);
2380 	}
2381 	mutex_exit(&s->mlbs_mtx);
2382 
2383 	return (b);
2384 }
2385 
2386 size_t
2387 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp,
2388     size_t nbufs)
2389 {
2390 	mlxcx_buffer_t *b;
2391 	size_t done = 0;
2392 	mlxcx_buf_shard_t *s;
2393 
2394 	s = wq->mlwq_bufs;
2395 
2396 	mutex_enter(&s->mlbs_mtx);
2397 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2398 		mutex_exit(&s->mlbs_mtx);
2399 		return (0);
2400 	}
2401 
2402 	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
2403 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2404 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2405 		list_insert_tail(&s->mlbs_busy, b);
2406 		bp[done++] = b;
2407 	}
2408 	mutex_exit(&s->mlbs_mtx);
2409 	return (done);
2410 }
2411 
2412 boolean_t
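/*
 * Loan a buffer out to MAC, moving it to the shard's loaned list. The
 * buffer's mblk may have been consumed by a previous loan (the free
 * callback NULLs it out), in which case we must allocate a fresh one
 * here; if that fails, so does the loan.
 */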
2413 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2414 {
2415 	mlxcx_buf_shard_t *s = b->mlb_shard;
2416 
2417 	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
2418 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2419 
2420 	if (b->mlb_mp == NULL) {
2421 		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2422 		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2423 		if (b->mlb_mp == NULL)
2424 			return (B_FALSE);
2425 	}
2426 
2427 	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
2428 	b->mlb_wqe_index = 0;
2429 
2430 	mutex_enter(&s->mlbs_mtx);
2431 	list_remove(&s->mlbs_busy, b);
2432 	list_insert_tail(&s->mlbs_loaned, b);
2433 	mutex_exit(&s->mlbs_mtx);
2434 
2435 	return (B_TRUE);
2436 }
2437 
2438 void
2439 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
2440 {
2441 	mlxcx_buffer_t *b;
2442 
2443 	if (b0->mlb_tx_head != b0) {
2444 		mlxcx_buf_return(mlxp, b0);
2445 		return;
2446 	}
2447 
2448 	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
2449 		mlxcx_buf_return(mlxp, b);
2450 	}
2451 	if (keepmp) {
2452 		b0->mlb_tx_mp = NULL;
2453 		b0->mlb_tx_head = NULL;
2454 	}
2455 	mlxcx_buf_return(mlxp, b0);
2456 }
2457 
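/*
 * Return a buffer to its shard's free list, undoing whatever state it
 * was in (on a WQ, on loan, or on a TX chain). Note that the tx head's
 * mblk is freed only after we drop mlbs_mtx: freemsg() can re-enter
 * this function through the desballoc() free callback.
 */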
2458 void
2459 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2460 {
2461 	mlxcx_buffer_state_t oldstate = b->mlb_state;
2462 	mlxcx_buffer_t *txhead = b->mlb_tx_head;
2463 	mlxcx_buf_shard_t *s = b->mlb_shard;
2464 	mblk_t *mp = b->mlb_tx_mp;
2465 
2466 	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
2467 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2468 
2469 	/*
2470 	 * The mlbs_mtx taken below is a heavily contended lock, so it is
2471 	 * imperative that we do as much of the buffer cleanup as possible
2472 	 * outside the lock.
2473 	 */
2474 	b->mlb_state = MLXCX_BUFFER_FREE;
2475 	b->mlb_wqe_index = 0;
2476 	b->mlb_tx_head = NULL;
2477 	b->mlb_tx_mp = NULL;
2478 	b->mlb_used = 0;
2479 	b->mlb_wqebbs = 0;
2480 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2481 
2482 	if (b->mlb_foreign) {
2483 		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
2484 			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
2485 		}
2486 	}
2487 
2488 	mutex_enter(&s->mlbs_mtx);
2489 	switch (oldstate) {
2490 	case MLXCX_BUFFER_INIT:
2491 		break;
2492 	case MLXCX_BUFFER_ON_WQ:
2493 		list_remove(&s->mlbs_busy, b);
2494 		break;
2495 	case MLXCX_BUFFER_ON_LOAN:
2496 		ASSERT(!b->mlb_foreign);
2497 		list_remove(&s->mlbs_loaned, b);
2498 		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
2499 			/*
2500 			 * When we're draining, e.g. during mac_stop(),
2501 			 * we destroy the buffer immediately rather than
2502 			 * recycling it; otherwise we risk leaving it
2503 			 * on the free list and leaking it.
2504 			 */
2505 			list_insert_tail(&s->mlbs_free, b);
2506 			mlxcx_buf_destroy(mlxp, b);
2507 			/*
2508 			 * Teardown might be waiting for the loaned list to empty.
2509 			 */
2510 			cv_broadcast(&s->mlbs_free_nonempty);
2511 			mutex_exit(&s->mlbs_mtx);
2512 			return;
2513 		}
2514 		break;
2515 	case MLXCX_BUFFER_FREE:
2516 		VERIFY(0);
2517 		break;
2518 	case MLXCX_BUFFER_ON_CHAIN:
2519 		ASSERT(txhead != NULL);
2520 		list_remove(&txhead->mlb_tx_chain, b);
2521 		list_remove(&s->mlbs_busy, b);
2522 		break;
2523 	}
2524 
2525 	list_insert_tail(&s->mlbs_free, b);
2526 	cv_broadcast(&s->mlbs_free_nonempty);
2527 
2528 	mutex_exit(&s->mlbs_mtx);
2529 
2530 	/*
2531 	 * For TX chain heads, free the mblk_t after we let go of the lock.
2532 	 * This might be a borrowed buf that we in turn loaned to MAC, in which
2533 	 * case calling freemsg() on it will re-enter this very function -- so
2534 	 * we better not be holding the lock!
2535 	 */
2536 	if (txhead == b)
2537 		freemsg(mp);
2538 }
2539 
2540 void
2541 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2542 {
2543 	mlxcx_buf_shard_t *s = b->mlb_shard;
2544 
2545 	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
2546 	    b->mlb_state == MLXCX_BUFFER_INIT);
2547 	ASSERT(mutex_owned(&s->mlbs_mtx));
2548 
2549 	if (b->mlb_state == MLXCX_BUFFER_FREE)
2550 		list_remove(&s->mlbs_free, b);
2551 
2552 	/*
2553 	 * This is going back to the kmem cache, so it needs to be set up in
2554 	 * the same way we expect a new buffer to come out (state INIT, other
2555 	 * fields NULL'd).
2556 	 */
2557 	b->mlb_state = MLXCX_BUFFER_INIT;
2558 	b->mlb_shard = NULL;
2559 	if (b->mlb_mp != NULL) {
2560 		freeb(b->mlb_mp);
2561 		ASSERT(b->mlb_mp == NULL);
2562 	}
2563 	mlxcx_dma_free(&b->mlb_dma);
2564 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2565 
2566 	kmem_cache_free(mlxp->mlx_bufs_cache, b);
2567 }
2568 
2569 void
2570 mlxcx_shard_ready(mlxcx_buf_shard_t *s)
2571 {
2572 	mutex_enter(&s->mlbs_mtx);
2573 	s->mlbs_state = MLXCX_SHARD_READY;
2574 	mutex_exit(&s->mlbs_mtx);
2575 }
2576 
2577 void
2578 mlxcx_shard_draining(mlxcx_buf_shard_t *s)
2579 {
2580 	mutex_enter(&s->mlbs_mtx);
2581 	s->mlbs_state = MLXCX_SHARD_DRAINING;
2582 	cv_broadcast(&s->mlbs_free_nonempty);
2583 	mutex_exit(&s->mlbs_mtx);
2584 }
2585