1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  */
16 
17 /*
18  * Mellanox ConnectX-4/5/6 driver.
19  */
20 
21 #include <sys/modctl.h>
22 #include <sys/conf.h>
23 #include <sys/devops.h>
24 #include <sys/sysmacros.h>
25 #include <sys/atomic.h>
26 #include <sys/cpuvar.h>
27 
28 #include <sys/pattr.h>
29 #include <sys/dlpi.h>
30 
31 #include <sys/mac_provider.h>
32 
33 #include <sys/random.h>
34 
35 #include <mlxcx.h>
36 
37 boolean_t
38 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
39 {
40 	ddi_device_acc_attr_t acc;
41 	ddi_dma_attr_t attr;
42 	boolean_t ret;
43 	size_t sz;
44 
45 	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
46 
47 	/* Receive and send queue entries might be different sizes. */
48 	switch (mlwq->mlwq_type) {
49 	case MLXCX_WQ_TYPE_SENDQ:
50 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
51 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
52 		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
53 		break;
54 	case MLXCX_WQ_TYPE_RECVQ:
55 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
56 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
57 		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
58 		break;
59 	default:
60 		VERIFY(0);
61 		return (B_FALSE);
62 	}
63 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
64 
65 	mlxcx_dma_acc_attr(mlxp, &acc);
66 	mlxcx_dma_queue_attr(mlxp, &attr);
67 
68 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
69 	    B_TRUE, sz, B_TRUE);
70 	if (!ret) {
71 		mlxcx_warn(mlxp, "failed to allocate WQ memory");
72 		return (B_FALSE);
73 	}
74 
75 	/*
76 	 * Just set the first pointer in the union. Yes, this is a strict
77 	 * aliasing violation. No, I don't care.
78 	 */
79 	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
80 
81 	mlxcx_dma_acc_attr(mlxp, &acc);
82 	mlxcx_dma_qdbell_attr(mlxp, &attr);
83 	sz = sizeof (mlxcx_workq_doorbell_t);
84 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
85 	    B_TRUE, sz, B_TRUE);
86 	if (!ret) {
87 		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
88 		mlxcx_dma_free(&mlwq->mlwq_dma);
89 		mlwq->mlwq_send_ent = NULL;
90 		return (B_FALSE);
91 	}
92 
93 	mlwq->mlwq_doorbell =
94 	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;
95 
96 	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;
97 
98 	return (B_TRUE);
99 }
100 
101 void
102 mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
103 {
104 	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
105 	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
106 		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);
107 
108 	mlxcx_dma_free(&mlwq->mlwq_dma);
109 	mlwq->mlwq_send_ent = NULL;
110 	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
111 	mlwq->mlwq_doorbell = NULL;
112 
113 	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
114 }
115 
116 boolean_t
117 mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
118 {
119 	ddi_device_acc_attr_t acc;
120 	ddi_dma_attr_t attr;
121 	boolean_t ret;
122 	size_t sz, i;
123 
124 	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
125 
126 	mlcq->mlcq_entshift = mlxp->mlx_props.mldp_cq_size_shift;
127 	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
128 	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
129 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
130 
131 	mlxcx_dma_acc_attr(mlxp, &acc);
132 	mlxcx_dma_queue_attr(mlxp, &attr);
133 
134 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
135 	    B_TRUE, sz, B_TRUE);
136 	if (!ret) {
137 		mlxcx_warn(mlxp, "failed to allocate CQ memory");
138 		return (B_FALSE);
139 	}
140 
141 	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;
142 
143 	for (i = 0; i < mlcq->mlcq_nents; ++i) {
144 		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
145 		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
146 	}
147 
148 	mlxcx_dma_acc_attr(mlxp, &acc);
149 	mlxcx_dma_qdbell_attr(mlxp, &attr);
150 	sz = sizeof (mlxcx_completionq_doorbell_t);
151 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
152 	    B_TRUE, sz, B_TRUE);
153 	if (!ret) {
154 		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
155 		mlxcx_dma_free(&mlcq->mlcq_dma);
156 		mlcq->mlcq_ent = NULL;
157 		return (B_FALSE);
158 	}
159 
160 	mlcq->mlcq_doorbell =
161 	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;
162 
163 	mlcq->mlcq_state |= MLXCX_CQ_ALLOC;
164 
165 	return (B_TRUE);
166 }
167 
168 void
169 mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
170 {
171 	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
172 	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
173 		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
174 
175 	mlxcx_dma_free(&mlcq->mlcq_dma);
176 	mlcq->mlcq_ent = NULL;
177 	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
178 	mlcq->mlcq_doorbell = NULL;
179 
180 	mlcq->mlcq_state &= ~MLXCX_CQ_ALLOC;
181 }
182 
183 void
184 mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
185 {
186 	mlxcx_completion_queue_t *mlcq;
187 
188 	/*
189 	 * If something is holding the queue lock for a long operation, like
190 	 * a refill, setting this flag asks it to exit early if possible.
191 	 */
192 	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);
193 
194 	mutex_enter(&mlwq->mlwq_mtx);
195 
196 	list_remove(&mlxp->mlx_wqs, mlwq);
197 
198 	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
199 	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
200 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
201 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
202 		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
203 			mlxcx_warn(mlxp, "failed to stop "
204 			    "recv queue num %x", mlwq->mlwq_num);
205 		}
206 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
207 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
208 		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
209 			mlxcx_warn(mlxp, "failed to stop "
210 			    "send queue num %x", mlwq->mlwq_num);
211 		}
212 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
213 		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
214 			mlxcx_warn(mlxp, "failed to destroy "
215 			    "recv queue num %x", mlwq->mlwq_num);
216 		}
217 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
218 		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
219 			mlxcx_warn(mlxp, "failed to destroy "
220 			    "send queue num %x", mlwq->mlwq_num);
221 		}
222 	}
223 	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
224 		mlxcx_wq_rele_dma(mlxp, mlwq);
225 	}
226 	mlcq = mlwq->mlwq_cq;
227 
228 	/* These will be released by mlxcx_teardown_bufs() */
229 	mlwq->mlwq_bufs = NULL;
230 	mlwq->mlwq_foreign_bufs = NULL;
231 
232 	mutex_exit(&mlwq->mlwq_mtx);
233 
234 	mutex_enter(&mlcq->mlcq_mtx);
235 	mutex_enter(&mlwq->mlwq_mtx);
236 	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
237 	mlcq->mlcq_wq = NULL;
238 	mutex_exit(&mlwq->mlwq_mtx);
239 	mutex_exit(&mlcq->mlcq_mtx);
240 
241 	mutex_destroy(&mlwq->mlwq_mtx);
242 }
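
/*
 * Illustrative sketch (not part of the driver): how a long-running
 * operation that holds mlwq_mtx is expected to cooperate with the
 * MLXCX_WQ_TEARDOWN flag set above. mlxcx_rq_refill() further down
 * follows this pattern; the loop body here is a placeholder.
 */
static void
example_interruptible_op(mlxcx_work_queue_t *mlwq)
{
	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN)) {
		/* Do one bounded unit of work, then re-check the flag. */
		break;		/* placeholder body for this sketch */
	}
}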
243 
244 void
245 mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
246 {
247 	mlxcx_event_queue_t *mleq;
248 	mlxcx_buffer_t *b;
249 
250 	/*
251 	 * If something is holding the queue lock for a long operation, like
252 	 * polling (which we're going to abort anyway), this flag asks it to
253 	 * exit early if possible.
254 	 */
255 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);
256 
257 	mutex_enter(&mlcq->mlcq_mtx);
258 
259 	list_remove(&mlxp->mlx_cqs, mlcq);
260 
261 	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
262 	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
263 		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
264 			mlxcx_warn(mlxp, "failed to destroy "
265 			    "completion queue num %u",
266 			    mlcq->mlcq_num);
267 		}
268 	}
269 	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
270 		mlxcx_cq_rele_dma(mlxp, mlcq);
271 	}
272 	/*
273 	 * If we're on an EQ AVL tree, then we need to grab
274 	 * the EQ's mutex to take it off. The ISR always takes
275 	 * EQ mutex before CQ mutex, so we have to let go of
276 	 * the CQ mutex then come back again.
277 	 *
278 	 * The ISR will bail out if it tries to touch this CQ now, since
279 	 * we set the CQ_DESTROYED flag above.
280 	 */
281 	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
282 		mleq = mlcq->mlcq_eq;
283 	} else {
284 		mleq = NULL;
285 	}
286 
287 	/* Return any outstanding buffers to the free pool. */
288 	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
289 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
290 	}
291 	mutex_enter(&mlcq->mlcq_bufbmtx);
292 	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
293 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
294 	}
295 	mutex_exit(&mlcq->mlcq_bufbmtx);
296 
297 	/*
298 	 * Since the interrupt handlers take the EQ lock before the CQ one,
299 	 * we must do the same here. That means letting go of the CQ lock
300 	 * for a brief window (we'll double-check the state when we
301 	 * get back in).
302 	 */
303 	mutex_exit(&mlcq->mlcq_mtx);
304 
305 	if (mleq != NULL) {
306 		mutex_enter(&mleq->mleq_mtx);
307 		mutex_enter(&mlcq->mlcq_mtx);
308 		/*
309 		 * Double-check the state, since we let go of the
310 		 * mutex briefly.
311 		 */
312 		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
313 			avl_remove(&mleq->mleq_cqs, mlcq);
314 			mlcq->mlcq_state &= ~MLXCX_CQ_EQAVL;
315 		}
316 		mutex_exit(&mlcq->mlcq_mtx);
317 		mutex_exit(&mleq->mleq_mtx);
318 	}
319 
320 	mutex_enter(&mlcq->mlcq_mtx);
321 	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
322 	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
323 	mutex_exit(&mlcq->mlcq_mtx);
324 
325 	mutex_destroy(&mlcq->mlcq_mtx);
326 	mutex_destroy(&mlcq->mlcq_bufbmtx);
327 	list_destroy(&mlcq->mlcq_buffers);
328 	list_destroy(&mlcq->mlcq_buffers_b);
329 	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
330 }
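
/*
 * Illustrative sketch (not part of the driver) of the lock-ordering rule
 * used above: interrupt handlers take the EQ mutex before the CQ mutex,
 * so any path that holds only mlcq_mtx and needs mleq_mtx must drop the
 * CQ lock, take both in EQ-then-CQ order, and re-validate any CQ state
 * that may have changed in the window.
 */
static void
example_eq_then_cq(mlxcx_event_queue_t *mleq, mlxcx_completion_queue_t *mlcq)
{
	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_enter(&mleq->mleq_mtx);
	mutex_enter(&mlcq->mlcq_mtx);
	/* Re-check mlcq_state flags here before acting on the CQ. */
	mutex_exit(&mlcq->mlcq_mtx);
	mutex_exit(&mleq->mleq_mtx);
}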
331 
332 static boolean_t
333 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
334     mlxcx_completion_queue_t **cqp)
335 {
336 	mlxcx_completion_queue_t *cq;
337 
338 	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
339 	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
340 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
341 	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
342 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
343 	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
344 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
345 	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
346 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
347 
348 	cq->mlcq_mlx = mlxp;
349 	list_insert_tail(&mlxp->mlx_cqs, cq);
350 
351 	mutex_enter(&cq->mlcq_mtx);
352 
353 	if (!mlxcx_cq_alloc_dma(mlxp, cq)) {
354 		mutex_exit(&cq->mlcq_mtx);
355 		return (B_FALSE);
356 	}
357 
358 	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
359 	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;
360 
361 	cq->mlcq_uar = &mlxp->mlx_uar;
362 	cq->mlcq_eq = eq;
363 
364 	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
365 	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;
366 
367 	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
368 		mutex_exit(&cq->mlcq_mtx);
369 		return (B_FALSE);
370 	}
371 
372 	mutex_exit(&cq->mlcq_mtx);
373 
374 	mutex_enter(&eq->mleq_mtx);
375 	mutex_enter(&cq->mlcq_mtx);
376 	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
377 	avl_add(&eq->mleq_cqs, cq);
378 	cq->mlcq_state |= MLXCX_CQ_EQAVL;
379 	mlxcx_arm_cq(mlxp, cq);
380 	mutex_exit(&cq->mlcq_mtx);
381 	mutex_exit(&eq->mleq_mtx);
382 
383 	*cqp = cq;
384 	return (B_TRUE);
385 }
386 
387 static boolean_t
388 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
389     mlxcx_work_queue_t *wq)
390 {
391 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
392 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
393 
394 	list_insert_tail(&mlxp->mlx_wqs, wq);
395 
396 	mutex_enter(&wq->mlwq_mtx);
397 
398 	wq->mlwq_mlx = mlxp;
399 	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
400 	wq->mlwq_cq = cq;
401 	wq->mlwq_pd = &mlxp->mlx_pd;
402 	wq->mlwq_uar = &mlxp->mlx_uar;
403 
404 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
405 
406 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
407 		mutex_exit(&wq->mlwq_mtx);
408 		return (B_FALSE);
409 	}
410 
411 	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
412 		mutex_exit(&wq->mlwq_mtx);
413 		return (B_FALSE);
414 	}
415 
416 	mutex_exit(&wq->mlwq_mtx);
417 
418 	mutex_enter(&cq->mlcq_mtx);
419 	mutex_enter(&wq->mlwq_mtx);
420 	ASSERT3P(cq->mlcq_wq, ==, NULL);
421 	cq->mlcq_wq = wq;
422 	mutex_exit(&wq->mlwq_mtx);
423 	mutex_exit(&cq->mlcq_mtx);
424 
425 	return (B_TRUE);
426 }
427 
428 static boolean_t
429 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
430     mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
431 {
432 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
433 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
434 
435 	list_insert_tail(&mlxp->mlx_wqs, wq);
436 
437 	mutex_enter(&wq->mlwq_mtx);
438 
439 	wq->mlwq_mlx = mlxp;
440 	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
441 	wq->mlwq_cq = cq;
442 	wq->mlwq_pd = &mlxp->mlx_pd;
443 	wq->mlwq_uar = &mlxp->mlx_uar;
444 	wq->mlwq_tis = tis;
445 
446 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
447 	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);
448 
449 	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
450 	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;
451 
452 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
453 		mutex_exit(&wq->mlwq_mtx);
454 		return (B_FALSE);
455 	}
456 
457 	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
458 		mutex_exit(&wq->mlwq_mtx);
459 		return (B_FALSE);
460 	}
461 
462 	mutex_exit(&wq->mlwq_mtx);
463 
464 	mutex_enter(&cq->mlcq_mtx);
465 	mutex_enter(&wq->mlwq_mtx);
466 	ASSERT3P(cq->mlcq_wq, ==, NULL);
467 	cq->mlcq_wq = wq;
468 	mutex_exit(&wq->mlwq_mtx);
469 	mutex_exit(&cq->mlcq_mtx);
470 
471 	return (B_TRUE);
472 }
473 
474 void
475 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
476 {
477 	mlxcx_work_queue_t *wq;
478 	mlxcx_completion_queue_t *cq;
479 	mlxcx_flow_entry_t *fe;
480 	mlxcx_flow_group_t *fg;
481 	mlxcx_flow_table_t *ft;
482 	uint_t i;
483 
484 	mutex_enter(&g->mlg_port->mlp_mtx);
485 	mutex_enter(&g->mlg_mtx);
486 
487 	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
488 		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);
489 
490 		if (g->mlg_rx_vlan_ft != NULL)
491 			mlxcx_remove_all_vlan_entries(mlxp, g);
492 
493 		if (g == &mlxp->mlx_rx_groups[0]) {
494 			ft = g->mlg_port->mlp_rx_flow;
495 			mutex_enter(&ft->mlft_mtx);
496 
497 			fg = g->mlg_port->mlp_bcast;
498 			fe = list_head(&fg->mlfg_entries);
499 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
500 				(void) mlxcx_cmd_delete_flow_table_entry(
501 				    mlxp, fe);
502 			}
503 
504 			fg = g->mlg_port->mlp_promisc;
505 			fe = list_head(&fg->mlfg_entries);
506 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
507 				(void) mlxcx_cmd_delete_flow_table_entry(
508 				    mlxp, fe);
509 			}
510 
511 			mutex_exit(&ft->mlft_mtx);
512 		}
513 
514 		if (g->mlg_rx_vlan_ft != NULL) {
515 			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
516 			ASSERT(list_is_empty(&g->mlg_rx_vlans));
517 			fg = g->mlg_rx_vlan_def_fg;
518 			fe = list_head(&fg->mlfg_entries);
519 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
520 				(void) mlxcx_cmd_delete_flow_table_entry(
521 				    mlxp, fe);
522 			}
523 			fg = g->mlg_rx_vlan_promisc_fg;
524 			fe = list_head(&fg->mlfg_entries);
525 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
526 				(void) mlxcx_cmd_delete_flow_table_entry(
527 				    mlxp, fe);
528 			}
529 			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
530 			list_destroy(&g->mlg_rx_vlans);
531 
532 			g->mlg_rx_vlan_ft = NULL;
533 		}
534 
535 		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
536 		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
537 		g->mlg_rx_hash_ft = NULL;
538 
539 		avl_destroy(&g->mlg_rx_macs);
540 		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
541 	}
542 
543 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
544 		for (i = 0; i < g->mlg_nwqs; ++i) {
545 			wq = &g->mlg_wqs[i];
546 			mutex_enter(&wq->mlwq_mtx);
547 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
548 			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
549 				mlxcx_warn(mlxp, "failed to stop rq %x",
550 				    wq->mlwq_num);
551 			}
552 			mutex_exit(&wq->mlwq_mtx);
553 		}
554 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
555 	}
556 
557 	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
558 		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
559 			mlxcx_tir_t *tir = &g->mlg_tir[i];
560 			if (tir->mltir_state & MLXCX_TIR_CREATED &&
561 			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
562 				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
563 					mlxcx_warn(mlxp,
564 					    "failed to destroy tir %u "
565 					    "for rx ring", tir->mltir_num);
566 				}
567 			}
568 		}
569 		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
570 	}
571 
572 	if (g->mlg_state & MLXCX_GROUP_RQT) {
573 		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
574 		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
575 			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
576 				mlxcx_warn(mlxp, "failed to destroy rqt %u "
577 				    "for rx ring", g->mlg_rqt->mlrqt_num);
578 			}
579 			kmem_free(g->mlg_rqt->mlrqt_rq,
580 			    g->mlg_rqt->mlrqt_rq_size);
581 			g->mlg_rqt->mlrqt_rq = NULL;
582 			kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
583 			g->mlg_rqt = NULL;
584 		}
585 		g->mlg_state &= ~MLXCX_GROUP_RQT;
586 	}
587 
588 	for (i = 0; i < g->mlg_nwqs; ++i) {
589 		wq = &g->mlg_wqs[i];
590 		cq = wq->mlwq_cq;
591 		mlxcx_wq_teardown(mlxp, wq);
592 		if (cq != NULL)
593 			mlxcx_cq_teardown(mlxp, cq);
594 	}
595 	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
596 	g->mlg_wqs = NULL;
597 	g->mlg_state &= ~MLXCX_GROUP_WQS;
598 
599 	mutex_exit(&g->mlg_mtx);
600 	mutex_exit(&g->mlg_port->mlp_mtx);
601 
602 	mutex_destroy(&g->mlg_mtx);
603 
604 	g->mlg_state &= ~MLXCX_GROUP_INIT;
605 	ASSERT3S(g->mlg_state, ==, 0);
606 }
607 
608 void
609 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
610 {
611 	mlxcx_work_queue_t *wq;
612 	mlxcx_completion_queue_t *cq;
613 	uint_t i;
614 
615 	mutex_enter(&g->mlg_mtx);
616 
617 	if (g->mlg_state & MLXCX_GROUP_WQS) {
618 		for (i = 0; i < g->mlg_nwqs; ++i) {
619 			wq = &g->mlg_wqs[i];
620 			mutex_enter(&wq->mlwq_mtx);
621 			cq = wq->mlwq_cq;
622 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
623 			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
624 				mlxcx_warn(mlxp, "failed to stop sq %x",
625 				    wq->mlwq_num);
626 			}
627 			mutex_exit(&wq->mlwq_mtx);
628 			mlxcx_wq_teardown(mlxp, wq);
629 			if (cq != NULL)
630 				mlxcx_cq_teardown(mlxp, cq);
631 		}
632 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
633 		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
634 		g->mlg_wqs = NULL;
635 		g->mlg_state &= ~MLXCX_GROUP_WQS;
636 	}
637 
638 	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
639 	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
640 	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
641 		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
642 			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
643 			    g->mlg_tis.mltis_num);
644 		}
645 	}
646 	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
647 
648 	mutex_exit(&g->mlg_mtx);
649 	mutex_destroy(&g->mlg_mtx);
650 	g->mlg_state &= ~MLXCX_GROUP_INIT;
651 	ASSERT3S(g->mlg_state, ==, 0);
652 }
653 
654 void
655 mlxcx_teardown_groups(mlxcx_t *mlxp)
656 {
657 	mlxcx_ring_group_t *g;
658 	uint_t i;
659 
660 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
661 		g = &mlxp->mlx_rx_groups[i];
662 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
663 			continue;
664 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
665 		mlxcx_teardown_rx_group(mlxp, g);
666 	}
667 	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
668 	mlxp->mlx_rx_groups = NULL;
669 
670 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
671 		g = &mlxp->mlx_tx_groups[i];
672 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
673 			continue;
674 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
675 		mlxcx_teardown_tx_group(mlxp, g);
676 	}
677 	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
678 	mlxp->mlx_tx_groups = NULL;
679 }
680 
681 boolean_t
682 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
683 {
684 	mlxcx_event_queue_t *eq;
685 	mlxcx_completion_queue_t *cq;
686 	mlxcx_work_queue_t *rq;
687 	mlxcx_flow_table_t *ft;
688 	mlxcx_flow_group_t *fg;
689 	mlxcx_flow_entry_t *fe;
690 	uint_t i, j;
691 
692 	ASSERT3S(g->mlg_state, ==, 0);
693 
694 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
695 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
696 	mutex_enter(&g->mlg_mtx);
697 	g->mlg_mlx = mlxp;
698 	g->mlg_type = MLXCX_GROUP_RX;
699 	g->mlg_port = &mlxp->mlx_ports[0];
700 	g->mlg_state |= MLXCX_GROUP_INIT;
701 
702 	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
703 	i = g - &mlxp->mlx_rx_groups[0];
704 	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
705 		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;
706 
707 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
708 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
709 	g->mlg_state |= MLXCX_GROUP_WQS;
710 
711 	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
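	/*
	 * The RQ table size is kept a power of two: round mlg_nwqs up,
	 * e.g. 5 rings round up to mlrqt_max = 8.
	 */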
712 	g->mlg_rqt->mlrqt_max = 2;
713 	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
714 		g->mlg_rqt->mlrqt_max <<= 1;
715 	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
716 	    sizeof (mlxcx_work_queue_t *);
717 	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
718 	g->mlg_state |= MLXCX_GROUP_RQT;
719 
720 	for (i = 0; i < g->mlg_nwqs; ++i) {
721 		eq = NULL;
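		/*
		 * Round-robin over the EQs, skipping any that can't carry
		 * RX traffic; wrapping mlx_next_eq back to 1 (not 0) keeps
		 * the first EQ out of the data path.
		 */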
722 		while (eq == NULL) {
723 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
724 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
725 				mlxp->mlx_next_eq = 1;
726 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
727 			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
728 				/* Try the next one */
729 				eq = NULL;
730 			}
731 		}
732 
733 		if (!mlxcx_cq_setup(mlxp, eq, &cq)) {
734 			g->mlg_nwqs = i;
735 			break;
736 		}
737 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
738 
739 		rq = &g->mlg_wqs[i];
740 		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
741 			g->mlg_nwqs = i;
742 			break;
743 		}
744 		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
745 		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
746 		rq->mlwq_group = g;
747 	}
748 	if (g->mlg_nwqs == 0) {
749 		mutex_exit(&g->mlg_mtx);
750 		return (B_FALSE);
751 	}
752 
753 	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
754 		mutex_exit(&g->mlg_mtx);
755 		return (B_FALSE);
756 	}
757 
758 	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
759 		mlxcx_tir_t *tir = &g->mlg_tir[i];
760 		tir->mltir_tdom = &mlxp->mlx_tdom;
761 		switch (i) {
762 		case MLXCX_TIR_ROLE_OTHER:
763 			tir->mltir_type = MLXCX_TIR_DIRECT;
764 			tir->mltir_rq = &g->mlg_wqs[0];
765 			break;
766 		case MLXCX_TIR_ROLE_IPv4:
767 		case MLXCX_TIR_ROLE_IPv6:
768 		case MLXCX_TIR_ROLE_TCPv4:
769 		case MLXCX_TIR_ROLE_TCPv6:
770 		case MLXCX_TIR_ROLE_UDPv4:
771 		case MLXCX_TIR_ROLE_UDPv6:
772 			tir->mltir_type = MLXCX_TIR_INDIRECT;
773 			tir->mltir_rqtable = g->mlg_rqt;
774 			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
775 			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
776 			    sizeof (tir->mltir_toeplitz_key));
777 			break;
778 		}
779 		switch (i) {
780 		case MLXCX_TIR_ROLE_OTHER:
781 			break;
782 		case MLXCX_TIR_ROLE_IPv4:
783 		case MLXCX_TIR_ROLE_TCPv4:
784 		case MLXCX_TIR_ROLE_UDPv4:
785 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
786 			tir->mltir_hash_fields =
787 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
788 			break;
789 		case MLXCX_TIR_ROLE_IPv6:
790 		case MLXCX_TIR_ROLE_TCPv6:
791 		case MLXCX_TIR_ROLE_UDPv6:
792 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
793 			tir->mltir_hash_fields =
794 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
795 			break;
796 		}
797 		switch (i) {
798 		case MLXCX_TIR_ROLE_OTHER:
799 		case MLXCX_TIR_ROLE_IPv4:
800 		case MLXCX_TIR_ROLE_IPv6:
801 			break;
802 		case MLXCX_TIR_ROLE_TCPv4:
803 		case MLXCX_TIR_ROLE_TCPv6:
804 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
805 			tir->mltir_hash_fields |=
806 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
807 			break;
808 		case MLXCX_TIR_ROLE_UDPv4:
809 		case MLXCX_TIR_ROLE_UDPv6:
810 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
811 			tir->mltir_hash_fields |=
812 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
813 			break;
814 		}
815 
816 		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
817 			mutex_exit(&g->mlg_mtx);
818 			return (B_FALSE);
819 		}
820 
821 		g->mlg_state |= MLXCX_GROUP_TIRTIS;
822 	}
823 
824 	/*
825 	 * Flow table: our RX hashing breakout table for RSS
826 	 */
827 
828 	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
829 	    KM_SLEEP));
830 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
831 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
832 	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
833 	    sizeof (mlxcx_group_mac_t),
834 	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
835 	g->mlg_state |= MLXCX_GROUP_FLOWS;
836 
837 	mutex_enter(&ft->mlft_mtx);
838 
839 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
840 	ft->mlft_level = 2;
841 	ft->mlft_port = g->mlg_port;
842 	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
843 	ft->mlft_nents = (1 << ft->mlft_entshift);
844 	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
845 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
846 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
847 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
848 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
849 
850 	for (j = 0; j < ft->mlft_nents; ++j) {
851 		ft->mlft_ent[j].mlfe_table = ft;
852 		ft->mlft_ent[j].mlfe_index = j;
853 	}
854 
855 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
856 		mutex_exit(&ft->mlft_mtx);
857 		mutex_exit(&g->mlg_mtx);
858 		return (B_FALSE);
859 	}
860 
861 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
862 	list_insert_tail(&ft->mlft_groups, fg);
863 	fg->mlfg_table = ft;
864 	fg->mlfg_size = 1;
865 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
866 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
867 		mutex_exit(&ft->mlft_mtx);
868 		mutex_exit(&g->mlg_mtx);
869 		return (B_FALSE);
870 	}
871 	fe = list_head(&fg->mlfg_entries);
872 	fe->mlfe_ip_version = 6;
873 	fe->mlfe_ip_proto = IPPROTO_UDP;
874 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
875 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
876 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
877 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
878 		mutex_exit(&ft->mlft_mtx);
879 		mutex_exit(&g->mlg_mtx);
880 		return (B_FALSE);
881 	}
882 
883 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
884 	list_insert_tail(&ft->mlft_groups, fg);
885 	fg->mlfg_table = ft;
886 	fg->mlfg_size = 1;
887 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
888 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
889 		mutex_exit(&ft->mlft_mtx);
890 		mutex_exit(&g->mlg_mtx);
891 		return (B_FALSE);
892 	}
893 	fe = list_head(&fg->mlfg_entries);
894 	fe->mlfe_ip_version = 4;
895 	fe->mlfe_ip_proto = IPPROTO_UDP;
896 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
897 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
898 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
899 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
900 		mutex_exit(&ft->mlft_mtx);
901 		mutex_exit(&g->mlg_mtx);
902 		return (B_FALSE);
903 	}
904 
905 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
906 	list_insert_tail(&ft->mlft_groups, fg);
907 	fg->mlfg_table = ft;
908 	fg->mlfg_size = 1;
909 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
910 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
911 		mutex_exit(&ft->mlft_mtx);
912 		mutex_exit(&g->mlg_mtx);
913 		return (B_FALSE);
914 	}
915 	fe = list_head(&fg->mlfg_entries);
916 	fe->mlfe_ip_version = 6;
917 	fe->mlfe_ip_proto = IPPROTO_TCP;
918 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
919 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
920 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
921 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
922 		mutex_exit(&ft->mlft_mtx);
923 		mutex_exit(&g->mlg_mtx);
924 		return (B_FALSE);
925 	}
926 
927 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
928 	list_insert_tail(&ft->mlft_groups, fg);
929 	fg->mlfg_table = ft;
930 	fg->mlfg_size = 1;
931 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
932 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
933 		mutex_exit(&ft->mlft_mtx);
934 		mutex_exit(&g->mlg_mtx);
935 		return (B_FALSE);
936 	}
937 	fe = list_head(&fg->mlfg_entries);
938 	fe->mlfe_ip_version = 4;
939 	fe->mlfe_ip_proto = IPPROTO_TCP;
940 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
941 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
942 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
943 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
944 		mutex_exit(&ft->mlft_mtx);
945 		mutex_exit(&g->mlg_mtx);
946 		return (B_FALSE);
947 	}
948 
949 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
950 	list_insert_tail(&ft->mlft_groups, fg);
951 	fg->mlfg_table = ft;
952 	fg->mlfg_size = 1;
953 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
954 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
955 		mutex_exit(&ft->mlft_mtx);
956 		mutex_exit(&g->mlg_mtx);
957 		return (B_FALSE);
958 	}
959 	fe = list_head(&fg->mlfg_entries);
960 	fe->mlfe_ip_version = 6;
961 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
962 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
963 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
964 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
965 		mutex_exit(&ft->mlft_mtx);
966 		mutex_exit(&g->mlg_mtx);
967 		return (B_FALSE);
968 	}
969 
970 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
971 	list_insert_tail(&ft->mlft_groups, fg);
972 	fg->mlfg_table = ft;
973 	fg->mlfg_size = 1;
974 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
975 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
976 		mutex_exit(&ft->mlft_mtx);
977 		mutex_exit(&g->mlg_mtx);
978 		return (B_FALSE);
979 	}
980 	fe = list_head(&fg->mlfg_entries);
981 	fe->mlfe_ip_version = 4;
982 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
983 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
984 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
985 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
986 		mutex_exit(&ft->mlft_mtx);
987 		mutex_exit(&g->mlg_mtx);
988 		return (B_FALSE);
989 	}
990 
991 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
992 	list_insert_tail(&ft->mlft_groups, fg);
993 	fg->mlfg_table = ft;
994 	fg->mlfg_size = 1;
995 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
996 		mutex_exit(&ft->mlft_mtx);
997 		mutex_exit(&g->mlg_mtx);
998 		return (B_FALSE);
999 	}
1000 	fe = list_head(&fg->mlfg_entries);
1001 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1002 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1003 	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
1004 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1005 		mutex_exit(&ft->mlft_mtx);
1006 		mutex_exit(&g->mlg_mtx);
1007 		return (B_FALSE);
1008 	}
1009 
1010 	mutex_exit(&ft->mlft_mtx);
1011 
1012 	/*
1013 	 * Flow table: the VLAN breakout table for doing VLAN filtering after
1014 	 * we've matched a MAC address.
1015 	 */
1016 
1017 	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1018 	    KM_SLEEP));
1019 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1020 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1021 	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
1022 	    offsetof(mlxcx_group_vlan_t, mlgv_entry));
1023 
1024 	mutex_enter(&ft->mlft_mtx);
1025 
1026 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1027 	ft->mlft_level = 1;
1028 	ft->mlft_port = g->mlg_port;
1029 	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
1030 	ft->mlft_nents = (1 << ft->mlft_entshift);
1031 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1032 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1033 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1034 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
1035 
1036 	for (j = 0; j < ft->mlft_nents; ++j) {
1037 		fe = &ft->mlft_ent[j];
1038 		fe->mlfe_table = ft;
1039 		fe->mlfe_index = j;
1040 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1041 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1042 	}
1043 
1044 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1045 		mutex_exit(&ft->mlft_mtx);
1046 		mutex_exit(&g->mlg_mtx);
1047 		return (B_FALSE);
1048 	}
1049 
1050 	/* First group is all actual matched VLANs */
1051 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1052 	g->mlg_rx_vlan_fg = fg;
1053 	list_insert_tail(&ft->mlft_groups, fg);
1054 	fg->mlfg_table = ft;
1055 	fg->mlfg_size = ft->mlft_nents - 2;
1056 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
1057 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
1058 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1059 		mutex_exit(&ft->mlft_mtx);
1060 		mutex_exit(&g->mlg_mtx);
1061 		return (B_FALSE);
1062 	}
1063 
1064 	/*
1065 	 * Then the "default" entry, which we enable when no VLAN IDs have
1066 	 * been added to the group (we start with this enabled).
1067 	 */
1068 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1069 	g->mlg_rx_vlan_def_fg = fg;
1070 	list_insert_tail(&ft->mlft_groups, fg);
1071 	fg->mlfg_table = ft;
1072 	fg->mlfg_size = 1;
1073 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1074 		mutex_exit(&ft->mlft_mtx);
1075 		mutex_exit(&g->mlg_mtx);
1076 		return (B_FALSE);
1077 	}
1078 	fe = list_head(&fg->mlfg_entries);
1079 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1080 		mutex_exit(&ft->mlft_mtx);
1081 		mutex_exit(&g->mlg_mtx);
1082 		return (B_FALSE);
1083 	}
1084 
1085 	/*
1086 	 * Finally, the promisc entry, which points at the *hash ft* of the
1087 	 * default group. We only enable this when promisc is turned on.
1088 	 */
1089 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1090 	g->mlg_rx_vlan_promisc_fg = fg;
1091 	list_insert_tail(&ft->mlft_groups, fg);
1092 	fg->mlfg_table = ft;
1093 	fg->mlfg_size = 1;
1094 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1095 		mutex_exit(&ft->mlft_mtx);
1096 		mutex_exit(&g->mlg_mtx);
1097 		return (B_FALSE);
1098 	}
1099 	fe = list_head(&fg->mlfg_entries);
1100 	fe->mlfe_ndest = 1;
1101 	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;
1102 
1103 	mutex_exit(&ft->mlft_mtx);
1104 
1105 	mutex_exit(&g->mlg_mtx);
1106 
1107 	return (B_TRUE);
1108 }
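
/*
 * Sketch (hypothetical, not in the driver): the pattern repeated above,
 * factored out. Each RSS breakout rule is a single-entry flow group
 * matching on IP version (and optionally L4 protocol) that forwards to
 * one TIR. Passing an ipver/proto of 0 is assumed here to mean "don't
 * match on that field".
 */
static boolean_t
example_add_rss_rule(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
    uint8_t ipver, uint8_t proto, mlxcx_tir_t *tir)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;

	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (ipver != 0)
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
	if (proto != 0)
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg))
		return (B_FALSE);

	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = ipver;
	fe->mlfe_ip_proto = proto;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = tir;
	return (mlxcx_cmd_set_flow_table_entry(mlxp, fe));
}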
1109 
1110 boolean_t
1111 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1112     mlxcx_work_queue_t *rq)
1113 {
1114 	uint_t j;
1115 	mlxcx_buffer_t *b;
1116 	mlxcx_completion_queue_t *cq;
1117 
1118 	mutex_enter(&g->mlg_mtx);
1119 	/*
1120 	 * Sadly, even though MAC has the mgi_start callback, it is not always
1121 	 * called -- in particular when we are being managed under an aggr, the
1122 	 * mgi_start callback will only ever be called on the default group.
1123 	 *
1124 	 * So instead of asserting about the group state here, we have to
1125 	 * check it and call group start if needed.
1126 	 */
1127 	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
1128 		mutex_exit(&g->mlg_mtx);
1129 		if (!mlxcx_rx_group_start(mlxp, g))
1130 			return (B_FALSE);
1131 		mutex_enter(&g->mlg_mtx);
1132 	}
1133 	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);
1134 
1135 	cq = rq->mlwq_cq;
1136 	ASSERT(cq != NULL);
1137 
1138 	mutex_enter(&cq->mlcq_mtx);
1139 	mutex_enter(&rq->mlwq_mtx);
1140 
1141 	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1142 		mutex_exit(&rq->mlwq_mtx);
1143 		mutex_exit(&cq->mlcq_mtx);
1144 		mutex_exit(&g->mlg_mtx);
1145 		return (B_TRUE);
1146 	}
1147 
1148 	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
1149 		mutex_exit(&rq->mlwq_mtx);
1150 		mutex_exit(&cq->mlcq_mtx);
1151 		mutex_exit(&g->mlg_mtx);
1152 		return (B_FALSE);
1153 	}
1154 	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);
1155 
1156 	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
1157 	rq->mlwq_state |= MLXCX_WQ_BUFFERS;
1158 
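	/*
	 * Pre-create 1.5x the ring size in buffers (the two loops below),
	 * so the free pool still has spares while packets are out on loan
	 * upstack; each buffer starts in the pool via mlxcx_buf_return().
	 */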
1159 	for (j = 0; j < rq->mlwq_nents; ++j) {
1160 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1161 			break;
1162 		mlxcx_buf_return(mlxp, b);
1163 	}
1164 	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
1165 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1166 			break;
1167 		mlxcx_buf_return(mlxp, b);
1168 	}
1169 
1170 	mlxcx_rq_refill(mlxp, rq);
1171 
1172 	mutex_exit(&rq->mlwq_mtx);
1173 	mutex_exit(&cq->mlcq_mtx);
1174 	mutex_exit(&g->mlg_mtx);
1175 
1176 	return (B_TRUE);
1177 }
1178 
1179 boolean_t
1180 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1181 {
1182 	mlxcx_flow_table_t *ft;
1183 	mlxcx_flow_group_t *fg;
1184 	mlxcx_flow_entry_t *fe;
1185 
1186 	mutex_enter(&g->mlg_mtx);
1187 
1188 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
1189 		mutex_exit(&g->mlg_mtx);
1190 		return (B_TRUE);
1191 	}
1192 
1193 	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);
1194 
1195 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1196 
1197 	if (g == &mlxp->mlx_rx_groups[0]) {
1198 		ft = g->mlg_port->mlp_rx_flow;
1199 		mutex_enter(&ft->mlft_mtx);
1200 
1201 		/*
1202 		 * Broadcast and promisc entries go directly to group 0's
1203 		 * RSS hash fanout flow table. They bypass VLAN filtering.
1204 		 */
1205 		fg = g->mlg_port->mlp_bcast;
1206 		fe = list_head(&fg->mlfg_entries);
1207 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1208 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1209 			mutex_exit(&ft->mlft_mtx);
1210 			mutex_exit(&g->mlg_mtx);
1211 			return (B_FALSE);
1212 		}
1213 
1214 		fg = g->mlg_port->mlp_promisc;
1215 		fe = list_head(&fg->mlfg_entries);
1216 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1217 		/*
1218 		 * Don't actually set the promisc entry until promisc is
1219 		 * enabled.
1220 		 */
1221 
1222 		mutex_exit(&ft->mlft_mtx);
1223 	}
1224 
1225 	mutex_exit(&g->mlg_mtx);
1226 
1227 	return (B_TRUE);
1228 }
1229 
1230 boolean_t
1231 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1232 {
1233 	mlxcx_event_queue_t *eq;
1234 	mlxcx_completion_queue_t *cq;
1235 	mlxcx_work_queue_t *sq;
1236 	uint_t i;
1237 
1238 	ASSERT3S(g->mlg_state, ==, 0);
1239 
1240 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
1241 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1242 	g->mlg_state |= MLXCX_GROUP_INIT;
1243 	mutex_enter(&g->mlg_mtx);
1244 
1245 	g->mlg_mlx = mlxp;
1246 	g->mlg_type = MLXCX_GROUP_TX;
1247 	g->mlg_port = &mlxp->mlx_ports[0];
1248 
1249 	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
1250 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
1251 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
1252 	g->mlg_state |= MLXCX_GROUP_WQS;
1253 
1254 	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;
1255 
1256 	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
1257 		mutex_exit(&g->mlg_mtx);
1258 		return (B_FALSE);
1259 	}
1260 
1261 	g->mlg_state |= MLXCX_GROUP_TIRTIS;
1262 
1263 	for (i = 0; i < g->mlg_nwqs; ++i) {
1264 		eq = NULL;
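		/*
		 * Same EQ round-robin as the RX path above, restricted
		 * to EQs usable for TX.
		 */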
1265 		while (eq == NULL) {
1266 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
1267 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
1268 				mlxp->mlx_next_eq = 1;
1269 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
1270 			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
1271 				/* Try the next one */
1272 				eq = NULL;
1273 			}
1274 		}
1275 
1276 		if (!mlxcx_cq_setup(mlxp, eq, &cq)) {
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}
1278 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
1279 
1280 		sq = &g->mlg_wqs[i];
1281 		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
1282 			mutex_exit(&g->mlg_mtx);
1283 			return (B_FALSE);
1284 		}
1285 		sq->mlwq_group = g;
1286 	}
1287 
1288 	mutex_exit(&g->mlg_mtx);
1289 
1290 	return (B_TRUE);
1291 }
1292 
1293 boolean_t
1294 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1295     mlxcx_work_queue_t *sq)
1296 {
1297 	uint_t i;
1298 	mlxcx_buffer_t *b;
1299 	mlxcx_completion_queue_t *cq;
1300 
1301 	mutex_enter(&g->mlg_mtx);
1302 
1303 	cq = sq->mlwq_cq;
1304 	ASSERT(cq != NULL);
1305 
1306 	mutex_enter(&cq->mlcq_mtx);
1307 	mutex_enter(&sq->mlwq_mtx);
1308 	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1309 		mutex_exit(&sq->mlwq_mtx);
1310 		mutex_exit(&cq->mlcq_mtx);
1311 		mutex_exit(&g->mlg_mtx);
1312 		return (B_TRUE);
1313 	}
1314 
1315 	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
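	/*
	 * Pre-create 1.5x the ring size in foreign (mblk-mapped) buffers
	 * plus a full ring of copy buffers; all start out in the free
	 * pool via mlxcx_buf_return().
	 */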
1316 	for (i = 0; i < sq->mlwq_nents; ++i) {
1317 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1318 			break;
1319 		mlxcx_buf_return(mlxp, b);
1320 	}
1321 	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
1322 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1323 			break;
1324 		mlxcx_buf_return(mlxp, b);
1325 	}
1326 	for (i = 0; i < sq->mlwq_nents; ++i) {
1327 		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
1328 			break;
1329 		mlxcx_buf_return(mlxp, b);
1330 	}
1331 	sq->mlwq_state |= MLXCX_WQ_BUFFERS;
1332 
1333 	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
1334 		mutex_exit(&sq->mlwq_mtx);
1335 		mutex_exit(&cq->mlcq_mtx);
1336 		mutex_exit(&g->mlg_mtx);
1337 		return (B_FALSE);
1338 	}
1339 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1340 
1341 	(void) mlxcx_sq_add_nop(mlxp, sq);
1342 
1343 	mutex_exit(&sq->mlwq_mtx);
1344 	mutex_exit(&cq->mlcq_mtx);
1345 	mutex_exit(&g->mlg_mtx);
1346 
1347 	return (B_TRUE);
1348 }
1349 
1350 static boolean_t
1351 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
1352 {
1353 	uint_t idx;
1354 	mlxcx_bf_t *bf;
1355 	ddi_fm_error_t err;
1356 	uint_t try = 0;
1357 
1358 	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
1359 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1360 
1361 	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);
1362 
1363 	ASSERT(mlwq->mlwq_cq != NULL);
1364 	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
1365 	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
1366 	bf = &mlwq->mlwq_uar->mlu_bf[idx];
1367 
1368 retry:
1369 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1370 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1371 	    DDI_FME_VERSION);
1372 	if (err.fme_status != DDI_FM_OK) {
1373 		if (try++ < mlxcx_doorbell_tries) {
1374 			ddi_fm_dma_err_clear(
1375 			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
1376 			    DDI_FME_VERSION);
1377 			goto retry;
1378 		} else {
1379 			goto err;
1380 		}
1381 	}
1382 
1383 	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
1384 	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
1385 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
1386 	    DDI_FME_VERSION);
1387 	if (err.fme_status == DDI_FM_OK)
1388 		return (B_TRUE);
1389 	if (try++ < mlxcx_doorbell_tries) {
1390 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
1391 		goto retry;
1392 	}
1393 
1394 err:
1395 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1396 	return (B_FALSE);
1397 }
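
/*
 * Sketch (hypothetical helper, not in the driver) of the FMA retry idiom
 * mlxcx_sq_ring_dbell() uses above: perform the DMA sync, query the
 * handle for a fault, clear and retry up to mlxcx_doorbell_tries times,
 * and report lost service if the fault persists. The mlxcx_dma_buffer_t
 * type name is assumed from the mxdb_ field prefix used in this file.
 */
static boolean_t
example_fm_dma_sync(mlxcx_t *mlxp, mlxcx_dma_buffer_t *mxdb)
{
	ddi_fm_error_t err;
	uint_t try = 0;

	do {
		MLXCX_DMA_SYNC(*mxdb, DDI_DMA_SYNC_FORDEV);
		ddi_fm_dma_err_get(mxdb->mxdb_dma_handle, &err,
		    DDI_FME_VERSION);
		if (err.fme_status == DDI_FM_OK)
			return (B_TRUE);
		ddi_fm_dma_err_clear(mxdb->mxdb_dma_handle, DDI_FME_VERSION);
	} while (++try < mlxcx_doorbell_tries);

	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
	return (B_FALSE);
}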
1398 
1399 boolean_t
1400 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1401 {
1402 	uint_t index, start_pc;
1403 	mlxcx_sendq_ent_t *ent0;
1404 	ddi_fm_error_t err;
1405 
1406 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1407 
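	/*
	 * mlwq_nents is a power of two, so masking the free-running
	 * producer counter with (mlwq_nents - 1) is a cheap modulo,
	 * e.g. pc = 0x1005 with 4096 entries yields index 5.
	 */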
1408 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1409 	ent0 = &mlwq->mlwq_send_ent[index];
1410 	start_pc = mlwq->mlwq_pc;
1411 	++mlwq->mlwq_pc;
1412 
1413 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1414 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
1415 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1416 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);
1417 
1418 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1419 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
1420 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1421 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1422 
1423 	ent0->mlsqe_control.mlcs_ds = 1;
1424 
1425 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1426 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1427 	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1428 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1429 	    DDI_FME_VERSION);
1430 	if (err.fme_status != DDI_FM_OK) {
1431 		return (B_FALSE);
1432 	}
1433 	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
1434 		return (B_FALSE);
1435 	}
1436 	return (B_TRUE);
1437 }
1438 
1439 boolean_t
1440 mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1441     uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
1442     mlxcx_buffer_t *b0)
1443 {
1444 	uint_t index, first, ents = 0;
1445 	mlxcx_completion_queue_t *cq;
1446 	mlxcx_sendq_ent_t *ent0;
1447 	mlxcx_sendq_extra_ent_t *ent;
1448 	mlxcx_wqe_data_seg_t *seg;
1449 	uint_t ptri, nptr;
1450 	const ddi_dma_cookie_t *c;
1451 	size_t rem;
1452 	mlxcx_buffer_t *b;
1453 	ddi_fm_error_t err;
1454 
1455 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1456 	ASSERT3P(b0->mlb_tx_head, ==, b0);
1457 	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1458 	cq = mlwq->mlwq_cq;
1459 
1460 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1461 	ent0 = &mlwq->mlwq_send_ent[index];
1462 	b0->mlb_wqe_index = mlwq->mlwq_pc;
1463 	++mlwq->mlwq_pc;
1464 	++ents;
1465 
1466 	first = index;
1467 
1468 	mutex_enter(&cq->mlcq_bufbmtx);
1469 	list_insert_tail(&cq->mlcq_buffers_b, b0);
1470 	atomic_inc_64(&cq->mlcq_bufcnt);
1471 	mutex_exit(&cq->mlcq_bufbmtx);
1472 
1473 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1474 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
1475 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1476 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);
1477 
1478 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1479 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
1480 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1481 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1482 
1483 	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
1484 	set_bits16(&ent0->mlsqe_eth.mles_szflags,
1485 	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
1486 	if (inlinelen > 0) {
1487 		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
1488 		    inlinelen);
1489 	}
1490 
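	/*
	 * mlcs_ds counts the size of this WQE in 16-byte units. Start
	 * with the control and eth segments; it is incremented below as
	 * each data pointer is added.
	 */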
1491 	ent0->mlsqe_control.mlcs_ds =
1492 	    offsetof(mlxcx_sendq_ent_t, mlsqe_data) / 16;
1493 
1494 	if (chkflags & HCK_IPV4_HDRCKSUM) {
1495 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1496 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1497 		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
1498 	}
1499 	if (chkflags & HCK_FULLCKSUM) {
1500 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1501 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1502 		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
1503 	}
1504 
1505 	b = b0;
1506 	ptri = 0;
1507 	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
1508 	seg = ent0->mlsqe_data;
1509 	while (b != NULL) {
1510 		rem = b->mlb_used;
1511 
1512 		c = NULL;
1513 		while (rem > 0 &&
1514 		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
1515 			if (ptri >= nptr) {
1516 				index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1517 				ent = &mlwq->mlwq_send_extra_ent[index];
1518 				++mlwq->mlwq_pc;
1519 				++ents;
1520 
1521 				seg = ent->mlsqe_data;
1522 				ptri = 0;
1523 				nptr = sizeof (ent->mlsqe_data) /
1524 				    sizeof (mlxcx_wqe_data_seg_t);
1525 			}
1526 
1527 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1528 			if (c->dmac_size > rem) {
1529 				seg->mlds_byte_count = to_be32(rem);
1530 				rem = 0;
1531 			} else {
1532 				seg->mlds_byte_count = to_be32(c->dmac_size);
1533 				rem -= c->dmac_size;
1534 			}
1535 			seg->mlds_address = to_be64(c->dmac_laddress);
1536 			++seg;
1537 			++ptri;
1538 			++ent0->mlsqe_control.mlcs_ds;
1539 
1540 			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
1541 			    MLXCX_SQE_MAX_DS);
1542 		}
1543 
1544 		if (b == b0) {
1545 			b = list_head(&b0->mlb_tx_chain);
1546 		} else {
1547 			b = list_next(&b0->mlb_tx_chain, b);
1548 		}
1549 	}
1550 
1551 	for (; ptri < nptr; ++ptri, ++seg) {
1552 		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1553 		seg->mlds_byte_count = to_be32(0);
1554 		seg->mlds_address = to_be64(0);
1555 	}
1556 
1557 	/*
1558 	 * Make sure the workqueue entry is flushed out before updating
1559 	 * the doorbell.
1560 	 */
1561 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1562 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1563 	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1564 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1565 	    DDI_FME_VERSION);
1566 	if (err.fme_status != DDI_FM_OK) {
1567 		return (B_FALSE);
1568 	}
1569 	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, first)) {
1570 		return (B_FALSE);
1571 	}
1572 	return (B_TRUE);
1573 }
1574 
1575 boolean_t
1576 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1577     mlxcx_buffer_t *buf)
1578 {
1579 	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
1580 }
1581 
1582 boolean_t
1583 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1584     mlxcx_buffer_t **bufs, size_t nbufs)
1585 {
1586 	uint_t index;
1587 	mlxcx_recvq_ent_t *ent;
1588 	mlxcx_completion_queue_t *cq;
1589 	mlxcx_wqe_data_seg_t *seg;
1590 	uint_t bi, ptri;
1591 	const ddi_dma_cookie_t *c;
1592 	mlxcx_buffer_t *buf;
1593 	ddi_fm_error_t err;
1594 
1595 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1596 	cq = mlwq->mlwq_cq;
1597 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1598 
1599 	for (bi = 0; bi < nbufs; ++bi) {
1600 		buf = bufs[bi];
1601 		bufs[bi] = NULL;
1602 		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1603 
1604 		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1605 		ent = &mlwq->mlwq_recv_ent[index];
1606 		buf->mlb_wqe_index = mlwq->mlwq_pc;
1607 
1608 		++mlwq->mlwq_pc;
1609 
1610 		mutex_enter(&cq->mlcq_bufbmtx);
1611 		list_insert_tail(&cq->mlcq_buffers, buf);
1612 		atomic_inc_64(&cq->mlcq_bufcnt);
1613 		mutex_exit(&cq->mlcq_bufbmtx);
1614 
1615 		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
1616 		ptri = 0;
1617 		c = NULL;
1618 		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
1619 			seg = &ent->mlrqe_data[ptri++];
1620 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1621 			seg->mlds_byte_count = to_be32(c->dmac_size);
1622 			seg->mlds_address = to_be64(c->dmac_laddress);
1623 		}
1624 		/*
1625 		 * Fill any unused scatter pointers with the special null
1626 		 * value.
1627 		 */
1628 		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
1629 			seg = &ent->mlrqe_data[ptri];
1630 			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1631 			seg->mlds_byte_count = to_be32(0);
1632 			seg->mlds_address = to_be64(0);
1633 		}
1634 
1635 		/*
1636 		 * Make sure the workqueue entry is flushed out before updating
1637 		 * the doorbell.
1638 		 */
1639 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1640 		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
1641 		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
1642 		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1643 		    DDI_FME_VERSION);
1644 		if (err.fme_status != DDI_FM_OK) {
1645 			return (B_FALSE);
1646 		}
1647 	}
1648 
1649 	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
1650 	/*
1651 	 * Flush the CQ doorbell as well so that HW knows how many
1652 	 * completions we've consumed.
1653 	 */
1654 	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1655 	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
1656 	    DDI_FME_VERSION);
1657 	if (err.fme_status != DDI_FM_OK) {
1658 		return (B_FALSE);
1659 	}
1660 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1661 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1662 	    DDI_FME_VERSION);
1663 	if (err.fme_status != DDI_FM_OK) {
1664 		return (B_FALSE);
1665 	}
1666 	return (B_TRUE);
1667 }
1668 
1669 void
1670 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1671 {
1672 	size_t target, current, want, done, n;
1673 	mlxcx_completion_queue_t *cq;
1674 	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
1675 	uint_t i;
1676 
1677 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1678 	cq = mlwq->mlwq_cq;
1679 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1680 
1681 	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);
1682 
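	/*
	 * Aim to keep the RQ one refill step short of full, and only
	 * bother refilling once we have fallen at least a further step
	 * below that target, so buffers are posted in batches of up to
	 * MLXCX_RQ_REFILL_STEP.
	 */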
1683 	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
1685 
1686 	if (cq->mlcq_state & MLXCX_CQ_TEARDOWN)
1687 		return;
1688 
1689 	current = cq->mlcq_bufcnt;
1690 
1691 	if (current >= target - MLXCX_RQ_REFILL_STEP)
1692 		return;
1693 
1694 	want = target - current;
1695 	done = 0;
1696 
1697 	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
1698 		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
1699 		if (n == 0) {
1700 			mlxcx_warn(mlxp, "!exiting rq refill early, done %lu "
1701 			    "but wanted %lu", done, want);
1702 			return;
1703 		}
1704 		if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) {
1705 			for (i = 0; i < n; ++i)
1706 				mlxcx_buf_return(mlxp, b[i]);
1707 			return;
1708 		}
1709 		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
1710 			/*
1711 			 * mlxcx_rq_add_buffers NULLs out the buffers as it
1712 			 * enqueues them, so any that are non-NULL we have to
1713 			 * free now. The others now belong to the WQ, even if
1714 			 * we failed.
1715 			 */
1716 			for (i = 0; i < n; ++i) {
1717 				if (b[i] != NULL) {
1718 					mlxcx_buf_return(mlxp, b[i]);
1719 				}
1720 			}
1721 			return;
1722 		}
1723 		done += n;
1724 	}
1725 }
1726 
1727 static const char *
1728 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
1729 {
1730 	switch (sy) {
1731 	case MLXCX_CQ_ERR_LOCAL_LENGTH:
1732 		return ("LOCAL_LENGTH");
1733 	case MLXCX_CQ_ERR_LOCAL_QP_OP:
1734 		return ("LOCAL_QP_OP");
1735 	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
1736 		return ("LOCAL_PROTECTION");
1737 	case MLXCX_CQ_ERR_WR_FLUSHED:
1738 		return ("WR_FLUSHED");
1739 	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
1740 		return ("MEM_WINDOW_BIND");
1741 	case MLXCX_CQ_ERR_BAD_RESPONSE:
1742 		return ("BAD_RESPONSE");
1743 	case MLXCX_CQ_ERR_LOCAL_ACCESS:
1744 		return ("LOCAL_ACCESS");
1745 	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
1746 		return ("XPORT_RETRY_CTR");
1747 	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
1748 		return ("RNR_RETRY_CTR");
1749 	case MLXCX_CQ_ERR_ABORTED:
1750 		return ("ABORTED");
1751 	default:
1752 		return ("UNKNOWN");
1753 	}
1754 }
1755 
1756 static void
1757 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1758     mlxcx_completionq_error_ent_t *ent)
1759 {
1760 	uint64_t ena;
1761 	char buf[FM_MAX_CLASS];
1762 	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);
1763 
1764 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1765 		return;
1766 
1767 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1768 	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
1769 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1770 
1771 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1772 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1773 	    "syndrome", DATA_TYPE_STRING, name,
1774 	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
1775 	    "vendor_syndrome", DATA_TYPE_UINT8,
1776 	    ent->mlcqee_vendor_error_syndrome,
1777 	    "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
1778 	    "wq_type", DATA_TYPE_STRING,
1779 	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
1780 	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
1781 	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
1782 	    NULL);
1783 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1784 }
1785 
1786 void
1787 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1788     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1789 {
1790 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1791 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
1792 		mlxcx_completionq_error_ent_t *eent =
1793 		    (mlxcx_completionq_error_ent_t *)ent;
1794 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
1795 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1796 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
1797 		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
1798 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
1799 		return;
1800 	}
1801 
1802 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
1803 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
1804 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1805 		return;
1806 	}
1807 
1808 	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
1809 		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
1810 		    ent->mlcqe_send_wqe_opcode);
1811 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1812 		return;
1813 	}
1814 
1815 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
1816 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
1817 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1818 		return;
1819 	}
1820 
1821 	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1822 }
1823 
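/*
 * Process a single RX CQE. On success, the received buffer is loaned
 * out to MAC as an mblk (returned to us later via the desballoc free
 * function) with its hardware checksum results attached; on any error
 * the buffer goes straight back to its shard and we return NULL.
 */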
1824 mblk_t *
1825 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1826     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1827 {
1828 	uint32_t chkflags = 0;
1829 	ddi_fm_error_t err;
1830 
1831 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1832 
1833 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
1834 		mlxcx_completionq_error_ent_t *eent =
1835 		    (mlxcx_completionq_error_ent_t *)ent;
1836 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
1837 		mlxcx_buf_return(mlxp, buf);
1838 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
1839 		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
1840 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
1841 		return (NULL);
1842 	}
1843 
1844 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
1845 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
1846 		mlxcx_buf_return(mlxp, buf);
1847 		return (NULL);
1848 	}
1849 
1850 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
1851 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
1852 		mlxcx_buf_return(mlxp, buf);
1853 		return (NULL);
1854 	}
1855 
1856 	if (ent->mlcqe_rx_drop_counter > 0) {
1857 		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
1858 		    from_be32(ent->mlcqe_rx_drop_counter));
1859 	}
1860 
1861 	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
1862 	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
1863 	    DDI_FME_VERSION);
1864 	if (err.fme_status != DDI_FM_OK) {
1865 		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
1866 		    DDI_FME_VERSION);
1867 		mlxcx_buf_return(mlxp, buf);
1868 		return (NULL);
1869 	}
1870 
1871 	if (!mlxcx_buf_loan(mlxp, buf)) {
1872 		mlxcx_warn(mlxp, "!loan failed, dropping packet");
1873 		mlxcx_buf_return(mlxp, buf);
1874 		return (NULL);
1875 	}
1876 
1877 	buf->mlb_mp->b_next = NULL;
1878 	buf->mlb_mp->b_cont = NULL;
1879 	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
1880 	    from_be32(ent->mlcqe_byte_cnt);
1881 
1882 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
1883 		chkflags |= HCK_FULLCKSUM_OK;
1884 	}
1885 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
1886 		chkflags |= HCK_IPV4_HDRCKSUM_OK;
1887 	}
1888 	if (chkflags != 0) {
1889 		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
1890 		    from_be16(ent->mlcqe_checksum), chkflags);
1891 	}
1892 
1893 	/*
1894 	 * Don't check whether a refill is needed on every completion, since
1895 	 * that means taking the RQ lock; only check every 8th buffer.
1896 	 */
1897 	if ((buf->mlb_wqe_index & 0x7) == 0) {
1898 		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
1899 		ASSERT(wq != NULL);
1900 		mutex_enter(&wq->mlwq_mtx);
1901 		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
1902 			mlxcx_rq_refill(mlxp, wq);
1903 		mutex_exit(&wq->mlwq_mtx);
1904 	}
1905 
1906 	return (buf->mlb_mp);
1907 }
1908 
1909 static void
1910 mlxcx_buf_mp_return(caddr_t arg)
1911 {
1912 	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
1913 	mlxcx_t *mlxp = b->mlb_mlx;
1914 
1915 	if (b->mlb_state != MLXCX_BUFFER_ON_LOAN) {
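		/*
		 * Not on loan, so this mblk is being freed by the driver
		 * itself (e.g. from mlxcx_buf_destroy()); just drop our
		 * pointer to it rather than returning the buffer again.
		 */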
1916 		b->mlb_mp = NULL;
1917 		return;
1918 	}
1919 	/*
1920 	 * The mblk for this buffer_t (mlb_mp) has now been consumed, so
1921 	 * NULL it out; mlxcx_buf_loan() will desballoc() a fresh one.
1922 	 */
1923 	b->mlb_mp = NULL;
1924 	mlxcx_buf_return(mlxp, b);
1925 }
1926 
1927 boolean_t
1928 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
1929 {
1930 	mlxcx_buffer_t *b;
1931 	ddi_device_acc_attr_t acc;
1932 	ddi_dma_attr_t attr;
1933 	boolean_t ret;
1934 
1935 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
1936 	b->mlb_shard = shard;
1937 	b->mlb_foreign = B_FALSE;
1938 
1939 	mlxcx_dma_acc_attr(mlxp, &acc);
1940 	mlxcx_dma_buf_attr(mlxp, &attr);
1941 
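	/*
	 * The 2-byte offset here is presumably so that the IP header
	 * lands 4-byte aligned after the 14-byte Ethernet header.
	 */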
1942 	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
1943 	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
1944 	if (!ret) {
1945 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
1946 		return (B_FALSE);
1947 	}
1948 
1949 	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
1950 	b->mlb_frtn.free_arg = (caddr_t)b;
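	/*
	 * Note that a desballoc() failure here is tolerated: if mlb_mp
	 * is NULL, mlxcx_buf_loan() will retry the allocation before
	 * the buffer can be loaned out.
	 */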
1951 	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
1952 	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
1953 
1954 	*bp = b;
1955 
1956 	return (B_TRUE);
1957 }
1958 
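/*
 * Foreign buffers have no data memory of their own: they are DMA-bound
 * to mblks owned by the stack at transmit time (see
 * mlxcx_buf_bind_or_copy() below).
 */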
1959 boolean_t
1960 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
1961     mlxcx_buffer_t **bp)
1962 {
1963 	mlxcx_buffer_t *b;
1964 	ddi_dma_attr_t attr;
1965 	boolean_t ret;
1966 
1967 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
1968 	b->mlb_shard = shard;
1969 	b->mlb_foreign = B_TRUE;
1970 
1971 	mlxcx_dma_buf_attr(mlxp, &attr);
1972 
1973 	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
1974 	if (!ret) {
1975 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
1976 		return (B_FALSE);
1977 	}
1978 
1979 	*bp = b;
1980 
1981 	return (B_TRUE);
1982 }
1983 
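/*
 * Take a free buffer from the WQ's foreign shard, waiting (without a
 * timeout) for one to be returned if none are free.
 */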
1984 static void
1985 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
1986     mlxcx_buffer_t **bp)
1987 {
1988 	mlxcx_buffer_t *b;
1989 	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
1990 
1991 	mutex_enter(&s->mlbs_mtx);
1992 	while (list_is_empty(&s->mlbs_free))
1993 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
1994 	b = list_remove_head(&s->mlbs_free);
1995 	ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
1996 	ASSERT(b->mlb_foreign);
1997 	b->mlb_state = MLXCX_BUFFER_ON_WQ;
1998 	list_insert_tail(&s->mlbs_busy, b);
1999 	mutex_exit(&s->mlbs_mtx);
2000 
2001 	*bp = b;
2002 }
2003 
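/*
 * Prepare a TX packet for the hardware. Each mblk in the chain that is
 * at least mldp_tx_bind_threshold bytes long is DMA-bound in place
 * using a foreign buffer; anything shorter (or that fails to bind) is
 * copied into a driver-owned buffer instead. The buffers are linked
 * into a chain headed by *bp.
 */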
2004 boolean_t
2005 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2006     mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
2007 {
2008 	mlxcx_buffer_t *b, *b0 = NULL;
2009 	boolean_t first = B_TRUE;
2010 	ddi_fm_error_t err;
2011 	mblk_t *mp;
2012 	uint8_t *rptr;
2013 	size_t sz;
2014 	size_t ncookies = 0;
2015 	boolean_t ret;
2016 	uint_t attempts = 0;
2017 
2018 	for (mp = mpb; mp != NULL; mp = mp->b_cont) {
2019 		rptr = mp->b_rptr;
2020 		sz = MBLKL(mp);
2021 
2022 		if (off > 0)
2023 			ASSERT3U(off, <, sz);
2024 		rptr += off;
2025 		sz -= off;
2026 
2027 		if (sz < mlxp->mlx_props.mldp_tx_bind_threshold)
2028 			goto copyb;
2029 
2030 		mlxcx_buf_take_foreign(mlxp, wq, &b);
2031 		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_FALSE);
2032 
2033 		if (!ret) {
2034 			mlxcx_buf_return(mlxp, b);
2035 
2036 copyb:
2037 			mlxcx_buf_take(mlxp, wq, &b);
2038 			ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
2039 			bcopy(rptr, b->mlb_dma.mxdb_va, sz);
2040 			MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
2041 			ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
2042 			    DDI_FME_VERSION);
2043 			if (err.fme_status != DDI_FM_OK) {
2044 				ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
2045 				    DDI_FME_VERSION);
2046 				mlxcx_buf_return(mlxp, b);
2047 				if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
2048 					*bp = NULL;
2049 					return (B_FALSE);
2050 				}
2051 				goto copyb;
2052 			}
2053 		}
2054 
2055 		/*
2056 		 * We might overestimate the cookie count here when we've
2057 		 * copied data, since the buffer can be longer than what we
2058 		 * copied into it. This is safe, since the error is always in
2059 		 * the conservative direction (and we will blow up later when
2060 		 * we actually generate the WQE anyway).
2061 		 *
2062 		 * If the assert below ever fires, we'll have to come and fix
2063 		 * this up so we can transmit these packets.
2064 		 */
2065 		ncookies += b->mlb_dma.mxdb_ncookies;
2066 
2067 		if (first)
2068 			b0 = b;
2069 
2070 		if (!first)
2071 			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;
2072 
2073 		b->mlb_tx_mp = mp;
2074 		b->mlb_tx_head = b0;
2075 		b->mlb_used = sz;
2076 
2077 		if (!first)
2078 			list_insert_tail(&b0->mlb_tx_chain, b);
2079 		first = B_FALSE;
2080 		off = 0;
2081 	}
2082 
2083 	ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS);
2084 
2085 	*bp = b0;
2086 	return (B_TRUE);
2087 }
2088 
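/*
 * Take a single free buffer from the WQ's normal shard, waiting
 * (without a timeout) for one to be returned if none are free.
 */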
2089 void
2090 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp)
2091 {
2092 	mlxcx_buffer_t *b;
2093 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
2094 
2095 	mutex_enter(&s->mlbs_mtx);
2096 	while (list_is_empty(&s->mlbs_free))
2097 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
2098 	b = list_remove_head(&s->mlbs_free);
2099 	ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2100 	b->mlb_state = MLXCX_BUFFER_ON_WQ;
2101 	list_insert_tail(&s->mlbs_busy, b);
2102 	mutex_exit(&s->mlbs_mtx);
2103 
2104 	*bp = b;
2105 }
2106 
2107 #define	MLXCX_BUF_TAKE_N_TIMEOUT_USEC		5000
2108 #define	MLXCX_BUF_TAKE_N_MAX_RETRIES		3
2109 
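/*
 * Like mlxcx_buf_take(), but takes up to nbufs buffers in one pass,
 * and waits only a bounded time for the free list to be replenished.
 * This can return fewer buffers than asked for, so callers have to
 * cope with a short count.
 */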
2110 size_t
2111 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2112     mlxcx_buffer_t **bp, size_t nbufs)
2113 {
2114 	mlxcx_buffer_t *b;
2115 	size_t done = 0, empty = 0;
2116 	clock_t wtime = drv_usectohz(MLXCX_BUF_TAKE_N_TIMEOUT_USEC);
2117 	mlxcx_buf_shard_t *s;
2118 
2119 	s = wq->mlwq_bufs;
2120 
2121 	mutex_enter(&s->mlbs_mtx);
2122 	while (done < nbufs) {
2123 		while (list_is_empty(&s->mlbs_free)) {
2124 			(void) cv_reltimedwait(&s->mlbs_free_nonempty,
2125 			    &s->mlbs_mtx, wtime, TR_MILLISEC);
2126 			if (list_is_empty(&s->mlbs_free) &&
2127 			    empty++ >= MLXCX_BUF_TAKE_N_MAX_RETRIES) {
2128 				mutex_exit(&s->mlbs_mtx);
2129 				return (done);
2130 			}
2131 		}
2132 		b = list_remove_head(&s->mlbs_free);
2133 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2134 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2135 		list_insert_tail(&s->mlbs_busy, b);
2136 		bp[done++] = b;
2137 	}
2138 	mutex_exit(&s->mlbs_mtx);
2139 	return (done);
2140 }
2141 
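/*
 * Move a buffer from ON_WQ to ON_LOAN before handing its mblk up to
 * MAC. If the previous mblk was already consumed (mlb_mp is NULL), a
 * fresh one is desballoc()ed here first.
 */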
2142 boolean_t
2143 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2144 {
2145 	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
2146 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2147 
2148 	if (b->mlb_mp == NULL) {
2149 		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2150 		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2151 		if (b->mlb_mp == NULL)
2152 			return (B_FALSE);
2153 	}
2154 
2155 	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
2156 	b->mlb_wqe_index = 0;
2157 	return (B_TRUE);
2158 }
2159 
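/*
 * Return a whole TX buffer chain. If keepmp is set, the head's
 * mlb_tx_mp and mlb_tx_head are cleared first so that
 * mlxcx_buf_return() won't freemsg() the mblk, leaving it for the
 * caller to deal with.
 */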
2160 void
2161 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
2162 {
2163 	mlxcx_buffer_t *b;
2164 
2165 	if (b0->mlb_tx_head != b0) {
2166 		mlxcx_buf_return(mlxp, b0);
2167 		return;
2168 	}
2169 
2170 	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
2171 		mlxcx_buf_return(mlxp, b);
2172 	}
2173 	if (keepmp) {
2174 		b0->mlb_tx_mp = NULL;
2175 		b0->mlb_tx_head = NULL;
2176 	}
2177 	mlxcx_buf_return(mlxp, b0);
2178 }
2179 
2180 void
2181 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2182 {
2183 	mlxcx_buffer_state_t oldstate = b->mlb_state;
2184 	mlxcx_buffer_t *txhead = b->mlb_tx_head;
2185 	mlxcx_buf_shard_t *s = b->mlb_shard;
2186 	mblk_t *mp = b->mlb_tx_mp;
2187 
2188 	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
2189 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2190 	b->mlb_state = MLXCX_BUFFER_FREE;
2191 	b->mlb_wqe_index = 0;
2192 	b->mlb_tx_head = NULL;
2193 	b->mlb_tx_mp = NULL;
2194 	b->mlb_used = 0;
2195 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2196 
2197 	mutex_enter(&s->mlbs_mtx);
2198 	switch (oldstate) {
2199 	case MLXCX_BUFFER_INIT:
2200 		break;
2201 	case MLXCX_BUFFER_ON_WQ:
2202 		list_remove(&s->mlbs_busy, b);
2203 		break;
2204 	case MLXCX_BUFFER_ON_LOAN:
2205 		ASSERT(!b->mlb_foreign);
2206 		list_remove(&s->mlbs_busy, b);
2207 		break;
2208 	case MLXCX_BUFFER_FREE:
2209 		VERIFY(0);
2210 		break;
2211 	case MLXCX_BUFFER_ON_CHAIN:
2212 		ASSERT(txhead != NULL);
2213 		list_remove(&txhead->mlb_tx_chain, b);
2214 		list_remove(&s->mlbs_busy, b);
2215 		break;
2216 	}
2217 
2218 	if (b->mlb_foreign) {
2219 		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
2220 			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
2221 		}
2222 	}
2223 
2224 	list_insert_tail(&s->mlbs_free, b);
2225 	cv_signal(&s->mlbs_free_nonempty);
2226 
2227 	mutex_exit(&s->mlbs_mtx);
2228 
2229 	/*
2230 	 * For TX chain heads, free the mblk_t after we let go of the lock.
2231 	 * This might be a borrowed buf that we in turn loaned to MAC, in which
2232 	 * case calling freemsg() on it will re-enter this very function -- so
2233 	 * we better not be holding the lock!
2234 	 */
2235 	if (txhead == b)
2236 		freemsg(mp);
2237 }
2238 
2239 void
2240 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2241 {
2242 	mlxcx_buf_shard_t *s = b->mlb_shard;
2243 	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
2244 	    b->mlb_state == MLXCX_BUFFER_INIT);
2245 	ASSERT(mutex_owned(&s->mlbs_mtx));
2246 	if (b->mlb_state == MLXCX_BUFFER_FREE)
2247 		list_remove(&s->mlbs_free, b);
2248 
2249 	/*
2250 	 * This is going back to the kmem cache, so it needs to be set up in
2251 	 * the same way we expect a new buffer to come out (state INIT, other
2252 	 * fields NULL'd)
2253 	 */
2254 	b->mlb_state = MLXCX_BUFFER_INIT;
2255 	b->mlb_shard = NULL;
2256 	if (b->mlb_mp != NULL) {
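		/*
		 * freeb() here calls our desballoc free function,
		 * mlxcx_buf_mp_return(), which clears mlb_mp since the
		 * buffer isn't on loan -- hence the assert below.
		 */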
2257 		freeb(b->mlb_mp);
2258 		ASSERT(b->mlb_mp == NULL);
2259 	}
2260 	mlxcx_dma_free(&b->mlb_dma);
2261 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2262 
2263 	kmem_cache_free(mlxp->mlx_bufs_cache, b);
2264 }
2265