xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_ring.c (revision fd7fa860de2ce9f847175f3d39dfd19f8d5735f9)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/atomic.h>
27 #include <sys/cpuvar.h>
28 
29 #include <sys/pattr.h>
30 #include <sys/dlpi.h>
31 
32 #include <sys/mac_provider.h>
33 
34 #include <sys/random.h>
35 
36 #include <mlxcx.h>
37 
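/*
 * Allocate the DMA memory backing a work queue: the ring of send or
 * receive queue entries (sized from the driver properties) and the
 * queue's doorbell record.
 */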
38 boolean_t
39 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
40 {
41 	ddi_device_acc_attr_t acc;
42 	ddi_dma_attr_t attr;
43 	boolean_t ret;
44 	size_t sz;
45 
46 	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
47 
48 	/* Receive and send queue entries might be different sizes. */
49 	switch (mlwq->mlwq_type) {
50 	case MLXCX_WQ_TYPE_SENDQ:
51 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
52 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
53 		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
54 		break;
55 	case MLXCX_WQ_TYPE_RECVQ:
56 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
57 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
58 		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
59 		break;
60 	default:
61 		VERIFY(0);
62 		return (B_FALSE);
63 	}
64 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
65 
66 	mlxcx_dma_acc_attr(mlxp, &acc);
67 	mlxcx_dma_queue_attr(mlxp, &attr);
68 
69 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
70 	    B_TRUE, sz, B_TRUE);
71 	if (!ret) {
72 		mlxcx_warn(mlxp, "failed to allocate WQ memory");
73 		return (B_FALSE);
74 	}
75 
76 	/*
77 	 * Just set the first pointer in the union. Yes, this is a strict
78 	 * aliasing violation. No, I don't care.
79 	 */
80 	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
81 
82 	mlxcx_dma_acc_attr(mlxp, &acc);
83 	mlxcx_dma_qdbell_attr(mlxp, &attr);
84 	sz = sizeof (mlxcx_workq_doorbell_t);
85 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
86 	    B_TRUE, sz, B_TRUE);
87 	if (!ret) {
88 		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
89 		mlxcx_dma_free(&mlwq->mlwq_dma);
90 		mlwq->mlwq_send_ent = NULL;
91 		return (B_FALSE);
92 	}
93 
94 	mlwq->mlwq_doorbell =
95 	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;
96 
97 	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;
98 
99 	return (B_TRUE);
100 }
101 
102 void
103 mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
104 {
105 	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
106 	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
107 		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);
108 
109 	mlxcx_dma_free(&mlwq->mlwq_dma);
110 	mlwq->mlwq_send_ent = NULL;
111 	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
112 	mlwq->mlwq_doorbell = NULL;
113 
114 	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
115 }
116 
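/*
 * Allocate the DMA memory backing a completion queue: the ring of
 * completion entries (each initialised with an invalid opcode and the
 * initial owner value) and the CQ's doorbell record.
 */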
117 static boolean_t
118 mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
119     uint_t ent_shift)
120 {
121 	ddi_device_acc_attr_t acc;
122 	ddi_dma_attr_t attr;
123 	boolean_t ret;
124 	size_t sz, i;
125 
126 	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
127 
128 	mlcq->mlcq_entshift = ent_shift;
129 	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
130 	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
131 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
132 
133 	mlxcx_dma_acc_attr(mlxp, &acc);
134 	mlxcx_dma_queue_attr(mlxp, &attr);
135 
136 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
137 	    B_TRUE, sz, B_TRUE);
138 	if (!ret) {
139 		mlxcx_warn(mlxp, "failed to allocate CQ memory");
140 		return (B_FALSE);
141 	}
142 
143 	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;
144 
145 	for (i = 0; i < mlcq->mlcq_nents; ++i) {
146 		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
147 		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
148 	}
149 
150 	mlxcx_dma_acc_attr(mlxp, &acc);
151 	mlxcx_dma_qdbell_attr(mlxp, &attr);
152 	sz = sizeof (mlxcx_completionq_doorbell_t);
153 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
154 	    B_TRUE, sz, B_TRUE);
155 	if (!ret) {
156 		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
157 		mlxcx_dma_free(&mlcq->mlcq_dma);
158 		mlcq->mlcq_ent = NULL;
159 		return (B_FALSE);
160 	}
161 
162 	mlcq->mlcq_doorbell =
163 	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;
164 
165 	mlcq->mlcq_state |= MLXCX_CQ_ALLOC;
166 
167 	return (B_TRUE);
168 }
169 
170 static void
171 mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
172 {
173 	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
174 	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
175 		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
176 
177 	mlxcx_dma_free(&mlcq->mlcq_dma);
178 	mlcq->mlcq_ent = NULL;
179 	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
180 	mlcq->mlcq_doorbell = NULL;
181 
182 	mlcq->mlcq_state &= ~MLXCX_CQ_ALLOC;
183 }
184 
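/*
 * Tear down a work queue: ask any long-running operations to bail out,
 * stop and destroy the hardware queue if it was created, release the
 * queue's DMA memory and detach it from its completion queue.
 */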
185 void
186 mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
187 {
188 	mlxcx_completion_queue_t *mlcq;
189 
190 	/*
191 	 * If something is holding the lock on a long operation like a
192 	 * refill, setting this flag asks them to exit early if possible.
193 	 */
194 	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);
195 
196 	mutex_enter(&mlwq->mlwq_mtx);
197 
198 	list_remove(&mlxp->mlx_wqs, mlwq);
199 
200 	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
201 	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
202 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
203 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
204 		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
205 			mlxcx_warn(mlxp, "failed to stop "
206 			    "recv queue num %x", mlwq->mlwq_num);
207 		}
208 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
209 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
210 		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
211 			mlxcx_warn(mlxp, "failed to stop "
212 			    "send queue num %x", mlwq->mlwq_num);
213 		}
214 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
215 		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
216 			mlxcx_warn(mlxp, "failed to destroy "
217 			    "recv queue num %x", mlwq->mlwq_num);
218 		}
219 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
220 		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
221 			mlxcx_warn(mlxp, "failed to destroy "
222 			    "send queue num %x", mlwq->mlwq_num);
223 		}
224 	}
225 	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
226 		mlxcx_wq_rele_dma(mlxp, mlwq);
227 	}
228 	mlcq = mlwq->mlwq_cq;
229 
230 	/* These will be released by mlxcx_teardown_bufs() */
231 	mlwq->mlwq_bufs = NULL;
232 	mlwq->mlwq_foreign_bufs = NULL;
233 
234 	mutex_exit(&mlwq->mlwq_mtx);
235 
236 	mutex_enter(&mlcq->mlcq_mtx);
237 	mutex_enter(&mlwq->mlwq_mtx);
238 	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
239 	mlcq->mlcq_wq = NULL;
240 	mutex_exit(&mlwq->mlwq_mtx);
241 	mutex_exit(&mlcq->mlcq_mtx);
242 
243 	mutex_destroy(&mlwq->mlwq_mtx);
244 }
245 
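/*
 * Tear down a completion queue: destroy the hardware CQ, release its
 * DMA memory, return any outstanding buffers to the free pool, remove
 * the CQ from its event queue's AVL tree (taking the EQ mutex before
 * the CQ mutex), and finally free it.
 */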
246 void
247 mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
248 {
249 	mlxcx_event_queue_t *mleq;
250 	mlxcx_buffer_t *b;
251 
252 	/*
253 	 * If something is holding the lock on a long operation like polling
254 	 * which we're going to abort anyway, this flag asks them to exit
255 	 * early if possible.
256 	 */
257 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);
258 
259 	mutex_enter(&mlcq->mlcq_mtx);
260 
261 	list_remove(&mlxp->mlx_cqs, mlcq);
262 
263 	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
264 	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
265 		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
266 			mlxcx_warn(mlxp, "failed to destroy "
267 			    "completion queue num %u",
268 			    mlcq->mlcq_num);
269 		}
270 	}
271 	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
272 		mlxcx_cq_rele_dma(mlxp, mlcq);
273 	}
274 	/*
275 	 * If we're on an EQ AVL tree, then we need to grab
276 	 * the EQ's mutex to take it off. The ISR always takes
277 	 * EQ mutex before CQ mutex, so we have to let go of
278 	 * the CQ mutex then come back again.
279 	 *
280 	 * The ISR will bail out if it tries to touch this CQ now since
281 	 * we added the CQ_DESTROYED flag above.
282 	 */
283 	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
284 		mleq = mlcq->mlcq_eq;
285 	} else {
286 		mleq = NULL;
287 	}
288 
289 	/* Return any outstanding buffers to the free pool. */
290 	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
291 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
292 	}
293 	mutex_enter(&mlcq->mlcq_bufbmtx);
294 	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
295 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
296 	}
297 	mutex_exit(&mlcq->mlcq_bufbmtx);
298 
299 	/*
300 	 * Since the interrupt handlers take the EQ lock before the CQ one,
301 	 * we must do the same here. That means letting go of the lock
302 	 * for a brief window here (we'll double-check the state when we
303 	 * get back in).
304 	 */
305 	mutex_exit(&mlcq->mlcq_mtx);
306 
307 	if (mleq != NULL) {
308 		mutex_enter(&mleq->mleq_mtx);
309 		mutex_enter(&mlcq->mlcq_mtx);
310 		/*
311 		 * Double-check the state, since we let go of the
312 		 * mutex briefly.
313 		 */
314 		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
315 			avl_remove(&mleq->mleq_cqs, mlcq);
316 			mlcq->mlcq_state &= ~MLXCX_CQ_EQAVL;
317 		}
318 		mutex_exit(&mlcq->mlcq_mtx);
319 		mutex_exit(&mleq->mleq_mtx);
320 	}
321 
322 	mutex_enter(&mlcq->mlcq_mtx);
323 	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
324 	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
325 	mutex_exit(&mlcq->mlcq_mtx);
326 
327 	mutex_destroy(&mlcq->mlcq_mtx);
328 	mutex_destroy(&mlcq->mlcq_bufbmtx);
329 	list_destroy(&mlcq->mlcq_buffers);
330 	list_destroy(&mlcq->mlcq_buffers_b);
331 	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
332 }
333 
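/*
 * Allocate and create a completion queue on the given event queue,
 * then add it to the EQ's AVL tree of CQs and arm it.
 */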
334 static boolean_t
335 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
336     mlxcx_completion_queue_t **cqp, uint_t ent_shift)
337 {
338 	mlxcx_completion_queue_t *cq;
339 
340 	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
341 	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
342 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
343 	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
344 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
345 	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
346 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
347 	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
348 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
349 
350 	cq->mlcq_mlx = mlxp;
351 	list_insert_tail(&mlxp->mlx_cqs, cq);
352 
353 	mutex_enter(&cq->mlcq_mtx);
354 
355 	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
356 		mutex_exit(&cq->mlcq_mtx);
357 		return (B_FALSE);
358 	}
359 
360 	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
361 	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;
362 
363 	cq->mlcq_uar = &mlxp->mlx_uar;
364 	cq->mlcq_eq = eq;
365 
366 	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
367 	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;
368 
369 	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
370 		mutex_exit(&cq->mlcq_mtx);
371 		return (B_FALSE);
372 	}
373 
374 	mutex_exit(&cq->mlcq_mtx);
375 
376 	mutex_enter(&eq->mleq_mtx);
377 	mutex_enter(&cq->mlcq_mtx);
378 	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
379 	avl_add(&eq->mleq_cqs, cq);
380 	cq->mlcq_state |= MLXCX_CQ_EQAVL;
381 	mlxcx_arm_cq(mlxp, cq);
382 	mutex_exit(&cq->mlcq_mtx);
383 	mutex_exit(&eq->mleq_mtx);
384 
385 	*cqp = cq;
386 	return (B_TRUE);
387 }
388 
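/*
 * Set up a receive work queue on the given completion queue: allocate
 * its buffer shard and DMA memory, create the hardware RQ and point
 * the CQ back at it.
 */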
389 static boolean_t
390 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
391     mlxcx_work_queue_t *wq)
392 {
393 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
394 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
395 
396 	list_insert_tail(&mlxp->mlx_wqs, wq);
397 
398 	mutex_enter(&wq->mlwq_mtx);
399 
400 	wq->mlwq_mlx = mlxp;
401 	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
402 	wq->mlwq_cq = cq;
403 	wq->mlwq_pd = &mlxp->mlx_pd;
404 	wq->mlwq_uar = &mlxp->mlx_uar;
405 
406 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
407 
408 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
409 		mutex_exit(&wq->mlwq_mtx);
410 		return (B_FALSE);
411 	}
412 
413 	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
414 		mutex_exit(&wq->mlwq_mtx);
415 		return (B_FALSE);
416 	}
417 
418 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
419 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
420 
421 	mutex_exit(&wq->mlwq_mtx);
422 
423 	mutex_enter(&cq->mlcq_mtx);
424 	mutex_enter(&wq->mlwq_mtx);
425 	ASSERT3P(cq->mlcq_wq, ==, NULL);
426 	cq->mlcq_wq = wq;
427 	mutex_exit(&wq->mlwq_mtx);
428 	mutex_exit(&cq->mlcq_mtx);
429 
430 	return (B_TRUE);
431 }
432 
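/*
 * Set up a send work queue on the given completion queue and TIS:
 * allocate its buffer shards and DMA memory, create the hardware SQ
 * and point the CQ back at it.
 */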
433 static boolean_t
434 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
435     mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
436 {
437 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
438 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
439 
440 	list_insert_tail(&mlxp->mlx_wqs, wq);
441 
442 	mutex_enter(&wq->mlwq_mtx);
443 
444 	wq->mlwq_mlx = mlxp;
445 	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
446 	wq->mlwq_cq = cq;
447 	wq->mlwq_pd = &mlxp->mlx_pd;
448 	wq->mlwq_uar = &mlxp->mlx_uar;
449 	wq->mlwq_tis = tis;
450 
451 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
452 	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);
453 
454 	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
455 	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;
456 
457 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
458 		mutex_exit(&wq->mlwq_mtx);
459 		return (B_FALSE);
460 	}
461 
462 	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
463 		mutex_exit(&wq->mlwq_mtx);
464 		return (B_FALSE);
465 	}
466 
467 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
468 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
469 
470 	mutex_exit(&wq->mlwq_mtx);
471 
472 	mutex_enter(&cq->mlcq_mtx);
473 	mutex_enter(&wq->mlwq_mtx);
474 	ASSERT3P(cq->mlcq_wq, ==, NULL);
475 	cq->mlcq_wq = wq;
476 	mutex_exit(&wq->mlwq_mtx);
477 	mutex_exit(&cq->mlcq_mtx);
478 
479 	return (B_TRUE);
480 }
481 
482 /*
483  * Before we tear down the queues associated with the rx group,
484  * flag each cq as being torn down and wake up any tasks.
485  */
486 static void
487 mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
488 {
489 	mlxcx_work_queue_t *wq;
490 	mlxcx_completion_queue_t *cq;
491 	mlxcx_buf_shard_t *s;
492 	uint_t i;
493 
494 	mutex_enter(&g->mlg_mtx);
495 
496 	for (i = 0; i < g->mlg_nwqs; ++i) {
497 		wq = &g->mlg_wqs[i];
498 		cq = wq->mlwq_cq;
499 		if (cq != NULL) {
500 			s = wq->mlwq_bufs;
501 			mutex_enter(&s->mlbs_mtx);
502 			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
503 			cv_broadcast(&s->mlbs_free_nonempty);
504 			mutex_exit(&s->mlbs_mtx);
505 		}
506 	}
507 
508 	mutex_exit(&g->mlg_mtx);
509 }
510 
511 void
512 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
513 {
514 	mlxcx_work_queue_t *wq;
515 	mlxcx_completion_queue_t *cq;
516 	mlxcx_flow_entry_t *fe;
517 	mlxcx_flow_group_t *fg;
518 	mlxcx_flow_table_t *ft;
519 	uint_t i;
520 
521 	mutex_enter(&g->mlg_port->mlp_mtx);
522 	mutex_enter(&g->mlg_mtx);
523 
524 	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
525 		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);
526 
527 		if (g->mlg_rx_vlan_ft != NULL)
528 			mlxcx_remove_all_vlan_entries(mlxp, g);
529 
530 		if (g == &mlxp->mlx_rx_groups[0]) {
531 			ft = g->mlg_port->mlp_rx_flow;
532 			mutex_enter(&ft->mlft_mtx);
533 
534 			fg = g->mlg_port->mlp_bcast;
535 			fe = list_head(&fg->mlfg_entries);
536 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
537 				(void) mlxcx_cmd_delete_flow_table_entry(
538 				    mlxp, fe);
539 			}
540 
541 			fg = g->mlg_port->mlp_promisc;
542 			fe = list_head(&fg->mlfg_entries);
543 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
544 				(void) mlxcx_cmd_delete_flow_table_entry(
545 				    mlxp, fe);
546 			}
547 
548 			mutex_exit(&ft->mlft_mtx);
549 		}
550 
551 		if (g->mlg_rx_vlan_ft != NULL) {
552 			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
553 			ASSERT(list_is_empty(&g->mlg_rx_vlans));
554 			fg = g->mlg_rx_vlan_def_fg;
555 			fe = list_head(&fg->mlfg_entries);
556 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
557 				(void) mlxcx_cmd_delete_flow_table_entry(
558 				    mlxp, fe);
559 			}
560 			fg = g->mlg_rx_vlan_promisc_fg;
561 			fe = list_head(&fg->mlfg_entries);
562 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
563 				(void) mlxcx_cmd_delete_flow_table_entry(
564 				    mlxp, fe);
565 			}
566 			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
567 			list_destroy(&g->mlg_rx_vlans);
568 
569 			g->mlg_rx_vlan_ft = NULL;
570 		}
571 
572 		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
573 		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
574 		g->mlg_rx_hash_ft = NULL;
575 
576 		avl_destroy(&g->mlg_rx_macs);
577 		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
578 	}
579 
580 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
581 		for (i = 0; i < g->mlg_nwqs; ++i) {
582 			wq = &g->mlg_wqs[i];
583 			mutex_enter(&wq->mlwq_mtx);
584 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
585 			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
586 				mlxcx_warn(mlxp, "failed to stop rq %x",
587 				    wq->mlwq_num);
588 			}
589 			mutex_exit(&wq->mlwq_mtx);
590 		}
591 		taskq_destroy(g->mlg_refill_tq);
592 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
593 	}
594 
595 	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
596 		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
597 			mlxcx_tir_t *tir = &g->mlg_tir[i];
598 			if (tir->mltir_state & MLXCX_TIR_CREATED &&
599 			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
600 				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
601 					mlxcx_warn(mlxp,
602 					    "failed to destroy tir %u "
603 					    "for rx ring", tir->mltir_num);
604 				}
605 			}
606 		}
607 		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
608 	}
609 
610 	if (g->mlg_state & MLXCX_GROUP_RQT) {
611 		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
612 		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
613 			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
614 				mlxcx_warn(mlxp, "failed to destroy rqt %u "
615 				    "for rx ring", g->mlg_rqt->mlrqt_num);
616 			}
617 			kmem_free(g->mlg_rqt->mlrqt_rq,
618 			    g->mlg_rqt->mlrqt_rq_size);
619 			g->mlg_rqt->mlrqt_rq = NULL;
620 			kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
621 			g->mlg_rqt = NULL;
622 		}
623 		g->mlg_state &= ~MLXCX_GROUP_RQT;
624 	}
625 
626 	for (i = 0; i < g->mlg_nwqs; ++i) {
627 		wq = &g->mlg_wqs[i];
628 		cq = wq->mlwq_cq;
629 		mlxcx_wq_teardown(mlxp, wq);
630 		if (cq != NULL)
631 			mlxcx_cq_teardown(mlxp, cq);
632 	}
633 	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
634 	g->mlg_wqs = NULL;
635 	g->mlg_state &= ~MLXCX_GROUP_WQS;
636 
637 	mutex_exit(&g->mlg_mtx);
638 	mutex_exit(&g->mlg_port->mlp_mtx);
639 
640 	mutex_destroy(&g->mlg_mtx);
641 
642 	g->mlg_state &= ~MLXCX_GROUP_INIT;
643 	ASSERT3S(g->mlg_state, ==, 0);
644 }
645 
646 void
647 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
648 {
649 	mlxcx_work_queue_t *wq;
650 	mlxcx_completion_queue_t *cq;
651 	uint_t i;
652 
653 	mutex_enter(&g->mlg_mtx);
654 
655 	if (g->mlg_state & MLXCX_GROUP_WQS) {
656 		for (i = 0; i < g->mlg_nwqs; ++i) {
657 			wq = &g->mlg_wqs[i];
658 			mutex_enter(&wq->mlwq_mtx);
659 			cq = wq->mlwq_cq;
660 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
661 			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
662 				mlxcx_warn(mlxp, "failed to stop sq %x",
663 				    wq->mlwq_num);
664 			}
665 			mutex_exit(&wq->mlwq_mtx);
666 			mlxcx_wq_teardown(mlxp, wq);
667 			if (cq != NULL)
668 				mlxcx_cq_teardown(mlxp, cq);
669 		}
670 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
671 		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
672 		g->mlg_wqs = NULL;
673 		g->mlg_state &= ~MLXCX_GROUP_WQS;
674 	}
675 
676 	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
677 	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
678 	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
679 		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
680 			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
681 			    g->mlg_tis.mltis_num);
682 		}
683 	}
684 	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
685 
686 	mutex_exit(&g->mlg_mtx);
687 	mutex_destroy(&g->mlg_mtx);
688 	g->mlg_state &= ~MLXCX_GROUP_INIT;
689 	ASSERT3S(g->mlg_state, ==, 0);
690 }
691 
692 void
693 mlxcx_teardown_groups(mlxcx_t *mlxp)
694 {
695 	mlxcx_ring_group_t *g;
696 	uint_t i;
697 
698 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
699 		g = &mlxp->mlx_rx_groups[i];
700 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
701 			continue;
702 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
703 		mlxcx_quiesce_rx_cqs(mlxp, g);
704 	}
705 
706 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
707 		g = &mlxp->mlx_rx_groups[i];
708 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
709 			continue;
710 		mlxcx_teardown_rx_group(mlxp, g);
711 	}
712 
713 	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
714 	mlxp->mlx_rx_groups = NULL;
715 
716 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
717 		g = &mlxp->mlx_tx_groups[i];
718 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
719 			continue;
720 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
721 		mlxcx_teardown_tx_group(mlxp, g);
722 	}
723 
724 	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
725 	mlxp->mlx_tx_groups = NULL;
726 }
727 
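/*
 * Set up an rx ring group: create a CQ/RQ pair for each ring, gather
 * the RQs into an RQ table (RQT), create the group's TIRs (a direct
 * TIR for "other" traffic and indirect, Toeplitz-hashed TIRs for the
 * TCP/UDP/IP roles), then build the RSS hash breakout flow table and
 * the per-group VLAN filtering flow table.
 */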
728 boolean_t
729 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
730 {
731 	mlxcx_event_queue_t *eq;
732 	mlxcx_completion_queue_t *cq;
733 	mlxcx_work_queue_t *rq;
734 	mlxcx_flow_table_t *ft;
735 	mlxcx_flow_group_t *fg;
736 	mlxcx_flow_entry_t *fe;
737 	uint_t ent_shift;
738 	uint_t i, j;
739 
740 	ASSERT3S(g->mlg_state, ==, 0);
741 
742 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
743 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
744 	mutex_enter(&g->mlg_mtx);
745 	g->mlg_mlx = mlxp;
746 	g->mlg_type = MLXCX_GROUP_RX;
747 	g->mlg_port = &mlxp->mlx_ports[0];
748 	g->mlg_state |= MLXCX_GROUP_INIT;
749 
750 	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
751 	i = g - &mlxp->mlx_rx_groups[0];
752 	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
753 		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;
754 
755 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
756 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
757 	g->mlg_state |= MLXCX_GROUP_WQS;
758 
759 	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
760 	g->mlg_rqt->mlrqt_max = 2;
761 	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
762 		g->mlg_rqt->mlrqt_max <<= 1;
763 	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
764 	    sizeof (mlxcx_work_queue_t *);
765 	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
766 	g->mlg_state |= MLXCX_GROUP_RQT;
767 
768 	for (i = 0; i < g->mlg_nwqs; ++i) {
769 		eq = NULL;
770 		while (eq == NULL) {
771 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
772 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
773 				mlxp->mlx_next_eq = 1;
774 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
775 			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
776 				/* Try the next one */
777 				eq = NULL;
778 			}
779 		}
780 
781 		/*
782 		 * A single completion is indicated for each rq entry as
783 		 * it is used. So, the number of cq entries never needs
784 		 * to be larger than the rq.
785 		 */
786 		ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
787 		    mlxp->mlx_props.mldp_rq_size_shift);
788 		if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
789 			g->mlg_nwqs = i;
790 			break;
791 		}
792 
793 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
794 
795 		rq = &g->mlg_wqs[i];
796 		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
797 			g->mlg_nwqs = i;
798 			break;
799 		}
800 		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
801 		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
802 		rq->mlwq_group = g;
803 	}
804 	if (g->mlg_nwqs == 0) {
805 		mutex_exit(&g->mlg_mtx);
806 		return (B_FALSE);
807 	}
808 
809 	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
810 		mutex_exit(&g->mlg_mtx);
811 		return (B_FALSE);
812 	}
813 
814 	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
815 		mlxcx_tir_t *tir = &g->mlg_tir[i];
816 		tir->mltir_tdom = &mlxp->mlx_tdom;
817 		switch (i) {
818 		case MLXCX_TIR_ROLE_OTHER:
819 			tir->mltir_type = MLXCX_TIR_DIRECT;
820 			tir->mltir_rq = &g->mlg_wqs[0];
821 			break;
822 		case MLXCX_TIR_ROLE_IPv4:
823 		case MLXCX_TIR_ROLE_IPv6:
824 		case MLXCX_TIR_ROLE_TCPv4:
825 		case MLXCX_TIR_ROLE_TCPv6:
826 		case MLXCX_TIR_ROLE_UDPv4:
827 		case MLXCX_TIR_ROLE_UDPv6:
828 			tir->mltir_type = MLXCX_TIR_INDIRECT;
829 			tir->mltir_rqtable = g->mlg_rqt;
830 			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
831 			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
832 			    sizeof (tir->mltir_toeplitz_key));
833 			break;
834 		}
835 		switch (i) {
836 		case MLXCX_TIR_ROLE_OTHER:
837 			break;
838 		case MLXCX_TIR_ROLE_IPv4:
839 		case MLXCX_TIR_ROLE_TCPv4:
840 		case MLXCX_TIR_ROLE_UDPv4:
841 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
842 			tir->mltir_hash_fields =
843 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
844 			break;
845 		case MLXCX_TIR_ROLE_IPv6:
846 		case MLXCX_TIR_ROLE_TCPv6:
847 		case MLXCX_TIR_ROLE_UDPv6:
848 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
849 			tir->mltir_hash_fields =
850 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
851 			break;
852 		}
853 		switch (i) {
854 		case MLXCX_TIR_ROLE_OTHER:
855 		case MLXCX_TIR_ROLE_IPv4:
856 		case MLXCX_TIR_ROLE_IPv6:
857 			break;
858 		case MLXCX_TIR_ROLE_TCPv4:
859 		case MLXCX_TIR_ROLE_TCPv6:
860 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
861 			tir->mltir_hash_fields |=
862 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
863 			break;
864 		case MLXCX_TIR_ROLE_UDPv4:
865 		case MLXCX_TIR_ROLE_UDPv6:
866 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
867 			tir->mltir_hash_fields |=
868 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
869 			break;
870 		}
871 
872 		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
873 			mutex_exit(&g->mlg_mtx);
874 			return (B_FALSE);
875 		}
876 
877 		g->mlg_state |= MLXCX_GROUP_TIRTIS;
878 	}
879 
880 	/*
881 	 * Flow table: our RX hashing breakout table for RSS
882 	 */
883 
884 	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
885 	    KM_SLEEP));
886 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
887 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
888 	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
889 	    sizeof (mlxcx_group_mac_t),
890 	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
891 	g->mlg_state |= MLXCX_GROUP_FLOWS;
892 
893 	mutex_enter(&ft->mlft_mtx);
894 
895 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
896 	ft->mlft_level = 2;
897 	ft->mlft_port = g->mlg_port;
898 	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
899 	ft->mlft_nents = (1 << ft->mlft_entshift);
900 	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
901 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
902 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
903 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
904 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
905 
906 	for (j = 0; j < ft->mlft_nents; ++j) {
907 		ft->mlft_ent[j].mlfe_table = ft;
908 		ft->mlft_ent[j].mlfe_index = j;
909 	}
910 
911 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
912 		mutex_exit(&ft->mlft_mtx);
913 		mutex_exit(&g->mlg_mtx);
914 		return (B_FALSE);
915 	}
916 
917 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
918 	list_insert_tail(&ft->mlft_groups, fg);
919 	fg->mlfg_table = ft;
920 	fg->mlfg_size = 1;
921 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
922 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
923 		mutex_exit(&ft->mlft_mtx);
924 		mutex_exit(&g->mlg_mtx);
925 		return (B_FALSE);
926 	}
927 	fe = list_head(&fg->mlfg_entries);
928 	fe->mlfe_ip_version = 6;
929 	fe->mlfe_ip_proto = IPPROTO_UDP;
930 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
931 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
932 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
933 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
934 		mutex_exit(&ft->mlft_mtx);
935 		mutex_exit(&g->mlg_mtx);
936 		return (B_FALSE);
937 	}
938 
939 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
940 	list_insert_tail(&ft->mlft_groups, fg);
941 	fg->mlfg_table = ft;
942 	fg->mlfg_size = 1;
943 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
944 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
945 		mutex_exit(&ft->mlft_mtx);
946 		mutex_exit(&g->mlg_mtx);
947 		return (B_FALSE);
948 	}
949 	fe = list_head(&fg->mlfg_entries);
950 	fe->mlfe_ip_version = 4;
951 	fe->mlfe_ip_proto = IPPROTO_UDP;
952 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
953 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
954 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
955 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
956 		mutex_exit(&ft->mlft_mtx);
957 		mutex_exit(&g->mlg_mtx);
958 		return (B_FALSE);
959 	}
960 
961 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
962 	list_insert_tail(&ft->mlft_groups, fg);
963 	fg->mlfg_table = ft;
964 	fg->mlfg_size = 1;
965 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
966 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
967 		mutex_exit(&ft->mlft_mtx);
968 		mutex_exit(&g->mlg_mtx);
969 		return (B_FALSE);
970 	}
971 	fe = list_head(&fg->mlfg_entries);
972 	fe->mlfe_ip_version = 6;
973 	fe->mlfe_ip_proto = IPPROTO_TCP;
974 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
975 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
976 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
977 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
978 		mutex_exit(&ft->mlft_mtx);
979 		mutex_exit(&g->mlg_mtx);
980 		return (B_FALSE);
981 	}
982 
983 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
984 	list_insert_tail(&ft->mlft_groups, fg);
985 	fg->mlfg_table = ft;
986 	fg->mlfg_size = 1;
987 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
988 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
989 		mutex_exit(&ft->mlft_mtx);
990 		mutex_exit(&g->mlg_mtx);
991 		return (B_FALSE);
992 	}
993 	fe = list_head(&fg->mlfg_entries);
994 	fe->mlfe_ip_version = 4;
995 	fe->mlfe_ip_proto = IPPROTO_TCP;
996 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
997 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
998 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
999 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1000 		mutex_exit(&ft->mlft_mtx);
1001 		mutex_exit(&g->mlg_mtx);
1002 		return (B_FALSE);
1003 	}
1004 
1005 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1006 	list_insert_tail(&ft->mlft_groups, fg);
1007 	fg->mlfg_table = ft;
1008 	fg->mlfg_size = 1;
1009 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1010 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1011 		mutex_exit(&ft->mlft_mtx);
1012 		mutex_exit(&g->mlg_mtx);
1013 		return (B_FALSE);
1014 	}
1015 	fe = list_head(&fg->mlfg_entries);
1016 	fe->mlfe_ip_version = 6;
1017 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1018 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1019 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
1020 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1021 		mutex_exit(&ft->mlft_mtx);
1022 		mutex_exit(&g->mlg_mtx);
1023 		return (B_FALSE);
1024 	}
1025 
1026 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1027 	list_insert_tail(&ft->mlft_groups, fg);
1028 	fg->mlfg_table = ft;
1029 	fg->mlfg_size = 1;
1030 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1031 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1032 		mutex_exit(&ft->mlft_mtx);
1033 		mutex_exit(&g->mlg_mtx);
1034 		return (B_FALSE);
1035 	}
1036 	fe = list_head(&fg->mlfg_entries);
1037 	fe->mlfe_ip_version = 4;
1038 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1039 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1040 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
1041 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1042 		mutex_exit(&ft->mlft_mtx);
1043 		mutex_exit(&g->mlg_mtx);
1044 		return (B_FALSE);
1045 	}
1046 
1047 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1048 	list_insert_tail(&ft->mlft_groups, fg);
1049 	fg->mlfg_table = ft;
1050 	fg->mlfg_size = 1;
1051 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1052 		mutex_exit(&ft->mlft_mtx);
1053 		mutex_exit(&g->mlg_mtx);
1054 		return (B_FALSE);
1055 	}
1056 	fe = list_head(&fg->mlfg_entries);
1057 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1058 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1059 	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
1060 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1061 		mutex_exit(&ft->mlft_mtx);
1062 		mutex_exit(&g->mlg_mtx);
1063 		return (B_FALSE);
1064 	}
1065 
1066 	mutex_exit(&ft->mlft_mtx);
1067 
1068 	/*
1069 	 * Flow table: the VLAN breakout table for doing VLAN filtering after
1070 	 * we've matched a MAC address.
1071 	 */
1072 
1073 	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1074 	    KM_SLEEP));
1075 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1076 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1077 	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
1078 	    offsetof(mlxcx_group_vlan_t, mlgv_entry));
1079 
1080 	mutex_enter(&ft->mlft_mtx);
1081 
1082 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1083 	ft->mlft_level = 1;
1084 	ft->mlft_port = g->mlg_port;
1085 	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
1086 	ft->mlft_nents = (1 << ft->mlft_entshift);
1087 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1088 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1089 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1090 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
1091 
1092 	for (j = 0; j < ft->mlft_nents; ++j) {
1093 		fe = &ft->mlft_ent[j];
1094 		fe->mlfe_table = ft;
1095 		fe->mlfe_index = j;
1096 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1097 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1098 	}
1099 
1100 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1101 		mutex_exit(&ft->mlft_mtx);
1102 		mutex_exit(&g->mlg_mtx);
1103 		return (B_FALSE);
1104 	}
1105 
1106 	/* First group is all actual matched VLANs */
1107 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1108 	g->mlg_rx_vlan_fg = fg;
1109 	list_insert_tail(&ft->mlft_groups, fg);
1110 	fg->mlfg_table = ft;
1111 	fg->mlfg_size = ft->mlft_nents - 2;
1112 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
1113 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
1114 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1115 		mutex_exit(&ft->mlft_mtx);
1116 		mutex_exit(&g->mlg_mtx);
1117 		return (B_FALSE);
1118 	}
1119 
1120 	/*
1121 	 * Then the "default" entry which we enable when we have no VLAN IDs
1122 	 * added to the group (we start with this enabled).
1123 	 */
1124 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1125 	g->mlg_rx_vlan_def_fg = fg;
1126 	list_insert_tail(&ft->mlft_groups, fg);
1127 	fg->mlfg_table = ft;
1128 	fg->mlfg_size = 1;
1129 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1130 		mutex_exit(&ft->mlft_mtx);
1131 		mutex_exit(&g->mlg_mtx);
1132 		return (B_FALSE);
1133 	}
1134 	fe = list_head(&fg->mlfg_entries);
1135 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1136 		mutex_exit(&ft->mlft_mtx);
1137 		mutex_exit(&g->mlg_mtx);
1138 		return (B_FALSE);
1139 	}
1140 
1141 	/*
1142 	 * Finally, the promisc entry which points at the *hash ft* from the
1143 	 * default group. We only enable this when we have promisc on.
1144 	 */
1145 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1146 	g->mlg_rx_vlan_promisc_fg = fg;
1147 	list_insert_tail(&ft->mlft_groups, fg);
1148 	fg->mlfg_table = ft;
1149 	fg->mlfg_size = 1;
1150 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1151 		mutex_exit(&ft->mlft_mtx);
1152 		mutex_exit(&g->mlg_mtx);
1153 		return (B_FALSE);
1154 	}
1155 	fe = list_head(&fg->mlfg_entries);
1156 	fe->mlfe_ndest = 1;
1157 	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;
1158 
1159 	mutex_exit(&ft->mlft_mtx);
1160 
1161 	mutex_exit(&g->mlg_mtx);
1162 
1163 	return (B_TRUE);
1164 }
1165 
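/*
 * Start an individual receive ring: start the group if MAC has not
 * already done so, start the hardware RQ, then create and post the
 * initial set of receive buffers.
 */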
1166 boolean_t
1167 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1168     mlxcx_work_queue_t *rq)
1169 {
1170 	uint_t j;
1171 	mlxcx_buffer_t *b;
1172 	mlxcx_completion_queue_t *cq;
1173 
1174 	mutex_enter(&g->mlg_mtx);
1175 	/*
1176 	 * Sadly, even though MAC has the mgi_start callback, it is not always
1177 	 * called -- in particular when we are being managed under an aggr, the
1178 	 * mgi_start callback will only ever be called on the default group.
1179 	 *
1180 	 * So instead of asserting about the group state here, we have to
1181 	 * check it and call group start if needed.
1182 	 */
1183 	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
1184 		mutex_exit(&g->mlg_mtx);
1185 		if (!mlxcx_rx_group_start(mlxp, g))
1186 			return (B_FALSE);
1187 		mutex_enter(&g->mlg_mtx);
1188 	}
1189 	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);
1190 
1191 	cq = rq->mlwq_cq;
1192 	ASSERT(cq != NULL);
1193 
1194 	mutex_enter(&cq->mlcq_mtx);
1195 	mutex_enter(&rq->mlwq_mtx);
1196 
1197 	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1198 		mutex_exit(&rq->mlwq_mtx);
1199 		mutex_exit(&cq->mlcq_mtx);
1200 		mutex_exit(&g->mlg_mtx);
1201 		return (B_TRUE);
1202 	}
1203 
1204 	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
1205 		mutex_exit(&rq->mlwq_mtx);
1206 		mutex_exit(&cq->mlcq_mtx);
1207 		mutex_exit(&g->mlg_mtx);
1208 		return (B_FALSE);
1209 	}
1210 	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);
1211 
1212 	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
1213 	rq->mlwq_state |= MLXCX_WQ_BUFFERS;
1214 
1215 	for (j = 0; j < rq->mlwq_nents; ++j) {
1216 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1217 			break;
1218 		mlxcx_buf_return(mlxp, b);
1219 	}
1220 	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
1221 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1222 			break;
1223 		mlxcx_buf_return(mlxp, b);
1224 	}
1225 
1226 	mlxcx_rq_refill(mlxp, rq);
1227 
1228 	mutex_exit(&rq->mlwq_mtx);
1229 	mutex_exit(&cq->mlcq_mtx);
1230 	mutex_exit(&g->mlg_mtx);
1231 
1232 	return (B_TRUE);
1233 }
1234 
1235 boolean_t
1236 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1237 {
1238 	mlxcx_flow_table_t *ft;
1239 	mlxcx_flow_group_t *fg;
1240 	mlxcx_flow_entry_t *fe;
1241 	char tq_name[TASKQ_NAMELEN];
1242 
1243 	mutex_enter(&g->mlg_mtx);
1244 
1245 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
1246 		mutex_exit(&g->mlg_mtx);
1247 		return (B_TRUE);
1248 	}
1249 
1250 	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);
1251 
1252 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1253 
1254 	(void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
1255 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
1256 	    g - &mlxp->mlx_rx_groups[0]);
1257 
1258 	/*
1259 	 * Create one refill taskq per group with one thread per work queue.
1260 	 * The refill task may block waiting for resources, so by effectively
1261 	 * having one thread per work queue we avoid work queues blocking each
1262 	 * other.
1263 	 */
1264 	if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
1265 	    g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
1266 		mlxcx_warn(mlxp, "failed to create rq refill task queue");
1267 		mutex_exit(&g->mlg_mtx);
1268 		return (B_FALSE);
1269 	}
1270 
1271 	if (g == &mlxp->mlx_rx_groups[0]) {
1272 		ft = g->mlg_port->mlp_rx_flow;
1273 		mutex_enter(&ft->mlft_mtx);
1274 
1275 		/*
1276 		 * Broadcast and promisc entries go directly to group 0's
1277 		 * RSS hash fanout flow table. They bypass VLAN filtering.
1278 		 */
1279 		fg = g->mlg_port->mlp_bcast;
1280 		fe = list_head(&fg->mlfg_entries);
1281 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1282 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1283 			mutex_exit(&ft->mlft_mtx);
1284 			g->mlg_state &= ~MLXCX_GROUP_RUNNING;
1285 			taskq_destroy(g->mlg_refill_tq);
1286 			mutex_exit(&g->mlg_mtx);
1287 			return (B_FALSE);
1288 		}
1289 
1290 		fg = g->mlg_port->mlp_promisc;
1291 		fe = list_head(&fg->mlfg_entries);
1292 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1293 		/*
1294 		 * Don't actually set the promisc entry until promisc is
1295 		 * enabled.
1296 		 */
1297 
1298 		mutex_exit(&ft->mlft_mtx);
1299 	}
1300 
1301 	mutex_exit(&g->mlg_mtx);
1302 
1303 	return (B_TRUE);
1304 }
1305 
1306 boolean_t
1307 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1308 {
1309 	mlxcx_event_queue_t *eq;
1310 	mlxcx_completion_queue_t *cq;
1311 	mlxcx_work_queue_t *sq;
1312 	uint_t i;
1313 
1314 	ASSERT3S(g->mlg_state, ==, 0);
1315 
1316 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
1317 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1318 	g->mlg_state |= MLXCX_GROUP_INIT;
1319 	mutex_enter(&g->mlg_mtx);
1320 
1321 	g->mlg_mlx = mlxp;
1322 	g->mlg_type = MLXCX_GROUP_TX;
1323 	g->mlg_port = &mlxp->mlx_ports[0];
1324 
1325 	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
1326 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
1327 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
1328 	g->mlg_state |= MLXCX_GROUP_WQS;
1329 
1330 	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;
1331 
1332 	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
1333 		mutex_exit(&g->mlg_mtx);
1334 		return (B_FALSE);
1335 	}
1336 
1337 	g->mlg_state |= MLXCX_GROUP_TIRTIS;
1338 
1339 	for (i = 0; i < g->mlg_nwqs; ++i) {
1340 		eq = NULL;
1341 		while (eq == NULL) {
1342 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
1343 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
1344 				mlxp->mlx_next_eq = 1;
1345 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
1346 			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
1347 				/* Try the next one */
1348 				eq = NULL;
1349 			}
1350 		}
1351 
1352 		if (!mlxcx_cq_setup(mlxp, eq, &cq,
1353 		    mlxp->mlx_props.mldp_cq_size_shift)) {
			mutex_exit(&g->mlg_mtx);
1354 			return (B_FALSE);
		}
1355 
1356 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
1357 
1358 		sq = &g->mlg_wqs[i];
1359 		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
1360 			mutex_exit(&g->mlg_mtx);
1361 			return (B_FALSE);
1362 		}
1363 		sq->mlwq_group = g;
1364 	}
1365 
1366 	mutex_exit(&g->mlg_mtx);
1367 
1368 	return (B_TRUE);
1369 }
1370 
1371 boolean_t
1372 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1373     mlxcx_work_queue_t *sq)
1374 {
1375 	uint_t i;
1376 	mlxcx_buffer_t *b;
1377 	mlxcx_completion_queue_t *cq;
1378 
1379 	mutex_enter(&g->mlg_mtx);
1380 
1381 	cq = sq->mlwq_cq;
1382 	ASSERT(cq != NULL);
1383 
1384 	mutex_enter(&cq->mlcq_mtx);
1385 	mutex_enter(&sq->mlwq_mtx);
1386 	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1387 		mutex_exit(&sq->mlwq_mtx);
1388 		mutex_exit(&cq->mlcq_mtx);
1389 		mutex_exit(&g->mlg_mtx);
1390 		return (B_TRUE);
1391 	}
1392 
1393 	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
1394 	for (i = 0; i < sq->mlwq_nents; ++i) {
1395 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1396 			break;
1397 		mlxcx_buf_return(mlxp, b);
1398 	}
1399 	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
1400 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1401 			break;
1402 		mlxcx_buf_return(mlxp, b);
1403 	}
1404 	for (i = 0; i < sq->mlwq_nents; ++i) {
1405 		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
1406 			break;
1407 		mlxcx_buf_return(mlxp, b);
1408 	}
1409 	sq->mlwq_state |= MLXCX_WQ_BUFFERS;
1410 
1411 	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
1412 		mutex_exit(&sq->mlwq_mtx);
1413 		mutex_exit(&cq->mlcq_mtx);
1414 		mutex_exit(&g->mlg_mtx);
1415 		return (B_FALSE);
1416 	}
1417 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1418 
1419 	(void) mlxcx_sq_add_nop(mlxp, sq);
1420 
1421 	mutex_exit(&sq->mlwq_mtx);
1422 	mutex_exit(&cq->mlcq_mtx);
1423 	mutex_exit(&g->mlg_mtx);
1424 
1425 	return (B_TRUE);
1426 }
1427 
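/*
 * Ring the doorbell for a send queue: update the queue's doorbell
 * record with the current producer counter, then write the leading
 * qword of the first new WQE to the UAR doorbell register chosen by
 * the EQ's interrupt index. FM errors on the DMA sync or the register
 * write are retried up to mlxcx_doorbell_tries before the service is
 * declared lost.
 */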
1428 static boolean_t
1429 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
1430 {
1431 	uint_t idx;
1432 	mlxcx_bf_t *bf;
1433 	ddi_fm_error_t err;
1434 	uint_t try = 0;
1435 
1436 	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
1437 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1438 
1439 	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);
1440 
1441 	ASSERT(mlwq->mlwq_cq != NULL);
1442 	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
1443 	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
1444 	bf = &mlwq->mlwq_uar->mlu_bf[idx];
1445 
1446 retry:
1447 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1448 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1449 	    DDI_FME_VERSION);
1450 	if (err.fme_status != DDI_FM_OK) {
1451 		if (try++ < mlxcx_doorbell_tries) {
1452 			ddi_fm_dma_err_clear(
1453 			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
1454 			    DDI_FME_VERSION);
1455 			goto retry;
1456 		} else {
1457 			goto err;
1458 		}
1459 	}
1460 
1461 	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
1462 	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
1463 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
1464 	    DDI_FME_VERSION);
1465 	if (err.fme_status == DDI_FM_OK)
1466 		return (B_TRUE);
1467 	if (try++ < mlxcx_doorbell_tries) {
1468 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
1469 		goto retry;
1470 	}
1471 
1472 err:
1473 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1474 	return (B_FALSE);
1475 }
1476 
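/*
 * Post a single NOP WQE to a send queue and ring its doorbell. This is
 * done once when a send queue is first started (see
 * mlxcx_tx_ring_start()).
 */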
1477 boolean_t
1478 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1479 {
1480 	uint_t index, start_pc;
1481 	mlxcx_sendq_ent_t *ent0;
1482 	ddi_fm_error_t err;
1483 
1484 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1485 
1486 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1487 	ent0 = &mlwq->mlwq_send_ent[index];
1488 	start_pc = mlwq->mlwq_pc;
1489 	++mlwq->mlwq_pc;
1490 	/*
1491 	 * This counter is manipulated in the interrupt handler, which
1492 	 * does not hold the mlwq_mtx, hence the atomic.
1493 	 */
1494 	atomic_inc_64(&mlwq->mlwq_wqebb_used);
1495 
1496 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1497 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
1498 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1499 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);
1500 
1501 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1502 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
1503 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1504 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1505 
1506 	ent0->mlsqe_control.mlcs_ds = 1;
1507 
1508 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1509 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1510 	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1511 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1512 	    DDI_FME_VERSION);
1513 	if (err.fme_status != DDI_FM_OK) {
1514 		return (B_FALSE);
1515 	}
1516 	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
1517 		return (B_FALSE);
1518 	}
1519 	return (B_TRUE);
1520 }
1521 
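/*
 * Post a SEND WQE for the given buffer chain: fill in the control and
 * eth segments (inline headers and checksum offload flags), then walk
 * the chain's DMA cookies writing scatter pointers, spilling into
 * extra WQEBBs as needed. The buffer is put on the CQ's buffer list
 * before the doorbell is rung so that a completion can always find it.
 */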
1522 boolean_t
1523 mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1524     uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
1525     mlxcx_buffer_t *b0)
1526 {
1527 	uint_t index, first, ents;
1528 	mlxcx_completion_queue_t *cq;
1529 	mlxcx_sendq_ent_t *ent0;
1530 	mlxcx_sendq_extra_ent_t *ent;
1531 	mlxcx_wqe_data_seg_t *seg;
1532 	uint_t ptri, nptr;
1533 	const ddi_dma_cookie_t *c;
1534 	size_t rem;
1535 	uint64_t wqebb_used;
1536 	mlxcx_buffer_t *b;
1537 	ddi_fm_error_t err;
1538 	boolean_t rv;
1539 
1540 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1541 	ASSERT3P(b0->mlb_tx_head, ==, b0);
1542 	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1543 	cq = mlwq->mlwq_cq;
1544 
1545 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1546 	ent0 = &mlwq->mlwq_send_ent[index];
1547 	b0->mlb_wqe_index = mlwq->mlwq_pc;
1548 	ents = 1;
1549 
1550 	first = index;
1551 
1552 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1553 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
1554 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1555 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);
1556 
1557 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1558 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
1559 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1560 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1561 
1562 	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
1563 	set_bits16(&ent0->mlsqe_eth.mles_szflags,
1564 	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
1565 	if (inlinelen > 0) {
1566 		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
1567 		    inlinelen);
1568 	}
1569 
1570 	ent0->mlsqe_control.mlcs_ds =
1571 	    offsetof(mlxcx_sendq_ent_t, mlsqe_data) / 16;
1572 
1573 	if (chkflags & HCK_IPV4_HDRCKSUM) {
1574 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1575 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1576 		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
1577 	}
1578 	if (chkflags & HCK_FULLCKSUM) {
1579 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1580 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1581 		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
1582 	}
1583 
1584 	/*
1585 	 * mlwq_wqebb_used is only incremented whilst holding
1586 	 * the mlwq_mtx mutex, but it is decremented (atomically) in
1587 	 * the interrupt context *not* under mlwq_mtx mutex.
1588 	 * So, now take a snapshot of the number of used wqes which will
1589 	 * be a consistent maximum we can use whilst iterating through
1590 	 * the buffers and DMA cookies.
1591 	 */
1592 	wqebb_used = mlwq->mlwq_wqebb_used;
1593 
1594 	b = b0;
1595 	ptri = 0;
1596 	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
1597 	seg = ent0->mlsqe_data;
1598 	while (b != NULL) {
1599 		rem = b->mlb_used;
1600 
1601 		c = NULL;
1602 		while (rem > 0 &&
1603 		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
1604 			if (ptri >= nptr) {
1605 				if ((ents + wqebb_used) >= mlwq->mlwq_nents)
1606 					return (B_FALSE);
1607 
1608 				index = (mlwq->mlwq_pc + ents) &
1609 				    (mlwq->mlwq_nents - 1);
1610 				ent = &mlwq->mlwq_send_extra_ent[index];
1611 				++ents;
1612 
1613 				seg = ent->mlsqe_data;
1614 				ptri = 0;
1615 				nptr = sizeof (ent->mlsqe_data) /
1616 				    sizeof (mlxcx_wqe_data_seg_t);
1617 			}
1618 
1619 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1620 			if (c->dmac_size > rem) {
1621 				seg->mlds_byte_count = to_be32(rem);
1622 				rem = 0;
1623 			} else {
1624 				seg->mlds_byte_count = to_be32(c->dmac_size);
1625 				rem -= c->dmac_size;
1626 			}
1627 			seg->mlds_address = to_be64(c->dmac_laddress);
1628 			++seg;
1629 			++ptri;
1630 			++ent0->mlsqe_control.mlcs_ds;
1631 
1632 			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
1633 			    MLXCX_SQE_MAX_DS);
1634 		}
1635 
1636 		if (b == b0) {
1637 			b = list_head(&b0->mlb_tx_chain);
1638 		} else {
1639 			b = list_next(&b0->mlb_tx_chain, b);
1640 		}
1641 	}
1642 
1643 	b0->mlb_wqebbs = ents;
1644 	mlwq->mlwq_pc += ents;
1645 	atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
1646 
1647 	for (; ptri < nptr; ++ptri, ++seg) {
1648 		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1649 		seg->mlds_byte_count = to_be32(0);
1650 		seg->mlds_address = to_be64(0);
1651 	}
1652 
1653 	/*
1654 	 * Make sure the workqueue entry is flushed out before updating
1655 	 * the doorbell.
1656 	 */
1657 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1658 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1659 	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1660 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1661 	    DDI_FME_VERSION);
1662 	if (err.fme_status != DDI_FM_OK) {
1663 		return (B_FALSE);
1664 	}
1665 
1666 	/*
1667 	 * Hold the bufmtx whilst ringing the doorbell, to prevent
1668 	 * the buffer from being moved to another list, so we can
1669 	 * safely remove it should the ring fail.
1670 	 */
1671 	mutex_enter(&cq->mlcq_bufbmtx);
1672 
1673 	list_insert_tail(&cq->mlcq_buffers_b, b0);
1674 	if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
1675 		atomic_inc_64(&cq->mlcq_bufcnt);
1676 	} else {
1677 		list_remove(&cq->mlcq_buffers_b, b0);
1678 	}
1679 
1680 	mutex_exit(&cq->mlcq_bufbmtx);
1681 
1682 	return (rv);
1683 }
1684 
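/*
 * Post receive buffers to an RQ. Each buffer consumes one WQE: its DMA
 * cookies are written as scatter pointers and any unused pointers are
 * filled with the null lkey. The receive and CQ doorbells are flushed
 * once after all the buffers have been posted.
 */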
1685 boolean_t
1686 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1687     mlxcx_buffer_t *buf)
1688 {
1689 	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
1690 }
1691 
1692 boolean_t
1693 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1694     mlxcx_buffer_t **bufs, size_t nbufs)
1695 {
1696 	uint_t index;
1697 	mlxcx_recvq_ent_t *ent;
1698 	mlxcx_completion_queue_t *cq;
1699 	mlxcx_wqe_data_seg_t *seg;
1700 	uint_t bi, ptri;
1701 	const ddi_dma_cookie_t *c;
1702 	mlxcx_buffer_t *buf;
1703 	ddi_fm_error_t err;
1704 
1705 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1706 	cq = mlwq->mlwq_cq;
1707 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1708 
1709 	for (bi = 0; bi < nbufs; ++bi) {
1710 		buf = bufs[bi];
1711 		bufs[bi] = NULL;
1712 		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1713 
1714 		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1715 		ent = &mlwq->mlwq_recv_ent[index];
1716 		buf->mlb_wqe_index = mlwq->mlwq_pc;
1717 		buf->mlb_wqebbs = 1;
1718 
1719 		++mlwq->mlwq_pc;
1720 		atomic_inc_64(&mlwq->mlwq_wqebb_used);
1721 
1722 		mutex_enter(&cq->mlcq_bufbmtx);
1723 		list_insert_tail(&cq->mlcq_buffers, buf);
1724 		atomic_inc_64(&cq->mlcq_bufcnt);
1725 		mutex_exit(&cq->mlcq_bufbmtx);
1726 
1727 		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
1728 		ptri = 0;
1729 		c = NULL;
1730 		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
1731 			seg = &ent->mlrqe_data[ptri++];
1732 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1733 			seg->mlds_byte_count = to_be32(c->dmac_size);
1734 			seg->mlds_address = to_be64(c->dmac_laddress);
1735 		}
1736 		/*
1737 		 * Fill any unused scatter pointers with the special null
1738 		 * value.
1739 		 */
1740 		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
1741 			seg = &ent->mlrqe_data[ptri];
1742 			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1743 			seg->mlds_byte_count = to_be32(0);
1744 			seg->mlds_address = to_be64(0);
1745 		}
1746 
1747 		/*
1748 		 * Make sure the workqueue entry is flushed out before updating
1749 		 * the doorbell.
1750 		 */
1751 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1752 		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
1753 		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
1754 		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1755 		    DDI_FME_VERSION);
1756 		if (err.fme_status != DDI_FM_OK) {
1757 			return (B_FALSE);
1758 		}
1759 	}
1760 
1761 	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
1762 	/*
1763 	 * Flush the CQ doorbell as well so that HW knows how many
1764 	 * completions we've consumed.
1765 	 */
1766 	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1767 	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
1768 	    DDI_FME_VERSION);
1769 	if (err.fme_status != DDI_FM_OK) {
1770 		return (B_FALSE);
1771 	}
1772 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1773 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1774 	    DDI_FME_VERSION);
1775 	if (err.fme_status != DDI_FM_OK) {
1776 		return (B_FALSE);
1777 	}
1778 	return (B_TRUE);
1779 }
1780 
1781 static void
1782 mlxcx_rq_refill_task(void *arg)
1783 {
1784 	mlxcx_work_queue_t *wq = arg;
1785 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
1786 	mlxcx_t *mlxp = wq->mlwq_mlx;
1787 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
1788 	boolean_t refill;
1789 
1790 	do {
1791 		/*
1792 		 * Wait until there are some free buffers.
1793 		 */
1794 		mutex_enter(&s->mlbs_mtx);
1795 		while (list_is_empty(&s->mlbs_free) &&
1796 		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0)
1797 			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
1798 		mutex_exit(&s->mlbs_mtx);
1799 
1800 		mutex_enter(&cq->mlcq_mtx);
1801 		mutex_enter(&wq->mlwq_mtx);
1802 
1803 		if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
1804 			refill = B_FALSE;
1805 			wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1806 		} else {
1807 			mlxcx_rq_refill(mlxp, wq);
1808 
1809 			if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
1810 				refill = B_TRUE;
1811 			} else {
1812 				refill = B_FALSE;
1813 				wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1814 			}
1815 		}
1816 
1817 		mutex_exit(&wq->mlwq_mtx);
1818 		mutex_exit(&cq->mlcq_mtx);
1819 	} while (refill);
1820 }
1821 
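/*
 * Top up the receive queue with buffers from its shard's free list. If the
 * free list is exhausted while the completion queue is running very low,
 * dispatch the refill taskq to wait for buffers to come back.
 */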
1822 void
1823 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1824 {
1825 	size_t target, current, want, done, n;
1826 	mlxcx_completion_queue_t *cq;
1827 	mlxcx_ring_group_t *g;
1828 	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
1829 	uint_t i;
1830 
1831 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1832 	cq = mlwq->mlwq_cq;
1833 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1834 
1835 	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);
1836 
1837 	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
1838 	cq = mlwq->mlwq_cq;
1839 
1840 	if (cq->mlcq_state & MLXCX_CQ_TEARDOWN)
1841 		return;
1842 
1843 	current = cq->mlcq_bufcnt;
1844 
1845 	if (current >= target - MLXCX_RQ_REFILL_STEP)
1846 		return;
1847 
1848 	want = target - current;
1849 	done = 0;
1850 
1851 	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
1852 		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
1853 		if (n == 0) {
1854 			/*
1855 			 * We didn't get any buffers from the free queue.
1856 			 * This is not necessarily a problem; if the
1857 			 * completion queue is running low, schedule a taskq
1858 			 * entry to wait for free buffers and finish refilling.
1859 			 */
1860 			if (current < MLXCX_RQ_REFILL_STEP &&
1861 			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
1862 				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
1863 				g = mlwq->mlwq_group;
1864 				taskq_dispatch_ent(g->mlg_refill_tq,
1865 				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
1866 				    &mlwq->mlwq_tqe);
1867 			}
1868 
1869 			return;
1870 		}
1871 
1872 		if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) {
1873 			for (i = 0; i < n; ++i)
1874 				mlxcx_buf_return(mlxp, b[i]);
1875 			return;
1876 		}
1877 		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
1878 			/*
1879 			 * mlxcx_rq_add_buffers NULLs out the buffers as it
1880 			 * enqueues them, so any that are still non-NULL must
1881 			 * be returned now. The others belong to the WQ, even
1882 			 * though we failed.
1883 			 */
1884 			for (i = 0; i < n; ++i) {
1885 				if (b[i] != NULL) {
1886 					mlxcx_buf_return(mlxp, b[i]);
1887 				}
1888 			}
1889 			return;
1890 		}
1891 		done += n;
1892 	}
1893 }
1894 
1895 static const char *
1896 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
1897 {
1898 	switch (sy) {
1899 	case MLXCX_CQ_ERR_LOCAL_LENGTH:
1900 		return ("LOCAL_LENGTH");
1901 	case MLXCX_CQ_ERR_LOCAL_QP_OP:
1902 		return ("LOCAL_QP_OP");
1903 	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
1904 		return ("LOCAL_PROTECTION");
1905 	case MLXCX_CQ_ERR_WR_FLUSHED:
1906 		return ("WR_FLUSHED");
1907 	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
1908 		return ("MEM_WINDOW_BIND");
1909 	case MLXCX_CQ_ERR_BAD_RESPONSE:
1910 		return ("BAD_RESPONSE");
1911 	case MLXCX_CQ_ERR_LOCAL_ACCESS:
1912 		return ("LOCAL_ACCESS");
1913 	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
1914 		return ("XPORT_RETRY_CTR");
1915 	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
1916 		return ("RNR_RETRY_CTR");
1917 	case MLXCX_CQ_ERR_ABORTED:
1918 		return ("ABORTED");
1919 	default:
1920 		return ("UNKNOWN");
1921 	}
1922 }
1923 
1924 static void
1925 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1926     mlxcx_completionq_error_ent_t *ent)
1927 {
1928 	uint64_t ena;
1929 	char buf[FM_MAX_CLASS];
1930 	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);
1931 
1932 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1933 		return;
1934 
1935 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1936 	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
1937 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1938 
1939 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1940 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1941 	    "syndrome", DATA_TYPE_STRING, name,
1942 	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
1943 	    "vendor_syndrome", DATA_TYPE_UINT8,
1944 	    ent->mlcqee_vendor_error_syndrome,
1945 	    "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
1946 	    "wq_type", DATA_TYPE_STRING,
1947 	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
1948 	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
1949 	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
1950 	    NULL);
1951 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1952 }
1953 
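/*
 * Handle a send completion entry: post an FM ereport and check the SQ on
 * error, warn about unexpected entries, and return the buffer chain to its
 * shard in every case.
 */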
1954 void
1955 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1956     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1957 {
1958 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1959 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
1960 		mlxcx_completionq_error_ent_t *eent =
1961 		    (mlxcx_completionq_error_ent_t *)ent;
1962 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
1963 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1964 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
1965 		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
1966 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
1967 		return;
1968 	}
1969 
1970 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
1971 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
1972 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1973 		return;
1974 	}
1975 
1976 	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
1977 		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
1978 		    ent->mlcqe_send_wqe_opcode);
1979 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1980 		return;
1981 	}
1982 
1983 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
1984 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
1985 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1986 		return;
1987 	}
1988 
1989 	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1990 }
1991 
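/*
 * Handle a receive completion entry: on success, loan the buffer's mblk up
 * to the caller with checksum state attached, periodically topping up the
 * RQ; on any error, return the buffer and hand up nothing.
 */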
1992 mblk_t *
1993 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1994     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1995 {
1996 	uint32_t chkflags = 0;
1997 	uint_t wqe_index;
1998 	ddi_fm_error_t err;
1999 
2000 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
2001 
2002 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
2003 		mlxcx_completionq_error_ent_t *eent =
2004 		    (mlxcx_completionq_error_ent_t *)ent;
2005 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2006 		mlxcx_buf_return(mlxp, buf);
2007 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2008 		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
2009 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2010 		return (NULL);
2011 	}
2012 
2013 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
2014 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2015 		mlxcx_buf_return(mlxp, buf);
2016 		return (NULL);
2017 	}
2018 
2019 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2020 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2021 		mlxcx_buf_return(mlxp, buf);
2022 		return (NULL);
2023 	}
2024 
2025 	if (ent->mlcqe_rx_drop_counter > 0) {
2026 		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
2027 		    ent->mlcqe_rx_drop_counter);
2028 	}
2029 
2030 	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
2031 	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
2032 	    DDI_FME_VERSION);
2033 	if (err.fme_status != DDI_FM_OK) {
2034 		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
2035 		    DDI_FME_VERSION);
2036 		mlxcx_buf_return(mlxp, buf);
2037 		return (NULL);
2038 	}
2039 
2040 	/*
2041 	 * mlxcx_buf_loan() will set mlb_wqe_index to zero, so save
2042 	 * the index now for the refill check further down.
2043 	 */
2044 	wqe_index = buf->mlb_wqe_index;
2045 
2046 	if (!mlxcx_buf_loan(mlxp, buf)) {
2047 		mlxcx_warn(mlxp, "!loan failed, dropping packet");
2048 		mlxcx_buf_return(mlxp, buf);
2049 		return (NULL);
2050 	}
2051 
2052 	buf->mlb_mp->b_next = NULL;
2053 	buf->mlb_mp->b_cont = NULL;
2054 	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
2055 	    from_be32(ent->mlcqe_byte_cnt);
2056 
2057 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
2058 		chkflags |= HCK_FULLCKSUM_OK;
2059 	}
2060 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
2061 		chkflags |= HCK_IPV4_HDRCKSUM_OK;
2062 	}
2063 	if (chkflags != 0) {
2064 		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
2065 		    from_be16(ent->mlcqe_checksum), chkflags);
2066 	}
2067 
2068 	/*
2069 	 * Don't check if a refill is needed on every single completion,
2070 	 * since checking involves taking the RQ lock.
2071 	 */
2072 	if ((wqe_index & 0x7) == 0) {
2073 		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
2074 		ASSERT(wq != NULL);
2075 		mutex_enter(&wq->mlwq_mtx);
2076 		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
2077 			mlxcx_rq_refill(mlxp, wq);
2078 		mutex_exit(&wq->mlwq_mtx);
2079 	}
2080 
2081 	return (buf->mlb_mp);
2082 }
2083 
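/*
 * desballoc() free function for a buffer's mblk. If the buffer is out on
 * loan (i.e. its mblk was passed up to MAC and has now been freed), return
 * it to its shard; otherwise just drop the mblk reference.
 */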
2084 static void
2085 mlxcx_buf_mp_return(caddr_t arg)
2086 {
2087 	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
2088 	mlxcx_t *mlxp = b->mlb_mlx;
2089 
2090 	if (b->mlb_state != MLXCX_BUFFER_ON_LOAN) {
2091 		b->mlb_mp = NULL;
2092 		return;
2093 	}
2094 	/*
2095 	 * The mblk for this buffer_t (in its mlb_mp field) has been used now,
2096 	 * so NULL it out.
2097 	 */
2098 	b->mlb_mp = NULL;
2099 	mlxcx_buf_return(mlxp, b);
2100 }
2101 
2102 boolean_t
2103 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
2104 {
2105 	mlxcx_buffer_t *b;
2106 	ddi_device_acc_attr_t acc;
2107 	ddi_dma_attr_t attr;
2108 	boolean_t ret;
2109 
2110 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2111 	b->mlb_shard = shard;
2112 	b->mlb_foreign = B_FALSE;
2113 
2114 	mlxcx_dma_acc_attr(mlxp, &acc);
2115 	mlxcx_dma_buf_attr(mlxp, &attr);
2116 
2117 	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
2118 	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
2119 	if (!ret) {
2120 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2121 		return (B_FALSE);
2122 	}
2123 
2124 	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
2125 	b->mlb_frtn.free_arg = (caddr_t)b;
2126 	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2127 	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2128 
2129 	*bp = b;
2130 
2131 	return (B_TRUE);
2132 }
2133 
2134 boolean_t
2135 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
2136     mlxcx_buffer_t **bp)
2137 {
2138 	mlxcx_buffer_t *b;
2139 	ddi_dma_attr_t attr;
2140 	boolean_t ret;
2141 
2142 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2143 	b->mlb_shard = shard;
2144 	b->mlb_foreign = B_TRUE;
2145 
2146 	mlxcx_dma_buf_attr(mlxp, &attr);
2147 
2148 	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
2149 	if (!ret) {
2150 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2151 		return (B_FALSE);
2152 	}
2153 
2154 	*bp = b;
2155 
2156 	return (B_TRUE);
2157 }
2158 
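/*
 * Take a buffer from the queue's foreign shard. Foreign buffers carry no
 * pre-allocated data memory of their own; they are used to DMA-bind mblk
 * data directly for transmit.
 */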
2159 static mlxcx_buffer_t *
2160 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2161 {
2162 	mlxcx_buffer_t *b;
2163 	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
2164 
2165 	mutex_enter(&s->mlbs_mtx);
2166 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2167 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2168 		ASSERT(b->mlb_foreign);
2169 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2170 		list_insert_tail(&s->mlbs_busy, b);
2171 	}
2172 	mutex_exit(&s->mlbs_mtx);
2173 
2174 	return (b);
2175 }
2176 
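/*
 * Copy packet data into a normal (pre-allocated) buffer and sync it for
 * the device, retrying with a fresh buffer if an FM DMA error is detected,
 * up to MLXCX_BUF_BIND_MAX_ATTEMTPS times.
 */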
2177 static mlxcx_buffer_t *
2178 mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
2179 {
2180 	ddi_fm_error_t err;
2181 	mlxcx_buffer_t *b;
2182 	uint_t attempts = 0;
2183 
2184 copyb:
2185 	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
2186 		return (NULL);
2187 
2188 	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
2189 	bcopy(rptr, b->mlb_dma.mxdb_va, sz);
2190 
2191 	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
2192 
2193 	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
2194 	    DDI_FME_VERSION);
2195 	if (err.fme_status != DDI_FM_OK) {
2196 		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
2197 		    DDI_FME_VERSION);
2198 		mlxcx_buf_return(mlxp, b);
2199 		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
2200 			return (NULL);
2201 		}
2202 		goto copyb;
2203 	}
2204 
2205 	return (b);
2206 }
2207 
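/*
 * Prepare an mblk chain for transmit. Each mblk is either copied into a
 * driver buffer (below the bind threshold) or DMA-bound in place using a
 * foreign buffer, falling back to a copy if the bind fails. All buffers
 * are chained off the first one, which is returned to the caller.
 */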
2208 mlxcx_buffer_t *
2209 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2210     mblk_t *mpb, size_t off)
2211 {
2212 	mlxcx_buffer_t *b, *b0 = NULL;
2213 	boolean_t first = B_TRUE;
2214 	mblk_t *mp;
2215 	uint8_t *rptr;
2216 	size_t sz;
2217 	size_t ncookies = 0;
2218 	boolean_t ret;
2219 
2220 	for (mp = mpb; mp != NULL; mp = mp->b_cont) {
2221 		rptr = mp->b_rptr;
2222 		sz = MBLKL(mp);
2223 
2224 		if (off > 0)
2225 			ASSERT3U(off, <, sz);
2226 		rptr += off;
2227 		sz -= off;
2228 
2229 		if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
2230 			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2231 			if (b == NULL)
2232 				goto failed;
2233 		} else {
2234 			b = mlxcx_buf_take_foreign(mlxp, wq);
2235 			if (b == NULL)
2236 				goto failed;
2237 
2238 			ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
2239 			    B_FALSE);
2240 
2241 			if (!ret) {
2242 				mlxcx_buf_return(mlxp, b);
2243 
2244 				b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2245 				if (b == NULL)
2246 					goto failed;
2247 			}
2248 		}
2249 
2250 		/*
2251 		 * We might overestimate here when we've copied data, since
2252 		 * the buffer might be longer than what we copied into it. This
2253 		 * is safe since it's always wrong in the conservative
2254 		 * direction (and we will blow up later when we actually
2255 		 * generate the WQE anyway).
2256 		 *
2257 		 * If the assertion below ever fires, we'll have to come back
2258 		 * and fix this up so we can transmit these packets.
2259 		 */
2260 		ncookies += b->mlb_dma.mxdb_ncookies;
2261 
2262 		if (first)
2263 			b0 = b;
2264 
2265 		if (!first)
2266 			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;
2267 
2268 		b->mlb_tx_mp = mp;
2269 		b->mlb_tx_head = b0;
2270 		b->mlb_used = sz;
2271 
2272 		if (!first)
2273 			list_insert_tail(&b0->mlb_tx_chain, b);
2274 		first = B_FALSE;
2275 		off = 0;
2276 	}
2277 
2278 	ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS);
2279 
2280 	return (b0);
2281 
2282 failed:
2283 	if (b0 != NULL)
2284 		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2285 
2286 	return (NULL);
2287 }
2288 
2289 mlxcx_buffer_t *
2290 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2291 {
2292 	mlxcx_buffer_t *b;
2293 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
2294 
2295 	mutex_enter(&s->mlbs_mtx);
2296 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2297 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2298 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2299 		list_insert_tail(&s->mlbs_busy, b);
2300 	}
2301 	mutex_exit(&s->mlbs_mtx);
2302 
2303 	return (b);
2304 }
2305 
2306 size_t
2307 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2308     mlxcx_buffer_t **bp, size_t nbufs)
2309 {
2310 	mlxcx_buffer_t *b;
2311 	size_t done = 0;
2312 	mlxcx_buf_shard_t *s;
2313 
2314 	s = wq->mlwq_bufs;
2315 
2316 	mutex_enter(&s->mlbs_mtx);
2317 	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
2318 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2319 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2320 		list_insert_tail(&s->mlbs_busy, b);
2321 		bp[done++] = b;
2322 	}
2323 	mutex_exit(&s->mlbs_mtx);
2324 	return (done);
2325 }
2326 
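/*
 * Loan a buffer up to MAC: make sure it has an mblk wrapping its DMA
 * memory (allocating a new one if the previous mblk has been consumed)
 * and mark it as on loan.
 */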
2327 boolean_t
2328 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2329 {
2330 	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
2331 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2332 
2333 	if (b->mlb_mp == NULL) {
2334 		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2335 		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2336 		if (b->mlb_mp == NULL)
2337 			return (B_FALSE);
2338 	}
2339 
2340 	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
2341 	b->mlb_wqe_index = 0;
2342 	return (B_TRUE);
2343 }
2344 
2345 void
2346 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
2347 {
2348 	mlxcx_buffer_t *b;
2349 
2350 	if (b0->mlb_tx_head != b0) {
2351 		mlxcx_buf_return(mlxp, b0);
2352 		return;
2353 	}
2354 
2355 	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
2356 		mlxcx_buf_return(mlxp, b);
2357 	}
2358 	if (keepmp) {
2359 		b0->mlb_tx_mp = NULL;
2360 		b0->mlb_tx_head = NULL;
2361 	}
2362 	mlxcx_buf_return(mlxp, b0);
2363 }
2364 
2365 void
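/*
 * Return a buffer to its shard's free list, undoing any DMA binding and
 * detaching it from a TX chain as required. For the head of a TX chain,
 * the associated mblk chain is freed once the shard lock is dropped.
 */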
2366 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2367 {
2368 	mlxcx_buffer_state_t oldstate = b->mlb_state;
2369 	mlxcx_buffer_t *txhead = b->mlb_tx_head;
2370 	mlxcx_buf_shard_t *s = b->mlb_shard;
2371 	mblk_t *mp = b->mlb_tx_mp;
2372 
2373 	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
2374 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2375 
2376 	/*
2377 	 * The mlbs_mtx taken below is heavily contended, so it is
2378 	 * imperative that we do as much of the buffer clean-up as
2379 	 * possible before acquiring it.
2380 	 */
2381 	b->mlb_state = MLXCX_BUFFER_FREE;
2382 	b->mlb_wqe_index = 0;
2383 	b->mlb_tx_head = NULL;
2384 	b->mlb_tx_mp = NULL;
2385 	b->mlb_used = 0;
2386 	b->mlb_wqebbs = 0;
2387 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2388 
2389 	if (b->mlb_foreign) {
2390 		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
2391 			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
2392 		}
2393 	}
2394 
2395 	mutex_enter(&s->mlbs_mtx);
2396 	switch (oldstate) {
2397 	case MLXCX_BUFFER_INIT:
2398 		break;
2399 	case MLXCX_BUFFER_ON_WQ:
2400 		list_remove(&s->mlbs_busy, b);
2401 		break;
2402 	case MLXCX_BUFFER_ON_LOAN:
2403 		ASSERT(!b->mlb_foreign);
2404 		list_remove(&s->mlbs_busy, b);
2405 		break;
2406 	case MLXCX_BUFFER_FREE:
2407 		VERIFY(0);
2408 		break;
2409 	case MLXCX_BUFFER_ON_CHAIN:
2410 		ASSERT(txhead != NULL);
2411 		list_remove(&txhead->mlb_tx_chain, b);
2412 		list_remove(&s->mlbs_busy, b);
2413 		break;
2414 	}
2415 
2416 	list_insert_tail(&s->mlbs_free, b);
2417 	cv_signal(&s->mlbs_free_nonempty);
2418 
2419 	mutex_exit(&s->mlbs_mtx);
2420 
2421 	/*
2422 	 * For TX chain heads, free the mblk_t after we let go of the lock.
2423 	 * This might be a borrowed buf that we in turn loaned to MAC, in which
2424 	 * case calling freemsg() on it will re-enter this very function -- so
2425 	 * we better not be holding the lock!
2426 	 * we'd better not be holding the lock!
2427 	if (txhead == b)
2428 		freemsg(mp);
2429 }
2430 
2431 void
2432 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2433 {
2434 	mlxcx_buf_shard_t *s = b->mlb_shard;
2435 	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
2436 	    b->mlb_state == MLXCX_BUFFER_INIT);
2437 	ASSERT(mutex_owned(&s->mlbs_mtx));
2438 	if (b->mlb_state == MLXCX_BUFFER_FREE)
2439 		list_remove(&s->mlbs_free, b);
2440 
2441 	/*
2442 	 * This is going back to the kmem cache, so it needs to be set up in
2443 	 * the same way we expect a new buffer to come out (state INIT, other
2444 	 * fields NULL'd).
2445 	 */
2446 	b->mlb_state = MLXCX_BUFFER_INIT;
2447 	b->mlb_shard = NULL;
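	/*
	 * freeb() calls mlxcx_buf_mp_return(), which clears mlb_mp since the
	 * buffer is no longer on loan; hence the assertion below.
	 */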
2448 	if (b->mlb_mp != NULL) {
2449 		freeb(b->mlb_mp);
2450 		ASSERT(b->mlb_mp == NULL);
2451 	}
2452 	mlxcx_dma_free(&b->mlb_dma);
2453 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2454 
2455 	kmem_cache_free(mlxp->mlx_bufs_cache, b);
2456 }
2457