xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_intr.c (revision e1447ca93391f31609bda487cb922dbff9dcdef5)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2020, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  */
16 
17 /*
18  * Mellanox Connect-X 4/5/6 driver.
19  */
20 
21 #include <sys/modctl.h>
22 #include <sys/conf.h>
23 #include <sys/devops.h>
24 #include <sys/sysmacros.h>
25 #include <sys/disp.h>
26 #include <sys/sdt.h>
27 
28 #include <sys/mac_provider.h>
29 
30 #include <mlxcx.h>
31 
32 /*
33  * CTASSERT(s) to cover bad values which would induce bugs.
34  */
35 CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
36 
37 /*
38  * Disable interrupts.
39  * The act of calling ddi_intr_disable() does not guarantee an interrupt
40  * routine is not running, so flag the vector as quiescing and wait
41  * for anything active to finish.
42  */
43 void
44 mlxcx_intr_disable(mlxcx_t *mlxp)
45 {
46 	int i;
47 
48 	mlxcx_cmd_eq_disable(mlxp);
49 
50 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
51 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
52 
53 		mutex_enter(&mleq->mleq_mtx);
54 
55 		if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
56 			mutex_exit(&mleq->mleq_mtx);
57 			continue;
58 		}
59 
60 		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
61 
62 		mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
63 		while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
64 			cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);
65 
66 		mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;
67 
68 		mutex_exit(&mleq->mleq_mtx);
69 	}
70 }
71 
72 void
73 mlxcx_intr_teardown(mlxcx_t *mlxp)
74 {
75 	int i;
76 	int ret;
77 
78 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
79 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
80 
81 		mutex_enter(&mleq->mleq_mtx);
82 		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
83 		if (mleq->mleq_state & MLXCX_EQ_CREATED)
84 			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
85 		if (i >= mlxp->mlx_intr_cq0) {
86 			VERIFY(avl_is_empty(&mleq->mleq_cqs));
87 			avl_destroy(&mleq->mleq_cqs);
88 		}
89 		mutex_exit(&mleq->mleq_mtx);
90 		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
91 		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
92 		if (ret != DDI_SUCCESS) {
93 			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
94 			    i, ret);
95 		}
96 		mutex_destroy(&mleq->mleq_mtx);
97 		cv_destroy(&mleq->mleq_cv);
98 	}
99 	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
100 	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
101 	mlxp->mlx_intr_handles = NULL;
102 	mlxp->mlx_eqs = NULL;
103 }
104 
105 /*
106  * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
107  */
108 static mlxcx_eventq_ent_t *
109 mlxcx_eq_next(mlxcx_event_queue_t *mleq)
110 {
111 	mlxcx_eventq_ent_t *ent;
112 	ddi_fm_error_t err;
113 	uint_t ci;
114 	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);
115 
116 	/*
117 	 * This should only be called from interrupt context to ensure
118 	 * correctness of mleq_cc.
119 	 */
120 	ASSERT(servicing_interrupt());
121 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
122 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
123 
124 	/* mleq_nents is always a power of 2 */
125 	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);
126 
127 	ent = &mleq->mleq_ent[ci];
128 	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
129 	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
130 	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
131 	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
132 	    DDI_FME_VERSION);
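	/*
	 * The entry belongs to software only when its ownership bit matches
	 * the phase bit computed above: swowner flips each time mleq_cc
	 * wraps the 2^mleq_entshift-entry ring, so a mismatch means the
	 * hardware has not yet written a new event into this slot.
	 */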
133 	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
134 		/* The PRM says we have to membar here, so we're doing it */
135 		membar_consumer();
136 		++mleq->mleq_cc;
137 		return (ent);
138 	}
139 	/*
140 	 * In the case of a DMA error, we should re-arm this EQ and then come
141 	 * back and try again when the device wakes us back up.
142 	 *
143 	 * Hopefully the fault will be gone by then.
144 	 */
145 	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);
146 
147 	return (NULL);
148 }
149 
150 void
151 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
152 {
153 	uint_t try = 0;
154 	ddi_fm_error_t err;
155 	bits32_t v = new_bits32();
156 
157 	/*
158 	 * This is only called during initialization when the EQ is
159 	 * armed for the first time, and when re-armed at the end of
160 	 * interrupt processing.
161 	 */
162 	ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
163 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
164 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
165 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
166 	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);
167 
168 	mleq->mleq_state |= MLXCX_EQ_ARMED;
169 	mleq->mleq_cc_armed = mleq->mleq_cc;
170 
171 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
172 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
173 
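	/*
	 * The EQ_ARM UAR write both reports our consumer index to the
	 * device and re-arms the EQ, so the next event will generate
	 * another interrupt. Retry a bounded number of times if the
	 * register access reports a fault.
	 */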
174 retry:
175 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
176 	    from_bits32(v));
177 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
178 	    DDI_FME_VERSION);
179 	if (err.fme_status == DDI_FM_OK)
180 		return;
181 	if (try++ < mlxcx_doorbell_tries) {
182 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
183 		goto retry;
184 	}
185 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
186 }
187 
188 static void
189 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
190 {
191 	bits32_t v = new_bits32();
192 	ddi_fm_error_t err;
193 
194 	/*
195 	 * This should only be called from interrupt context to ensure
196 	 * correctness of mleq_cc.
197 	 */
198 	ASSERT(servicing_interrupt());
199 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
200 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
201 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
202 
203 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
204 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
205 
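	/*
	 * The NOARM variant updates the device's view of our consumer
	 * index without re-arming the EQ, so this write does not cause
	 * another interrupt to be generated.
	 */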
206 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
207 	    from_bits32(v));
208 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
209 	    DDI_FME_VERSION);
210 	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
211 	/*
212 	 * Ignore the error; if it's still happening when we try to re-arm the
213 	 * EQ, we will note the impact then.
214 	 */
215 }
216 
217 static mlxcx_completionq_ent_t *
218 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
219 {
220 	mlxcx_completionq_ent_t *ent;
221 	ddi_fm_error_t err;
222 	uint_t ci;
223 	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);
224 
225 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
226 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
227 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
228 
229 	/* mlcq_nents is always a power of 2 */
230 	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
231 
232 	ent = &mlcq->mlcq_ent[ci];
233 	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
234 	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
235 	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
236 	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
237 	    DDI_FME_VERSION);
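	/*
	 * Same ownership check as mlxcx_eq_next(): the entry is ours to
	 * consume only when its owner bit matches the phase bit derived
	 * from our consumer counter.
	 */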
238 	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
239 		/* The PRM says we have to membar here, so we're doing it */
240 		membar_consumer();
241 		++mlcq->mlcq_cc;
242 		return (ent);
243 	}
244 	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);
245 
246 	return (NULL);
247 }
248 
249 void
250 mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
251 {
252 	ddi_fm_error_t err;
253 	uint_t try = 0;
254 
255 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
256 
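	/*
	 * The CQ doorbell record lives in DMA memory read by the device,
	 * so push the new consumer index out with a sync-for-device,
	 * retrying a bounded number of times if the DMA handle reports
	 * a fault.
	 */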
257 retry:
258 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
259 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
260 	    DDI_FME_VERSION);
261 	if (err.fme_status != DDI_FM_OK) {
262 		if (try++ < mlxcx_doorbell_tries) {
263 			ddi_fm_dma_err_clear(
264 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
265 			    DDI_FME_VERSION);
266 			goto retry;
267 		} else {
268 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
269 			return;
270 		}
271 	}
272 }
273 
274 void
275 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
276 {
277 	bits32_t dbval = new_bits32();
278 	uint64_t udbval;
279 	ddi_fm_error_t err;
280 	uint_t try = 0;
281 
282 	ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
283 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
284 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
285 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
286 
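	/*
	 * If the CQ is still marked as armed, we should only be re-arming
	 * it because an event has arrived since the last arm, i.e. the
	 * event counter must have advanced past the value recorded then.
	 */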
287 	if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
288 		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
289 	}
290 
291 	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
292 		return;
293 
294 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
295 	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
296 	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;
297 
298 	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
299 	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);
300 
301 	udbval = (uint64_t)from_bits32(dbval) << 32;
302 	udbval |= mlcq->mlcq_num & 0xffffff;
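	/*
	 * The 64-bit UAR doorbell value carries the arm word (sequence
	 * number and consumer index) in its upper 32 bits and the 24-bit
	 * CQ number in its low bits.
	 */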
303 
304 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
305 	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;
306 
307 retry:
308 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
309 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
310 	    DDI_FME_VERSION);
311 	if (err.fme_status != DDI_FM_OK) {
312 		if (try++ < mlxcx_doorbell_tries) {
313 			ddi_fm_dma_err_clear(
314 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
315 			    DDI_FME_VERSION);
316 			goto retry;
317 		} else {
318 			goto err;
319 		}
320 	}
321 
322 	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
323 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
324 	    DDI_FME_VERSION);
325 	if (err.fme_status == DDI_FM_OK)
326 		return;
327 	if (try++ < mlxcx_doorbell_tries) {
328 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
329 		goto retry;
330 	}
331 
332 err:
333 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
334 }
335 
336 const char *
337 mlxcx_event_name(mlxcx_event_t evt)
338 {
339 	switch (evt) {
340 	case MLXCX_EVENT_COMPLETION:
341 		return ("COMPLETION");
342 	case MLXCX_EVENT_PATH_MIGRATED:
343 		return ("PATH_MIGRATED");
344 	case MLXCX_EVENT_COMM_ESTABLISH:
345 		return ("COMM_ESTABLISH");
346 	case MLXCX_EVENT_SENDQ_DRAIN:
347 		return ("SENDQ_DRAIN");
348 	case MLXCX_EVENT_LAST_WQE:
349 		return ("LAST_WQE");
350 	case MLXCX_EVENT_SRQ_LIMIT:
351 		return ("SRQ_LIMIT");
352 	case MLXCX_EVENT_DCT_ALL_CLOSED:
353 		return ("DCT_ALL_CLOSED");
354 	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
355 		return ("DCT_ACCKEY_VIOL");
356 	case MLXCX_EVENT_CQ_ERROR:
357 		return ("CQ_ERROR");
358 	case MLXCX_EVENT_WQ_CATASTROPHE:
359 		return ("WQ_CATASTROPHE");
360 	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
361 		return ("PATH_MIGRATE_FAIL");
362 	case MLXCX_EVENT_PAGE_FAULT:
363 		return ("PAGE_FAULT");
364 	case MLXCX_EVENT_WQ_INVALID_REQ:
365 		return ("WQ_INVALID_REQ");
366 	case MLXCX_EVENT_WQ_ACCESS_VIOL:
367 		return ("WQ_ACCESS_VIOL");
368 	case MLXCX_EVENT_SRQ_CATASTROPHE:
369 		return ("SRQ_CATASTROPHE");
370 	case MLXCX_EVENT_INTERNAL_ERROR:
371 		return ("INTERNAL_ERROR");
372 	case MLXCX_EVENT_PORT_STATE:
373 		return ("PORT_STATE");
374 	case MLXCX_EVENT_GPIO:
375 		return ("GPIO");
376 	case MLXCX_EVENT_PORT_MODULE:
377 		return ("PORT_MODULE");
378 	case MLXCX_EVENT_TEMP_WARNING:
379 		return ("TEMP_WARNING");
380 	case MLXCX_EVENT_REMOTE_CONFIG:
381 		return ("REMOTE_CONFIG");
382 	case MLXCX_EVENT_DCBX_CHANGE:
383 		return ("DCBX_CHANGE");
384 	case MLXCX_EVENT_DOORBELL_CONGEST:
385 		return ("DOORBELL_CONGEST");
386 	case MLXCX_EVENT_STALL_VL:
387 		return ("STALL_VL");
388 	case MLXCX_EVENT_CMD_COMPLETION:
389 		return ("CMD_COMPLETION");
390 	case MLXCX_EVENT_PAGE_REQUEST:
391 		return ("PAGE_REQUEST");
392 	case MLXCX_EVENT_NIC_VPORT:
393 		return ("NIC_VPORT");
394 	case MLXCX_EVENT_EC_PARAMS_CHANGE:
395 		return ("EC_PARAMS_CHANGE");
396 	case MLXCX_EVENT_XRQ_ERROR:
397 		return ("XRQ_ERROR");
398 	}
399 	return ("UNKNOWN");
400 }
401 
402 /* Should be called only when link state has changed. */
403 void
404 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
405 {
406 	link_state_t ls;
407 
408 	mutex_enter(&port->mlp_mtx);
409 	(void) mlxcx_cmd_query_port_status(mlxp, port);
410 	(void) mlxcx_cmd_query_port_speed(mlxp, port);
411 	(void) mlxcx_cmd_query_port_fec(mlxp, port);
412 
413 	switch (port->mlp_oper_status) {
414 	case MLXCX_PORT_STATUS_UP:
415 	case MLXCX_PORT_STATUS_UP_ONCE:
416 		ls = LINK_STATE_UP;
417 		break;
418 	case MLXCX_PORT_STATUS_DOWN:
419 		ls = LINK_STATE_DOWN;
420 		break;
421 	default:
422 		ls = LINK_STATE_UNKNOWN;
423 	}
424 	mac_link_update(mlxp->mlx_mac_hdl, ls);
425 
426 	mutex_exit(&port->mlp_mtx);
427 }
428 
429 CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);
430 
431 static void
432 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
433 {
434 	ddi_device_acc_attr_t acc;
435 	ddi_dma_attr_t attr;
436 	mlxcx_dev_page_t *mdp;
437 	mlxcx_dev_page_t **pages;
438 	size_t i;
439 	const ddi_dma_cookie_t *ck;
440 
441 	/*
442 	 * If this isn't enough, the HCA will ask for more.
443 	 */
444 	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
445 
446 	pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);
447 
448 	for (i = 0; i < npages; i++) {
449 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
450 		mlxcx_dma_acc_attr(mlxp, &acc);
451 		mlxcx_dma_page_attr(mlxp, &attr);
452 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
453 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
454 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
455 			    npages);
456 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
457 			goto cleanup_npages;
458 		}
459 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
460 		mdp->mxdp_pa = ck->dmac_laddress;
461 		pages[i] = mdp;
462 	}
463 
464 	mutex_enter(&mlxp->mlx_pagemtx);
465 
466 	if (!mlxcx_cmd_give_pages(mlxp,
467 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
468 		mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
469 		    "pages!", npages);
470 		mutex_exit(&mlxp->mlx_pagemtx);
471 		goto cleanup_npages;
472 	}
473 
474 	for (i = 0; i < npages; i++) {
475 		avl_add(&mlxp->mlx_pages, pages[i]);
476 	}
477 	mlxp->mlx_npages += npages;
478 	mutex_exit(&mlxp->mlx_pagemtx);
479 
480 	kmem_free(pages, sizeof (*pages) * npages);
481 
482 	return;
483 
484 cleanup_npages:
485 	for (i = 0; i < npages; i++) {
486 		if ((mdp = pages[i]) == NULL)
487 			break;
488 
489 		mlxcx_dma_free(&mdp->mxdp_dma);
490 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
491 	}
492 	/* Tell the hardware we had an allocation failure. */
493 	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
494 	    0, NULL);
495 	mutex_exit(&mlxp->mlx_pagemtx);
496 
497 	kmem_free(pages, sizeof (*pages) * npages);
498 }
499 
500 static void
501 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
502 {
503 	uint_t i;
504 	int32_t ret;
505 	uint64_t *pas;
506 	mlxcx_dev_page_t *mdp, probe;
507 
508 	pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);
509 
510 	if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
511 		kmem_free(pas, sizeof (*pas) * npages);
512 		return;
513 	}
514 
515 	mutex_enter(&mlxp->mlx_pagemtx);
516 
517 	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
518 
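	/*
	 * For each physical address the hardware returned, look up the
	 * matching page in the AVL tree of pages we have given to the
	 * device, then remove it and free the backing DMA memory.
	 */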
519 	for (i = 0; i < ret; i++) {
520 		bzero(&probe, sizeof (probe));
521 		probe.mxdp_pa = pas[i];
522 
523 		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
524 
525 		if (mdp != NULL) {
526 			avl_remove(&mlxp->mlx_pages, mdp);
527 			mlxp->mlx_npages--;
528 			mlxcx_dma_free(&mdp->mxdp_dma);
529 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
530 		} else {
531 			mlxcx_warn(mlxp, "hardware returned a page "
532 			    "with PA 0x%" PRIx64 " but we have no "
533 			    "record of giving out such a page", pas[i]);
534 		}
535 	}
536 
537 	mutex_exit(&mlxp->mlx_pagemtx);
538 
539 	kmem_free(pas, sizeof (*pas) * npages);
540 }
541 
542 static void
543 mlxcx_pages_task(void *arg)
544 {
545 	mlxcx_async_param_t *param = arg;
546 	mlxcx_t *mlxp = param->mla_mlx;
547 	int32_t npages;
548 
549 	/*
550 	 * We can drop the pending status now, as we've extracted what
551 	 * is needed to process the pages request.
552 	 *
553 	 * Even though we should never get another pages request until we
554 	 * have responded to this one, the guard in mlxcx_intr_async also
555 	 * ensures that reusing the mlxcx_async_param_t here is safe.
556 	 */
557 	mutex_enter(&param->mla_mtx);
558 	npages = param->mla_pages.mlp_npages;
559 	param->mla_pending = B_FALSE;
560 	bzero(&param->mla_pages, sizeof (param->mla_pages));
561 	mutex_exit(&param->mla_mtx);
562 
563 	/*
564 	 * The PRM describes npages as: "Number of missing / unneeded pages
565 	 * (signed number, msb indicate sign)". The implication is that
566 	 * it will not be zero. We are expected to use this to give or
567 	 * take back pages (based on the sign) using the MANAGE_PAGES
568 	 * command but we can't determine whether to give or take
569 	 * when npages is zero. So we do nothing.
570 	 */
571 	if (npages > 0) {
572 		mlxcx_give_pages_once(mlxp, npages);
573 	} else if (npages < 0) {
574 		mlxcx_take_pages_once(mlxp, -1 * npages);
575 	}
576 }
577 
578 static void
579 mlxcx_link_state_task(void *arg)
580 {
581 	mlxcx_async_param_t *param = arg;
582 	mlxcx_port_t *port;
583 	mlxcx_t *mlxp;
584 
585 	/*
586 	 * Gather the arguments from the parameters and clear the
587 	 * pending status.
588 	 *
589 	 * The pending status must be cleared *before* we update the
590 	 * link state. This is both safe and required to ensure we always
591 	 * have the correct link state. It is safe because taskq_ents are
592 	 * reusable (by the caller of taskq_dispatch_ent()) once the
593 	 * task function has started executing. It must happen before
594 	 * updating the link state to guarantee further link state change
595 	 * events are not missed and we always have the current link state.
596 	 */
597 	mutex_enter(&param->mla_mtx);
598 	mlxp = param->mla_mlx;
599 	port = param->mla_port;
600 	param->mla_pending = B_FALSE;
601 	mutex_exit(&param->mla_mtx);
602 
603 	mlxcx_update_link_state(mlxp, port);
604 }
605 
606 static const char *
607 mlxcx_module_error_string(mlxcx_module_error_type_t err)
608 {
609 	switch (err) {
610 	case MLXCX_MODULE_ERR_POWER_BUDGET:
611 		return ("POWER_BUDGET");
612 	case MLXCX_MODULE_ERR_LONG_RANGE:
613 		return ("LONG_RANGE");
614 	case MLXCX_MODULE_ERR_BUS_STUCK:
615 		return ("BUS_STUCK");
616 	case MLXCX_MODULE_ERR_NO_EEPROM:
617 		return ("NO_EEPROM");
618 	case MLXCX_MODULE_ERR_ENFORCEMENT:
619 		return ("ENFORCEMENT");
620 	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
621 		return ("UNKNOWN_IDENT");
622 	case MLXCX_MODULE_ERR_HIGH_TEMP:
623 		return ("HIGH_TEMP");
624 	case MLXCX_MODULE_ERR_CABLE_SHORTED:
625 		return ("CABLE_SHORTED");
626 	default:
627 		return ("UNKNOWN");
628 	}
629 }
630 
631 static void
632 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
633 {
634 	uint64_t ena;
635 	char buf[FM_MAX_CLASS];
636 	const char *lename;
637 	const char *ename;
638 	const char *stname;
639 	uint_t eno = 0;
640 	mlxcx_module_status_t state = evd->mled_port_mod_module_status;
641 
642 	switch (state) {
643 	case MLXCX_MODULE_ERROR:
644 		stname = "error";
645 		eno = evd->mled_port_mod_error_type;
646 		lename = mlxcx_module_error_string(eno);
647 		switch (eno) {
648 		case MLXCX_MODULE_ERR_ENFORCEMENT:
649 			ename = DDI_FM_TXR_ERROR_WHITELIST;
650 			break;
651 		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
652 		case MLXCX_MODULE_ERR_NO_EEPROM:
653 			ename = DDI_FM_TXR_ERROR_NOTSUPP;
654 			break;
655 		case MLXCX_MODULE_ERR_HIGH_TEMP:
656 			ename = DDI_FM_TXR_ERROR_OVERTEMP;
657 			break;
658 		case MLXCX_MODULE_ERR_POWER_BUDGET:
659 		case MLXCX_MODULE_ERR_LONG_RANGE:
660 		case MLXCX_MODULE_ERR_CABLE_SHORTED:
661 			ename = DDI_FM_TXR_ERROR_HWFAIL;
662 			break;
663 		case MLXCX_MODULE_ERR_BUS_STUCK:
664 		default:
665 			ename = DDI_FM_TXR_ERROR_UNKNOWN;
666 		}
667 		break;
668 	default:
669 		return;
670 	}
671 
672 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
673 	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
674 	ena = fm_ena_generate(0, FM_ENA_FMT1);
675 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
676 		return;
677 
678 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
679 	    /* compulsory FM props */
680 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
681 	    /* generic NIC txr error event props */
682 	    "error", DATA_TYPE_STRING, ename,
683 	    "port_index", DATA_TYPE_UINT8, 0,
684 	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
685 	    /* local props */
686 	    "mlxcx_state", DATA_TYPE_STRING, stname,
687 	    "mlxcx_error", DATA_TYPE_STRING, lename,
688 	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
689 	    NULL);
690 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
691 }
692 
693 /*
694  * Common beginning of interrupt processing.
695  * Confirm interrupt hasn't been disabled, verify its state and
696  * mark the vector as active.
697  */
698 static boolean_t
699 mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
700 {
701 	mutex_enter(&mleq->mleq_mtx);
702 
703 	if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
704 		mutex_exit(&mleq->mleq_mtx);
705 		return (B_FALSE);
706 	}
707 
708 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
709 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
710 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
711 		mlxcx_warn(mlxp, "intr %d in bad eq state",
712 		    mleq->mleq_intr_index);
713 		mutex_exit(&mleq->mleq_mtx);
714 		return (B_FALSE);
715 	}
716 
717 	mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
718 	mutex_exit(&mleq->mleq_mtx);
719 
720 	return (B_TRUE);
721 }
722 
723 /*
724  * End of interrupt processing.
725  * Mark vector as no longer active and if shutdown is blocked on this vector,
726  * wake it up.
727  */
728 static void
729 mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
730 {
731 	mutex_enter(&mleq->mleq_mtx);
732 	if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
733 		cv_signal(&mleq->mleq_cv);
734 
735 	mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;
736 	mutex_exit(&mleq->mleq_mtx);
737 }
738 
739 static uint_t
740 mlxcx_intr_async(caddr_t arg, caddr_t arg2)
741 {
742 	mlxcx_t *mlxp = (mlxcx_t *)arg;
743 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
744 	mlxcx_eventq_ent_t *ent;
745 	mlxcx_async_param_t *param;
746 	uint_t portn;
747 	uint16_t func;
748 
749 	if (!mlxcx_intr_ini(mlxp, mleq))
750 		return (DDI_INTR_CLAIMED);
751 
752 	ent = mlxcx_eq_next(mleq);
753 	if (ent == NULL) {
754 		goto done;
755 	}
756 
757 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
758 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
759 
760 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
761 		DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
762 		    ent);
763 
764 		switch (ent->mleqe_event_type) {
765 		case MLXCX_EVENT_CMD_COMPLETION:
766 			mlxcx_cmd_completion(mlxp, ent);
767 			break;
768 		case MLXCX_EVENT_PAGE_REQUEST:
769 			func = from_be16(ent->mleqe_page_request.
770 			    mled_page_request_function_id);
771 			VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);
772 
773 			param = &mlxp->mlx_npages_req[func];
774 			mutex_enter(&param->mla_mtx);
775 			if (param->mla_pending) {
776 				/*
777 				 * The PRM states we will not get another
778 				 * page request event until any pending have
779 				 * been posted as complete to the HCA.
780 				 * This check guards against that case anyway.
781 				 */
782 				mutex_exit(&param->mla_mtx);
783 				mlxcx_warn(mlxp, "Unexpected page request "
784 				    "whilst another is pending");
785 				break;
786 			}
787 			param->mla_pages.mlp_npages =
788 			    (int32_t)from_be32(ent->mleqe_page_request.
789 			    mled_page_request_num_pages);
790 			param->mla_pages.mlp_func = func;
791 			param->mla_pending = B_TRUE;
792 			ASSERT3P(param->mla_mlx, ==, mlxp);
793 			mutex_exit(&param->mla_mtx);
794 
795 			taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
796 			    param, 0, &param->mla_tqe);
797 			break;
798 		case MLXCX_EVENT_PORT_STATE:
799 			portn = get_bits8(
800 			    ent->mleqe_port_state.mled_port_state_port_num,
801 			    MLXCX_EVENT_PORT_NUM) - 1;
802 			if (portn >= mlxp->mlx_nports)
803 				break;
804 
805 			param = &mlxp->mlx_ports[portn].mlx_port_event;
806 			mutex_enter(&param->mla_mtx);
807 			if (param->mla_pending) {
808 				/*
809 				 * There is a link state event pending
810 				 * processing. When that event is handled
811 				 * it will get the current link state.
812 				 */
813 				mutex_exit(&param->mla_mtx);
814 				break;
815 			}
816 
817 			ASSERT3P(param->mla_mlx, ==, mlxp);
818 			ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);
819 
820 			param->mla_pending = B_TRUE;
821 			mutex_exit(&param->mla_mtx);
822 
823 			taskq_dispatch_ent(mlxp->mlx_async_tq,
824 			    mlxcx_link_state_task, param, 0, &param->mla_tqe);
825 			break;
826 		case MLXCX_EVENT_PORT_MODULE:
827 			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
828 			break;
829 		default:
830 			mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
831 			    ent->mleqe_event_type, mleq->mleq_intr_index);
832 		}
833 	}
834 
835 	mlxcx_arm_eq(mlxp, mleq);
836 
837 done:
838 	mlxcx_intr_fini(mleq);
839 	return (DDI_INTR_CLAIMED);
840 }
841 
842 static boolean_t
843 mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
844     size_t bytelim)
845 {
846 	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
847 	mlxcx_completionq_ent_t *cent;
848 	mblk_t *mp, *cmp, *nmp;
849 	mlxcx_buffer_t *buf;
850 	boolean_t found, added;
851 	size_t bytes = 0;
852 	uint_t rx_frames = 0;
853 	uint_t comp_cnt = 0;
854 	int64_t wqebbs, bufcnt;
855 
856 	*mpp = NULL;
857 
858 	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
859 	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
860 	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
861 	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
862 		return (B_FALSE);
863 	}
864 
865 	nmp = cmp = mp = NULL;
866 
867 	wqebbs = 0;
868 	bufcnt = 0;
869 	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
870 	    cent = mlxcx_cq_next(mlcq)) {
871 		/*
872 		 * Teardown and ring stop can atomic_or this flag
873 		 * into our state if they want us to stop early.
874 		 */
875 		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
876 			return (B_FALSE);
877 
878 		comp_cnt++;
879 		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
880 		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
881 			/* NOP */
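			/*
			 * A NOP send completion has no buffer attached to
			 * it; just release the single WQEBB the NOP used.
			 */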
882 			atomic_dec_64(&wq->mlwq_wqebb_used);
883 			goto nextcq;
884 		}
885 
886 lookagain:
887 		/*
888 		 * Generally the buffer we're looking for will be
889 		 * at the front of the list, so this loop won't
890 		 * need to look far.
891 		 */
892 		buf = list_head(&mlcq->mlcq_buffers);
893 		found = B_FALSE;
894 		while (buf != NULL) {
895 			if ((buf->mlb_wqe_index & UINT16_MAX) ==
896 			    from_be16(cent->mlcqe_wqe_counter)) {
897 				found = B_TRUE;
898 				break;
899 			}
900 			buf = list_next(&mlcq->mlcq_buffers, buf);
901 		}
902 
903 		if (!found) {
904 			/*
905 			 * If there's any buffers waiting on the
906 			 * buffers_b list, then merge those into
907 			 * the main list and have another look.
908 			 *
909 			 * The wq enqueue routines push new buffers
910 			 * into buffers_b so that they can avoid
911 			 * taking the mlcq_mtx and blocking us for
912 			 * every single packet.
913 			 */
914 			added = B_FALSE;
915 			mutex_enter(&mlcq->mlcq_bufbmtx);
916 			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
917 				list_move_tail(&mlcq->mlcq_buffers,
918 				    &mlcq->mlcq_buffers_b);
919 				added = B_TRUE;
920 			}
921 			mutex_exit(&mlcq->mlcq_bufbmtx);
922 			if (added)
923 				goto lookagain;
924 
925 			/*
926 			 * This check could go just after the lookagain
927 			 * label, but it is a hot code path so we don't
928 			 * want to unnecessarily grab a lock and check
929 			 * a flag for a relatively rare event (the ring
930 			 * being stopped).
931 			 */
932 			mutex_enter(&wq->mlwq_mtx);
933 			if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
934 				mutex_exit(&wq->mlwq_mtx);
935 				goto nextcq;
936 			}
937 			mutex_exit(&wq->mlwq_mtx);
938 
939 			buf = list_head(&mlcq->mlcq_buffers);
940 			mlxcx_warn(mlxp, "got completion on CQ %x but "
941 			    "no buffer matching wqe found: %x (first "
942 			    "buffer counter = %x)", mlcq->mlcq_num,
943 			    from_be16(cent->mlcqe_wqe_counter),
944 			    buf == NULL ? UINT32_MAX :
945 			    buf->mlb_wqe_index);
946 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
947 			goto nextcq;
948 		}
949 
950 		/*
951 		 * The buf is likely to be freed below, so count this now.
952 		 */
953 		wqebbs += buf->mlb_wqebbs;
954 
955 		list_remove(&mlcq->mlcq_buffers, buf);
956 		bufcnt++;
957 
958 		switch (mlcq->mlcq_wq->mlwq_type) {
959 		case MLXCX_WQ_TYPE_SENDQ:
960 			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
961 			break;
962 		case MLXCX_WQ_TYPE_RECVQ:
963 			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
964 			bytes += from_be32(cent->mlcqe_byte_cnt);
965 			if (nmp != NULL) {
966 				if (cmp != NULL) {
967 					cmp->b_next = nmp;
968 					cmp = nmp;
969 				} else {
970 					mp = cmp = nmp;
971 				}
972 
973 				rx_frames++;
974 			}
975 			break;
976 		}
977 
978 		/*
979 		 * Update the consumer index with what has been processed,
980 		 * followed by driver counters. It is important to tell the
981 		 * hardware first, otherwise when we throw more packets at
982 		 * it, it may get an overflow error.
983 		 * We do this whenever we've processed enough to bridge the
984 		 * high->low water mark.
985 		 */
986 		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
987 			mlxcx_update_cqci(mlxp, mlcq);
988 			/*
989 			 * Both these variables are updated using
990 			 * atomics as they are modified in other code paths
991 			 * (e.g. during tx) which hold different locks.
992 			 */
993 			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
994 			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
995 			wqebbs = 0;
996 			bufcnt = 0;
997 			comp_cnt = 0;
998 		}
999 nextcq:
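		/*
		 * Bound the work done in a single call: stop once we exceed
		 * the per-CQ rx frame budget, or the caller's byte limit
		 * (bytelim == 0 means no byte limit).
		 */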
1000 		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
1001 		    (bytelim != 0 && bytes > bytelim))
1002 			break;
1003 	}
1004 
1005 	if (comp_cnt > 0) {
1006 		mlxcx_update_cqci(mlxp, mlcq);
1007 		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
1008 		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
1009 	}
1010 
1011 	*mpp = mp;
1012 	return (B_TRUE);
1013 }
1014 
1015 
1016 mblk_t *
1017 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
1018 {
1019 	mblk_t *mp = NULL;
1020 
1021 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1022 
1023 	ASSERT(mlcq->mlcq_wq != NULL);
1024 	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
1025 
1026 	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
1027 
1028 	return (mp);
1029 }
1030 
1031 static uint_t
1032 mlxcx_intr_n(caddr_t arg, caddr_t arg2)
1033 {
1034 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1035 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
1036 	mlxcx_eventq_ent_t *ent;
1037 	mlxcx_completion_queue_t *mlcq, probe;
1038 	mlxcx_work_queue_t *mlwq;
1039 	mblk_t *mp = NULL;
1040 	boolean_t tellmac = B_FALSE;
1041 
1042 	if (!mlxcx_intr_ini(mlxp, mleq))
1043 		return (DDI_INTR_CLAIMED);
1044 
1045 	ent = mlxcx_eq_next(mleq);
1046 	if (ent == NULL) {
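		/*
		 * No new event was found on the EQ: a spurious interrupt.
		 * If this happens repeatedly, report the fault and disable
		 * this vector rather than continuing to spin on empty
		 * interrupts.
		 */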
1047 		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
1048 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
1049 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1050 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
1051 			    mleq->mleq_intr_index]);
1052 		}
1053 		goto done;
1054 	}
1055 	mleq->mleq_badintrs = 0;
1056 
1057 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
1058 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
1059 
1060 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
1061 		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
1062 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
1063 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1064 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
1065 			    mleq->mleq_intr_index]);
1066 			goto done;
1067 		}
1068 		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);
1069 
1070 		probe.mlcq_num =
1071 		    from_be24(ent->mleqe_completion.mled_completion_cqn);
1072 		mutex_enter(&mleq->mleq_mtx);
1073 		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
1074 		mutex_exit(&mleq->mleq_mtx);
1075 
1076 		if (mlcq == NULL)
1077 			continue;
1078 
1079 		mlwq = mlcq->mlcq_wq;
1080 
1081 		/*
1082 		 * mlcq_arm_mtx is used to avoid race conditions between
1083 		 * this interrupt routine and the transition from polling
1084 		 * back to interrupt mode. When exiting poll mode the
1085 		 * CQ is likely to be un-armed, which means there will
1086 		 * be no events for the CQ coming through here,
1087 		 * consequently very low contention on mlcq_arm_mtx.
1088 		 *
1089 		 * mlcq_arm_mtx must be released before calls into mac
1090 		 * layer in order to avoid deadlocks.
1091 		 */
1092 		mutex_enter(&mlcq->mlcq_arm_mtx);
1093 		mlcq->mlcq_ec++;
1094 		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);
1095 
1096 		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
1097 			/*
1098 			 * If we failed to take the mutex because the
1099 			 * polling function has it, just move on.
1100 			 * We don't want to block other CQs behind
1101 			 * this one.
1102 			 */
1103 			if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
1104 				mutex_exit(&mlcq->mlcq_arm_mtx);
1105 				goto update_eq;
1106 			}
1107 
1108 			/* Otherwise we will wait. */
1109 			mutex_enter(&mlcq->mlcq_mtx);
1110 		}
1111 
1112 		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
1113 		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
1114 			/*
1115 			 * The ring is not in polling mode and we processed
1116 			 * some completion queue entries.
1117 			 */
1118 			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
1119 			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
1120 				atomic_and_uint(&mlcq->mlcq_state,
1121 				    ~MLXCX_CQ_BLOCKED_MAC);
1122 				tellmac = B_TRUE;
1123 			}
1124 
1125 			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
1126 			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
1127 				atomic_and_uint(&mlwq->mlwq_state,
1128 				    ~MLXCX_WQ_BLOCKED_MAC);
1129 				tellmac = B_TRUE;
1130 			}
1131 
1132 			mlxcx_arm_cq(mlxp, mlcq);
1133 
1134 			mutex_exit(&mlcq->mlcq_mtx);
1135 			mutex_exit(&mlcq->mlcq_arm_mtx);
1136 
1137 			if (tellmac) {
1138 				mac_tx_ring_update(mlxp->mlx_mac_hdl,
1139 				    mlcq->mlcq_mac_hdl);
1140 				tellmac = B_FALSE;
1141 			}
1142 
1143 			if (mp != NULL) {
1144 				mac_rx_ring(mlxp->mlx_mac_hdl,
1145 				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
1146 			}
1147 		} else {
1148 			mutex_exit(&mlcq->mlcq_mtx);
1149 			mutex_exit(&mlcq->mlcq_arm_mtx);
1150 		}
1151 
1152 update_eq:
1153 		/*
1154 		 * Updating the consumer counter for an EQ requires a write
1155 		 * to the UAR, which is possibly expensive.
1156 		 *
1157 		 * Try to do it only often enough to stop us wrapping around.
1158 		 */
1159 		if ((mleq->mleq_cc & 0x7) == 0)
1160 			mlxcx_update_eq(mlxp, mleq);
1161 	}
1162 
1163 	mlxcx_arm_eq(mlxp, mleq);
1164 
1165 done:
1166 	mlxcx_intr_fini(mleq);
1167 	return (DDI_INTR_CLAIMED);
1168 }
1169 
1170 boolean_t
1171 mlxcx_intr_setup(mlxcx_t *mlxp)
1172 {
1173 	dev_info_t *dip = mlxp->mlx_dip;
1174 	int ret;
1175 	int nintrs = 0;
1176 	int navail = 0;
1177 	int types, i;
1178 	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
1179 
1180 	ret = ddi_intr_get_supported_types(dip, &types);
1181 	if (ret != DDI_SUCCESS) {
1182 		mlxcx_warn(mlxp, "Failed to get supported interrupt types");
1183 		return (B_FALSE);
1184 	}
1185 
1186 	if (!(types & DDI_INTR_TYPE_MSIX)) {
1187 		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
1188 		    "requires MSI-X");
1189 		return (B_FALSE);
1190 	}
1191 
1192 	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
1193 	if (ret != DDI_SUCCESS) {
1194 		mlxcx_warn(mlxp, "Failed to get number of interrupts");
1195 		return (B_FALSE);
1196 	}
1197 	if (nintrs < 2) {
1198 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
1199 		    "requires 2", nintrs);
1200 		return (B_FALSE);
1201 	}
1202 
1203 	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
1204 	if (navail < 2) {
1205 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
1206 		    "requires 2", navail);
1207 		return (B_FALSE);
1208 	}
1209 
1210 	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
1211 	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
1212 	/*
1213 	 * Interrupts for Completion Queue events start from vector 1
1214 	 * and run up to the last available vector. Vector 0 is used for
1215 	 * asynchronous events.
1216 	 */
1217 	mlxp->mlx_intr_cq0 = 1;
1218 
1219 	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
1220 	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
1221 	if (ret != DDI_SUCCESS) {
1222 		mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
1223 		mlxcx_intr_teardown(mlxp);
1224 		return (B_FALSE);
1225 	}
1226 	if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
1227 		mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
1228 		    "requires %d", mlxp->mlx_intr_count,
1229 		    mlxp->mlx_intr_cq0 + 1);
1230 		mlxcx_intr_teardown(mlxp);
1231 		return (B_FALSE);
1232 	}
1233 	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;
1234 
1235 	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
1236 	if (ret != DDI_SUCCESS) {
1237 		mlxcx_warn(mlxp, "Failed to get interrupt priority");
1238 		mlxcx_intr_teardown(mlxp);
1239 		return (B_FALSE);
1240 	}
1241 
1242 	/*
1243 	 * Set the interrupt priority for the asynchronous handler higher
1244 	 * than the ring handlers. Some operations which issue commands,
1245 	 * and thus rely on the async interrupt handler for posting
1246 	 * completion, do so with a CQ mutex held. The CQ mutex is also
1247 	 * acquired during ring processing, so if the ring processing vector
1248 	 * happens to be assigned to the same CPU as the async vector
1249 	 * it can hold off the async interrupt thread and lead to a deadlock.
1250 	 * By assigning a higher priority to the async vector, it will
1251 	 * always be dispatched.
1252 	 */
1253 	mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
1254 	if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
1255 		mlxp->mlx_async_intr_pri++;
1256 	} else {
1257 		mlxp->mlx_intr_pri--;
1258 	}
1259 
1260 	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
1261 	    sizeof (mlxcx_event_queue_t);
1262 	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
1263 
1264 	/*
1265 	 * In the failure path, mlxcx_intr_teardown() expects this
1266 	 * mutex and avl tree to be init'ed - so do it now.
1267 	 */
1268 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1269 		uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri :
1270 		    mlxp->mlx_intr_pri;
1271 
1272 		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
1273 		    DDI_INTR_PRI(pri));
1274 		cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);
1275 
1276 		if (i < mlxp->mlx_intr_cq0)
1277 			continue;
1278 
1279 		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
1280 		    sizeof (mlxcx_completion_queue_t),
1281 		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
1282 	}
1283 
1284 	ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
1285 	    mlxp->mlx_async_intr_pri);
1286 	if (ret != DDI_SUCCESS) {
1287 		mlxcx_warn(mlxp, "Failed to set interrupt priority to %u for "
1288 		    "async interrupt vector", mlxp->mlx_async_intr_pri);
1289 		mlxcx_intr_teardown(mlxp);
1290 		return (B_FALSE);
1291 	}
1292 
1293 	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
1294 	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
1295 	if (ret != DDI_SUCCESS) {
1296 		mlxcx_warn(mlxp, "Failed to add async interrupt handler");
1297 		mlxcx_intr_teardown(mlxp);
1298 		return (B_FALSE);
1299 	}
1300 
1301 	/*
1302 	 * If we have enough interrupts, set their "type" fields so that we
1303 	 * avoid mixing RX and TX queues on the same EQs.
1304 	 */
1305 	if (mlxp->mlx_intr_count >= 8) {
1306 		eqt = MLXCX_EQ_TYPE_RX;
1307 	}
1308 
1309 	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
1310 		mlxp->mlx_eqs[i].mleq_intr_index = i;
1311 
1312 		mlxp->mlx_eqs[i].mleq_type = eqt;
1313 		/*
1314 		 * If eqt is still ANY, just leave it set to that
1315 		 * (no else here).
1316 		 */
1317 		if (eqt == MLXCX_EQ_TYPE_RX) {
1318 			eqt = MLXCX_EQ_TYPE_TX;
1319 		} else if (eqt == MLXCX_EQ_TYPE_TX) {
1320 			eqt = MLXCX_EQ_TYPE_RX;
1321 		}
1322 
1323 		ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
1324 		    mlxp->mlx_intr_pri);
1325 		if (ret != DDI_SUCCESS) {
1326 			mlxcx_warn(mlxp, "Failed to set interrupt priority to "
1327 			    "%u for interrupt vector %d", mlxp->mlx_intr_pri,
1328 			    i);
1329 			mlxcx_intr_teardown(mlxp);
1330 			return (B_FALSE);
1331 		}
1332 
1333 		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
1334 		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
1335 		if (ret != DDI_SUCCESS) {
1336 			mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
1337 			    i);
1338 			mlxcx_intr_teardown(mlxp);
1339 			return (B_FALSE);
1340 		}
1341 	}
1342 
1343 	return (B_TRUE);
1344 }
1345