xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision d77e6e0f12d19668c0e9068c0fcd7a2123da5373)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 /*
23  * The PRM for this family of parts is freely available, and can be found at:
24  * https://www.mellanox.com/related-docs/user_manuals/ \
25  *   Ethernet_Adapters_Programming_Manual.pdf
26  */
27 /*
28  * ConnectX glossary
29  * -----------------
30  *
31  * WR		Work Request: something we've asked the hardware to do by
32  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
33  *
34  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
35  *
36  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
37  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
38  *		types have different WQE structures, different commands for
39  *		creating and destroying them, etc, but share a common context
40  *		structure, counter setup and state graph.
41  * SQ		Send Queue, a specific type of WQ that sends packets
42  * RQ		Receive Queue, a specific type of WQ that receives packets
43  *
44  * CQ		Completion Queue: completion of WRs from a WQ are reported to
45  *		one of these, as a CQE on its entry ring.
46  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
47  *		info, as well as packet size, the ID of the WQ, and the index
48  *		of the WQE which completed. Does not contain any packet data.
49  *
50  * EQ		Event Queue: a ring of event structs from the hardware informing
51  *		us when particular events happen. Many events can point at a
 52  *		particular CQ which we should then go look at.
53  * EQE		Event Queue Entry: an entry on the EQ ring
54  *
55  * UAR		User Access Region, a page of the device's PCI BAR which is
56  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
57  *		ring to arm them for interrupts or wake them up for new work
58  *
59  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
60  *		as a single unit (for e.g. hashing/RSS).
61  *
 62  * TIR		Transport Interface Receive, a bucket of resources for the
63  *		reception of packets. TIRs have to point at either a single RQ
64  *		or a table of RQs (RQT). They then serve as a target for flow
65  *		table entries (FEs). TIRs that point at an RQT also contain the
66  *		settings for hashing for RSS.
67  *
68  * TIS		Transport Interface Send, a bucket of resources associated with
69  *		the transmission of packets. In particular, the temporary
70  *		resources used for LSO internally in the card are accounted to
71  *		a TIS.
72  *
73  * FT		Flow Table, a collection of FEs and FGs that can be referred to
74  *		as a single entity (e.g. used as a target from another flow
75  *		entry or set as the "root" table to handle incoming or outgoing
76  *		packets). Packets arriving at a FT are matched against the
77  *		FEs in the table until either one matches with a terminating
78  *		action or all FEs are exhausted (it's first-match-wins but with
79  *		some actions that are non-terminal, like counting actions).
80  *
81  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
82  *		they match on the same attributes of packets coming into the
83  *		flow).
84  *
85  * FE		Flow Entry, an individual set of values to match against
86  *		packets entering the flow table, combined with an action to
87  *		take upon a successful match. The action we use most is
88  *		"forward", which sends the packets to a TIR or another flow
89  *		table and then stops further processing within the FE's FT.
90  *
91  * lkey/mkey	A reference to something similar to a page table but in the
92  *		device's internal onboard MMU. Since Connect-X parts double as
93  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
94  *		features which we try very hard not to use. For our WQEs we use
95  *		the "reserved" lkey, which is a special value which indicates
96  *		that addresses we give are linear addresses and should not be
97  *		translated.
98  *
99  * PD		Protection Domain, an IB concept. We have to allocate one to
100  *		provide as a parameter for new WQs, but we don't do anything
101  *		with it.
102  *
103  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
104  *		provide it as a parameter to TIR/TIS creation, but we don't do
105  *		anything with it.
106  */
107 /*
108  *
109  * Data flow overview
110  * ------------------
111  *
112  * This driver is a MAC ring-enabled driver which maps rings to send and recv
113  * queues in hardware on the device.
114  *
115  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
116  * sufficient space, and simplify the logic needed to work out which buffer
117  * was completed.
118  *
119  * The CQs are then round-robin allocated onto EQs, of which we set up one per
120  * interrupt that the system gives us for the device. Normally this means we
121  * have 8 EQs.
122  *
123  * When we have >= 8 EQs available, we try to allocate only RX or only TX
124  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
125  *
126  * EQ #0 is reserved for all event types other than completion events, and has
127  * no CQs associated with it at any time. EQs #1 and upwards are only used for
128  * handling CQ completion events.
129  *
130  * +------+     +------+           +------+        +---------+
131  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
132  * +------+     +------+     |     +------+        +---------+
133  *                           |
134  * +------+     +------+     |
135  * | SQ 1 |---->| CQ 1 |---+ |     +------+
136  * +------+     +------+   | +---> |      |
137  *                         |       |      |
138  * +------+     +------+   |       | EQ 1 |        +---------+
139  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
140  * +------+     +------+   | +---> |      |        +---------+
141  *                         | |     +------+
142  *                         | |
143  *   ...                   | |
144  *                         | |     +------+
145  * +------+     +------+   +-----> |      |
146  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
147  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
148  *                           |     |      |        +---------+
149  * +------+     +------+     | +-> |      |
150  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
151  * +------+     +------+       |
152  *                             |     ....
153  * +------+     +------+       |
154  * | RQ 2 |---->| CQ 5 |-------+
155  * +------+     +------+
156  *
157  *   ... (note this diagram does not show RX-only or TX-only EQs)
158  *
159  * For TX, we advertise all of the SQs we create as plain rings to MAC with
160  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
161  * and use the rings as it sees fit.
162  *
163  * For RX, we advertise actual groups in order to make use of hardware
164  * classification.
165  *
166  * The hardware classification we use is based around Flow Tables, and we
167  * currently ignore all of the eswitch features of the card. The NIC VPORT
168  * is always set to promisc mode so that the eswitch sends us all of the
169  * traffic that arrives on the NIC, and we use flow entries to manage
170  * everything.
171  *
172  * We use 2 layers of flow tables for classification: traffic arrives at the
173  * root RX flow table which contains MAC address filters. Those then send
174  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
175  * presence and VID filters.
176  *
177  * Since these parts only support doing RSS hashing on a single protocol at a
178  * time, we have to use a third layer of flow tables as well to break traffic
 179  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4, etc.)
180  * so that it can be sent to the appropriate TIR for hashing.
181  *
182  * Incoming packets
183  *        +           +---------+      +---------+
184  *        |        +->| group 0 |      | group 0 |
185  *        |        |  | vlan ft |  +-->| hash ft |
186  *        v        |  |   L1    |  |   |   L2    |
187  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
188  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
189  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
190  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
191  *        |        |  |         |  |   +---------+    +-----+    |     +------+
192  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
193  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
194  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
195  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
196  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
197  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
198  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
199  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
200  *   +---------+   |               ^   |  other  |-+
201  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
202  *   +---------+                   |               +->| TIR |--->| RQ0 |
203  *   |  MAC 1  |-+                 |                  +-----+    +-----+
204  *   +---------+ | +---------------+
205  *   |  MAC 2  |-+ |               ^
206  *   +---------+ | |               |
207  *   |  MAC 3  |-+ |  +---------+  |   +---------+
208  *   +---------+ | |  | group 1 |  |   | group 1 |
209  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
210  *   |         |   |  |   L1    |  | | |   L2    |
211  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
212  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
213  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
214  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
215  *                    |         |  |   +---------+    +-----+    |     +------+
216  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
217  *                    |         |  |   +---------+    +-----+    | RQT +------+
218  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
219  *                    |         |  |   +---------+    +-----+    |     |      |
220  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
221  *                    | promisc |--+   +---------+    +-----+    |     |      |
222  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
223  *                                     +---------+    +-----+    +-----+------+
224  *                                     |  other  |-+
225  *                                     +---------+ |
226  *                      .......                    |  +-----+    +-----+
227  *                                                 +->| TIR |--->| RQ3 |
228  *                                                    +-----+    +-----+
229  *
230  * Note that the "promisc" flow entries are only set/enabled when promisc
231  * mode is enabled for the NIC. All promisc flow entries point directly at
232  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
233  * the "default group" in MAC).
234  *
235  * The "default" entry in the L1 VLAN filter flow tables is used when there
236  * are no VLANs set for the group, to accept any traffic regardless of tag. It
237  * is deleted as soon as a VLAN filter is added (and re-instated if the
238  * last VLAN filter is removed).
239  *
240  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
241  * space for packet data (they're a collection of scatter pointers only). TX
242  * descriptors contain some space for "inline headers" (and the card requires
243  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
244  * but all the rest of the data comes from the gather pointers.
245  *
246  * When we get completions back they simply contain the ring index number of
247  * the WR (work request) which completed. So, we manage the buffers for actual
248  * packet data completely independently of the descriptors in this driver. When
249  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
250  * with the WQE index that we put it at, and therefore don't have to look at
251  * the original descriptor at all when handling completions.
252  *
253  * For RX, we create sufficient packet data buffers to fill 150% of the
254  * available descriptors for each ring. These all are pre-set-up for DMA and
255  * have an mblk_t associated with them (with desballoc()).
256  *
257  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
258  * large enough), or we copy it into a pre-allocated buffer set up in the same
 259  * way as for RX.
260  */
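
/*
 * A hedged sketch of that TX copy-vs-bind decision (the real TX path uses
 * its own helpers; mldp_tx_bind_threshold is the tunable loaded in
 * mlxcx_load_props() below):
 *
 *	if (MBLKL(mp) >= mlxp->mlx_props.mldp_tx_bind_threshold) {
 *		(DMA-bind the mblk's memory as a "foreign" buffer)
 *	} else {
 *		(bcopy the data into a pre-allocated driver buffer)
 *	}
 */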
261 
262 /*
263  * Buffer lifecycle: RX
264  * --------------------
265  *
266  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
267  * straightforward.
268  *
269  * It is created (and has all its memory allocated) at the time of starting up
270  * the RX ring it belongs to. Then it is placed on the "free" list in the
271  * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
272  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
273  * before making a WQE for it.
274  *
275  * After a completion event occurs, the packet is either discarded (and the
276  * buffer_t returned to the free list), or it is readied for loaning to MAC
277  * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
278  *
279  * Once MAC and the rest of the system have finished with the packet, they call
280  * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
281  * the fate of the buffer_t is determined by the state of the
282  * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
283  * will be returned to the free list, potentially to be recycled and used
 284  * again. But if the shard is draining (e.g. after a ring stop) there will be
285  * no recycling and the buffer_t is immediately destroyed.
286  *
 287  * At detach/teardown time, buffers are only ever destroyed from the free list.
288  *
289  *
290  *                         +
291  *                         |
292  *                         | mlxcx_buf_create
293  *                         |
294  *                         v
295  *                    +----+----+
296  *                    | created |
297  *                    +----+----+                        +------+
298  *                         |                             | dead |
299  *                         |                             +------+
300  *                         | mlxcx_buf_return                ^
301  *                         |                                 |
302  *                         v                                 | mlxcx_buf_destroy
303  * mlxcx_buf_destroy  +----+----+          +-----------+     |
304  *          +---------|  free   |<------no-| draining? |-yes-+
305  *          |         +----+----+          +-----------+
306  *          |              |                     ^
307  *          |              |                     |
308  *          v              | mlxcx_buf_take      | mlxcx_buf_return
309  *      +---+--+           v                     |
310  *      | dead |       +---+---+                 |
311  *      +------+       | on WQ |- - - - - - - - >O
312  *                     +---+---+                 ^
313  *                         |                     |
314  *                         |                     |
315  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
316  *                         v                     |
317  *                 +-------+--------+            |
318  *                 | on loan to MAC |----------->O
319  *                 +----------------+  freemsg()
320  *
321  */
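
/*
 * A minimal sketch of the freemsg() return path in that diagram (the buffer
 * field names here are illustrative; the shard's mlbs_mtx and
 * mlbs_free_nonempty are the real ones used later in this file):
 *
 *	mutex_enter(&s->mlbs_mtx);
 *	(remove the buffer from the "loaned" list)
 *	if (the shard is draining)
 *		(destroy the buffer immediately)
 *	else
 *		(put it back on the free list and signal mlbs_free_nonempty
 *		 so a waiting teardown can re-check its lists)
 *	mutex_exit(&s->mlbs_mtx);
 */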
322 
323 /*
324  * Buffer lifecycle: TX
325  * --------------------
326  *
327  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
328  * "foreign" buffers.
329  *
330  * The former have their memory allocated and DMA bound by this driver, while
331  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
332  * not owned by us, though we do DMA bind it (and take responsibility for
333  * un-binding it when we're done with them).
334  *
335  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
336  * SQ. Thus, there is a separate free list and mutex for each kind.
337  *
338  * Since a TX packet might consist of multiple mblks, we translate each mblk
339  * into exactly one buffer_t. The buffer_ts are chained together in the same
340  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
341  *
342  * Each chain of TX buffers may consist of foreign or driver buffers, in any
343  * mixture.
344  *
345  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
346  * it from the rest of the chain buffers.
347  *
348  * TX buffer chains are always returned to the free list by
349  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
350  * freeing all of the members.
351  *
352  * We only call freemsg() once, on the head of the TX buffer chain's original
353  * mblk. This is true whether we copied it or bound it in a foreign buffer.
354  */
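
/*
 * In sketch form, building and freeing a chain as described above looks
 * roughly like (argument lists abbreviated):
 *
 *	head = (buffer_t for the first mblk);
 *	head->mlb_tx_head = head;
 *	for (each further mblk in the message) {
 *		b = (buffer_t for this mblk, copied or bound);
 *		b->mlb_tx_head = head;
 *		list_insert_tail(&head->mlb_tx_chain, b);
 *	}
 *	...
 *	mlxcx_buf_return_chain(mlxp, head, ...);
 *
 * with freemsg() called exactly once, on the original mblk kept with the
 * chain head.
 */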
355 
356 /*
357  * Startup and command interface
358  * -----------------------------
359  *
360  * The command interface is the primary way in which we give control orders to
361  * the hardware (e.g. actions like "create this queue" or "delete this flow
362  * entry"). The command interface is never used to transmit or receive packets
363  * -- that takes place only on the queues that are set up through it.
364  *
365  * In mlxcx_cmd.c we implement our use of the command interface on top of a
366  * simple taskq. Since it's not performance critical, we busy-wait on command
367  * completions and only process a single command at a time.
368  *
369  * If this becomes a problem later we can wire command completions up to EQ 0
370  * once we have interrupts running.
371  *
372  * The startup/attach process for this card involves a bunch of different steps
373  * which are summarised pretty well in the PRM. We have to send a number of
374  * commands which do different things to start the card up, give it some pages
375  * of our own memory for it to use, then start creating all the entities that
376  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
377  * and TDoms.
378  */
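
/*
 * In rough pseudocode, the attach-time command sequence sketched above is
 * (a summary only, not every step mlxcx_attach() performs):
 *
 *	enable_hca(); query_issi(); set_issi();
 *	query_pages(boot); give_pages(...);
 *	query_hca_caps();
 *	query_pages(init); give_pages(...);
 *	init_hca(); set_driver_version();
 *	alloc_uar(); alloc_pd(); alloc_tdom();
 *	(then create the EQs, CQs and WQs and register with MAC)
 */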
379 
380 /*
381  * UARs
382  * ----
383  *
384  * The pages of the PCI BAR other than the first few are reserved for use as
385  * "UAR" sections in this device. Each UAR section can be used as a set of
386  * doorbells for our queues.
387  *
388  * Currently we just make one single UAR for all of our queues. It doesn't
389  * seem to be a major limitation yet.
390  *
 391  * When we're sending packets through an SQ, the PRM is not awfully clear about
392  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
393  * (it's clear on the pattern of alternation you're expected to use between
394  * even and odd for Blueflame sends, but not for regular doorbells).
395  *
396  * Currently we don't do the even-odd alternating pattern for ordinary
397  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
398  * least on Connect-X4 Lx.
399  */
400 
401 /*
402  * Lock ordering
403  * -------------
404  *
405  * Interrupt side:
406  *
407  *  - mleq_mtx
408  *    - mlcq_mtx
409  *      - mlcq_bufbmtx
410  *      - mlwq_mtx
411  *        - mlbs_mtx
412  *    - mlp_mtx
413  *
414  * GLD side:
415  *
416  *  - mlp_mtx
417  *    - mlg_mtx
418  *      - mlg_*.mlft_mtx
419  *    - mlp_*.mlft_mtx
420  *    - mlwq_mtx
421  *      - mlbs_mtx
422  *      - mlcq_bufbmtx
423  *  - mleq_mtx
424  *    - mlcq_mtx
425  *
426  */
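
/*
 * As a worked example of the GLD-side ordering (variable names are
 * illustrative), a path touching both the port and one of its groups takes
 * the locks top-down:
 *
 *	mutex_enter(&mlp->mlp_mtx);
 *	mutex_enter(&g->mlg_mtx);
 *	(work on the group)
 *	mutex_exit(&g->mlg_mtx);
 *	mutex_exit(&mlp->mlp_mtx);
 *
 * Acquiring mlg_mtx before mlp_mtx anywhere on the GLD side would invert
 * this hierarchy.
 */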
427 
428 #include <sys/modctl.h>
429 #include <sys/conf.h>
430 #include <sys/devops.h>
431 #include <sys/sysmacros.h>
432 #include <sys/time.h>
433 
434 #include <sys/mac_provider.h>
435 
436 #include <mlxcx.h>
437 
438 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
439 
440 #define	MLXCX_MODULE_NAME	"mlxcx"
441 /*
442  * We give this to the firmware, so it has to be in a fixed format that it
443  * understands.
444  */
445 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
446 
447 /*
448  * Firmware may take a while to reclaim pages. Try a set number of times.
449  */
450 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
451 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
452 
453 static void *mlxcx_softstate;
454 
455 /*
456  * Fault detection thresholds.
457  */
458 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
459 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
460 
461 static void
462 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
463 {
464 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
465 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
466 
467 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
468 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
469 
470 	/*
471 	 * Currently we have different queue size defaults for two
472 	 * categories of queues. One set for devices which support a
473 	 * maximum speed of 10Gb/s, and another for those above that.
474 	 */
475 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
476 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
477 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
478 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
479 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
480 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
481 	    MLXCX_PROTO_10G)) != 0) {
482 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
483 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
484 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
485 	} else {
486 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
487 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
488 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
489 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
490 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
491 	}
492 }
493 
494 /*
495  * Properties which may have different defaults based on hardware
496  * characteristics.
497  */
498 static void
499 mlxcx_load_model_props(mlxcx_t *mlxp)
500 {
501 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
502 
503 	mlxcx_load_prop_defaults(mlxp);
504 
505 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
506 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
507 	    p->mldp_cq_size_shift_default);
508 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
509 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
510 	    p->mldp_sq_size_shift_default);
511 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
512 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
513 	    p->mldp_rq_size_shift_default);
514 }
515 
516 static void
517 mlxcx_load_props(mlxcx_t *mlxp)
518 {
519 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
520 
521 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
522 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
523 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
524 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
525 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
526 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
527 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
528 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
529 	    MLXCX_CQEMOD_COUNT_DFLT);
530 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
531 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
532 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
533 
534 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
535 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
536 	    MLXCX_TX_NGROUPS_DFLT);
537 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
538 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
539 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
540 
541 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
542 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
543 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
544 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
545 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
546 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
547 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
548 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
549 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
550 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
551 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
552 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
553 
554 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
555 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
556 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
557 
558 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
559 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
560 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
561 
562 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
563 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
564 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
565 
566 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
567 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
568 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
569 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
570 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
571 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
572 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
573 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
574 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
575 
576 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
577 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
578 	    MLXCX_RX_PER_CQ_DEFAULT);
579 
580 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
581 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
582 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
583 		    "out of range. Defaulting to: %d. Valid values are from "
584 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
585 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
586 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
587 	}
588 }
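
/*
 * All of the tunables above are ordinary DDI properties, so they can be
 * overridden from mlxcx.conf. For example (values purely illustrative):
 *
 *	cq_size_shift = 9;
 *	rq_size_shift = 8;
 *	tx_bind_threshold = 2048;
 */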
589 
590 void
591 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
592 {
593 	va_list ap;
594 
595 	va_start(ap, fmt);
596 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
597 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
598 	} else {
599 		vcmn_err(CE_NOTE, fmt, ap);
600 	}
601 	va_end(ap);
602 }
603 
604 void
605 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
606 {
607 	va_list ap;
608 
609 	va_start(ap, fmt);
610 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
611 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
612 	} else {
613 		vcmn_err(CE_WARN, fmt, ap);
614 	}
615 	va_end(ap);
616 }
617 
618 void
619 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
620 {
621 	va_list ap;
622 
623 	va_start(ap, fmt);
624 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
625 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
626 	} else {
627 		vcmn_err(CE_PANIC, fmt, ap);
628 	}
629 	va_end(ap);
630 }
631 
632 uint16_t
633 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
634 {
635 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
636 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
637 }
638 
639 uint32_t
640 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
641 {
642 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
643 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
644 }
645 
646 uint64_t
647 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
648 {
649 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
650 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
651 }
652 
653 void
654 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
655 {
656 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
657 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
658 }
659 
660 void
661 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
662 {
663 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
664 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
665 }
666 
667 void
668 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
669 {
670 	/*
671 	 * The UAR is always inside the first BAR, which we mapped as
 672 	 * mlx_regs.
673 	 */
674 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
675 	    (uintptr_t)mlxp->mlx_regs_base;
676 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
677 }
678 
679 void
680 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
681 {
682 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
683 	    (uintptr_t)mlxp->mlx_regs_base;
684 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
685 }
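
/*
 * A hedged usage sketch for the UAR helpers above (DOORBELL_OFFSET is a
 * hypothetical name; the real offsets live with the register definitions):
 *
 *	mlxcx_uar_put64(mlxp, &mlxp->mlx_uar, DOORBELL_OFFSET, val);
 *
 * where the offset is relative to the start of our UAR page and val is the
 * doorbell payload laid out as the PRM specifies.
 */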
686 
687 static void
688 mlxcx_fm_fini(mlxcx_t *mlxp)
689 {
690 	if (mlxp->mlx_fm_caps == 0)
691 		return;
692 
693 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
694 		ddi_fm_handler_unregister(mlxp->mlx_dip);
695 
696 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
697 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
698 		pci_ereport_teardown(mlxp->mlx_dip);
699 
700 	ddi_fm_fini(mlxp->mlx_dip);
701 
702 	mlxp->mlx_fm_caps = 0;
703 }
704 
705 void
706 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
707 {
708 	uint64_t ena;
709 	char buf[FM_MAX_CLASS];
710 
711 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
712 		return;
713 
714 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
715 	ena = fm_ena_generate(0, FM_ENA_FMT1);
716 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
717 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
718 	    NULL);
719 }
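
/*
 * Callers pass a DDI_FM_DEVICE_* detail string; for example, the queue
 * check timers later in this file report a stalled queue with:
 *
 *	mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
 */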
720 
721 static int
722 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
723 {
724 	/*
 725 	 * As the driver can always deal with an error in any DMA or
726 	 * access handle, we can just return the fme_status value.
727 	 */
728 	pci_ereport_post(dip, err, NULL);
729 	return (err->fme_status);
730 }
731 
732 static void
733 mlxcx_fm_init(mlxcx_t *mlxp)
734 {
735 	ddi_iblock_cookie_t iblk;
736 	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
737 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;
738 
739 	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
740 	    DDI_PROP_DONTPASS, "fm_capable", def);
741 
742 	if (mlxp->mlx_fm_caps < 0) {
743 		mlxp->mlx_fm_caps = 0;
744 	}
745 	mlxp->mlx_fm_caps &= def;
746 
747 	if (mlxp->mlx_fm_caps == 0)
748 		return;
749 
750 	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
751 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
752 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
753 		pci_ereport_setup(mlxp->mlx_dip);
754 	}
755 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
756 		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
757 		    (void *)mlxp);
758 	}
759 }
760 
761 static void
762 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
763 {
764 	mlxcx_buffer_t *buf;
765 
766 	mutex_enter(&s->mlbs_mtx);
767 
768 	while (!list_is_empty(&s->mlbs_busy))
769 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
770 
771 	while (!list_is_empty(&s->mlbs_loaned))
772 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
773 
774 	while ((buf = list_head(&s->mlbs_free)) != NULL)
775 		mlxcx_buf_destroy(mlxp, buf);
776 
777 	list_destroy(&s->mlbs_free);
778 	list_destroy(&s->mlbs_busy);
779 	list_destroy(&s->mlbs_loaned);
780 	mutex_exit(&s->mlbs_mtx);
781 
782 	cv_destroy(&s->mlbs_free_nonempty);
783 	mutex_destroy(&s->mlbs_mtx);
784 }
785 
786 static void
787 mlxcx_teardown_bufs(mlxcx_t *mlxp)
788 {
789 	mlxcx_buf_shard_t *s;
790 
791 	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
792 		mlxcx_mlbs_teardown(mlxp, s);
793 		kmem_free(s, sizeof (mlxcx_buf_shard_t));
794 	}
795 	list_destroy(&mlxp->mlx_buf_shards);
796 
797 	kmem_cache_destroy(mlxp->mlx_bufs_cache);
798 }
799 
800 static void
801 mlxcx_teardown_pages(mlxcx_t *mlxp)
802 {
803 	uint_t nzeros = 0;
804 
805 	mutex_enter(&mlxp->mlx_pagemtx);
806 
807 	while (mlxp->mlx_npages > 0) {
808 		int32_t req, ret;
809 		uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];
810 
811 		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
812 		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
813 
814 		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
815 			mlxcx_warn(mlxp, "hardware refused to return pages, "
816 			    "leaking %u remaining pages", mlxp->mlx_npages);
817 			goto out;
818 		}
819 
820 		for (int32_t i = 0; i < ret; i++) {
821 			mlxcx_dev_page_t *mdp, probe;
822 			bzero(&probe, sizeof (probe));
823 			probe.mxdp_pa = pas[i];
824 
825 			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
826 
827 			if (mdp != NULL) {
828 				avl_remove(&mlxp->mlx_pages, mdp);
829 				mlxp->mlx_npages--;
830 				mlxcx_dma_free(&mdp->mxdp_dma);
831 				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
832 			} else {
833 				mlxcx_panic(mlxp, "hardware returned a page "
834 				    "with PA 0x%" PRIx64 " but we have no "
835 				    "record of giving out such a page", pas[i]);
836 			}
837 		}
838 
839 		/*
840 		 * If no pages were returned, note that fact.
841 		 */
842 		if (ret == 0) {
843 			nzeros++;
844 			if (nzeros > mlxcx_reclaim_tries) {
845 				mlxcx_warn(mlxp, "hardware refused to return "
846 				    "pages, leaking %u remaining pages",
847 				    mlxp->mlx_npages);
848 				goto out;
849 			}
850 			delay(drv_usectohz(mlxcx_reclaim_delay));
851 		}
852 	}
853 
854 	avl_destroy(&mlxp->mlx_pages);
855 
856 out:
857 	mutex_exit(&mlxp->mlx_pagemtx);
858 	mutex_destroy(&mlxp->mlx_pagemtx);
859 }
860 
861 static boolean_t
862 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
863 {
864 	ddi_device_acc_attr_t acc;
865 	ddi_dma_attr_t attr;
866 	boolean_t ret;
867 	size_t sz, i;
868 
869 	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
870 
871 	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
872 	mleq->mleq_nents = (1 << mleq->mleq_entshift);
873 	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
874 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
875 
876 	mlxcx_dma_acc_attr(mlxp, &acc);
877 	mlxcx_dma_queue_attr(mlxp, &attr);
878 
879 	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
880 	    B_TRUE, sz, B_TRUE);
881 	if (!ret) {
882 		mlxcx_warn(mlxp, "failed to allocate EQ memory");
883 		return (B_FALSE);
884 	}
885 
886 	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;
887 
888 	for (i = 0; i < mleq->mleq_nents; ++i)
889 		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;
890 
891 	mleq->mleq_state |= MLXCX_EQ_ALLOC;
892 
893 	return (B_TRUE);
894 }
895 
896 static void
897 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
898 {
899 	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
900 	if (mleq->mleq_state & MLXCX_EQ_CREATED)
901 		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
902 
903 	mlxcx_dma_free(&mleq->mleq_dma);
904 	mleq->mleq_ent = NULL;
905 
906 	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
907 }
908 
909 void
910 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
911 {
912 	mlxcx_flow_group_t *fg;
913 	mlxcx_flow_entry_t *fe;
914 	int i;
915 
916 	ASSERT(mutex_owned(&ft->mlft_mtx));
917 
918 	for (i = ft->mlft_nents - 1; i >= 0; --i) {
919 		fe = &ft->mlft_ent[i];
920 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
921 			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
922 				mlxcx_panic(mlxp, "failed to delete flow "
923 				    "entry %u on table %u", i,
924 				    ft->mlft_num);
925 			}
926 		}
927 	}
928 
929 	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
930 		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
931 		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
932 			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
933 				mlxcx_panic(mlxp, "failed to destroy flow "
934 				    "group %u", fg->mlfg_num);
935 			}
936 		}
937 		kmem_free(fg, sizeof (mlxcx_flow_group_t));
938 	}
939 	list_destroy(&ft->mlft_groups);
940 	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
941 	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
942 		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
943 			mlxcx_panic(mlxp, "failed to destroy flow table %u",
944 			    ft->mlft_num);
945 		}
946 	}
947 	kmem_free(ft->mlft_ent, ft->mlft_entsize);
948 	ft->mlft_ent = NULL;
949 	mutex_exit(&ft->mlft_mtx);
950 	mutex_destroy(&ft->mlft_mtx);
951 	kmem_free(ft, sizeof (mlxcx_flow_table_t));
952 }
953 
954 static void
955 mlxcx_teardown_ports(mlxcx_t *mlxp)
956 {
957 	uint_t i;
958 	mlxcx_port_t *p;
959 	mlxcx_flow_table_t *ft;
960 
961 	for (i = 0; i < mlxp->mlx_nports; ++i) {
962 		p = &mlxp->mlx_ports[i];
963 		if (!(p->mlp_init & MLXCX_PORT_INIT))
964 			continue;
965 		mutex_enter(&p->mlp_mtx);
966 		if ((ft = p->mlp_rx_flow) != NULL) {
967 			mutex_enter(&ft->mlft_mtx);
968 			/*
969 			 * teardown_flow_table() will destroy the mutex, so
970 			 * we don't release it here.
971 			 */
972 			mlxcx_teardown_flow_table(mlxp, ft);
973 		}
974 		mutex_exit(&p->mlp_mtx);
975 		mutex_destroy(&p->mlp_mtx);
976 		p->mlp_init &= ~MLXCX_PORT_INIT;
977 	}
978 
979 	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
980 	mlxp->mlx_ports = NULL;
981 }
982 
983 static void
984 mlxcx_teardown_wqs(mlxcx_t *mlxp)
985 {
986 	mlxcx_work_queue_t *mlwq;
987 
988 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
989 		mlxcx_wq_teardown(mlxp, mlwq);
990 	}
991 	list_destroy(&mlxp->mlx_wqs);
992 }
993 
994 static void
995 mlxcx_teardown_cqs(mlxcx_t *mlxp)
996 {
997 	mlxcx_completion_queue_t *mlcq;
998 
999 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1000 		mlxcx_cq_teardown(mlxp, mlcq);
1001 	}
1002 	list_destroy(&mlxp->mlx_cqs);
1003 }
1004 
1005 static void
1006 mlxcx_teardown_eqs(mlxcx_t *mlxp)
1007 {
1008 	mlxcx_event_queue_t *mleq;
1009 	uint_t i;
1010 
1011 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1012 		mleq = &mlxp->mlx_eqs[i];
1013 		mutex_enter(&mleq->mleq_mtx);
1014 		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
1015 		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
1016 			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
1017 				mlxcx_warn(mlxp, "failed to destroy "
1018 				    "event queue idx %u eqn %u",
1019 				    i, mleq->mleq_num);
1020 			}
1021 		}
1022 		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
1023 			mlxcx_eq_rele_dma(mlxp, mleq);
1024 		}
1025 		mutex_exit(&mleq->mleq_mtx);
1026 	}
1027 }
1028 
1029 static void
1030 mlxcx_teardown_checktimers(mlxcx_t *mlxp)
1031 {
1032 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
1033 		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
1034 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
1035 		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
1036 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
1037 		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
1038 }
1039 
1040 static void
1041 mlxcx_teardown(mlxcx_t *mlxp)
1042 {
1043 	uint_t i;
1044 	dev_info_t *dip = mlxp->mlx_dip;
1045 
1046 	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
1047 		mlxcx_teardown_groups(mlxp);
1048 		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
1049 	}
1050 
1051 	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
1052 		mlxcx_teardown_checktimers(mlxp);
1053 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
1054 	}
1055 
1056 	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
1057 		mlxcx_teardown_wqs(mlxp);
1058 		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
1059 	}
1060 
1061 	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
1062 		mlxcx_teardown_cqs(mlxp);
1063 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
1064 	}
1065 
1066 	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
1067 		mlxcx_teardown_bufs(mlxp);
1068 		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
1069 	}
1070 
1071 	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
1072 		mlxcx_teardown_ports(mlxp);
1073 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
1074 	}
1075 
1076 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1077 		mlxcx_teardown_eqs(mlxp);
1078 		mlxcx_intr_teardown(mlxp);
1079 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
1080 	}
1081 
1082 	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
1083 		if (mlxp->mlx_uar.mlu_allocated) {
1084 			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
1085 				mlxcx_warn(mlxp, "failed to release UAR");
1086 			}
1087 			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
1088 				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
1089 		}
1090 		if (mlxp->mlx_pd.mlpd_allocated &&
1091 		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
1092 			mlxcx_warn(mlxp, "failed to release PD");
1093 		}
1094 		if (mlxp->mlx_tdom.mltd_allocated &&
1095 		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
1096 			mlxcx_warn(mlxp, "failed to release TDOM");
1097 		}
1098 		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
1099 	}
1100 
1101 	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
1102 		if (!mlxcx_cmd_teardown_hca(mlxp)) {
1103 			mlxcx_warn(mlxp, "failed to send teardown HCA "
1104 			    "command during device detach");
1105 		}
1106 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
1107 	}
1108 
1109 	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
1110 		mlxcx_teardown_pages(mlxp);
1111 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
1112 	}
1113 
1114 	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
1115 		if (!mlxcx_cmd_disable_hca(mlxp)) {
1116 			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
1117 			    "during device detach");
1118 		}
1119 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
1120 	}
1121 
1122 	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
1123 		mlxcx_cmd_queue_fini(mlxp);
1124 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
1125 	}
1126 
1127 	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
1128 		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
1129 		mlxp->mlx_caps = NULL;
1130 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
1131 	}
1132 
1133 	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
1134 		ddi_regs_map_free(&mlxp->mlx_regs_handle);
1135 		mlxp->mlx_regs_handle = NULL;
1136 		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
1137 	}
1138 
1139 	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
1140 		pci_config_teardown(&mlxp->mlx_cfg_handle);
1141 		mlxp->mlx_cfg_handle = NULL;
1142 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
1143 	}
1144 
1145 	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
1146 		mlxcx_fm_fini(mlxp);
1147 		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
1148 	}
1149 
1150 	VERIFY3S(mlxp->mlx_attach, ==, 0);
1151 	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
1152 	ddi_set_driver_private(dip, NULL);
1153 }
1154 
1155 static boolean_t
1156 mlxcx_regs_map(mlxcx_t *mlxp)
1157 {
1158 	off_t memsize;
1159 	int ret;
1160 	ddi_device_acc_attr_t da;
1161 
1162 	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
1163 	    DDI_SUCCESS) {
1164 		mlxcx_warn(mlxp, "failed to get register set size");
1165 		return (B_FALSE);
1166 	}
1167 
1168 	/*
1169 	 * All data in the main BAR is kept in big-endian even though it's a PCI
1170 	 * device.
1171 	 */
1172 	bzero(&da, sizeof (ddi_device_acc_attr_t));
1173 	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
1174 	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
1175 	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
1176 	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
1177 		da.devacc_attr_access = DDI_FLAGERR_ACC;
1178 	} else {
1179 		da.devacc_attr_access = DDI_DEFAULT_ACC;
1180 	}
1181 
1182 	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
1183 	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);
1184 
1185 	if (ret != DDI_SUCCESS) {
1186 		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
1187 		return (B_FALSE);
1188 	}
1189 
1190 	return (B_TRUE);
1191 }
1192 
1193 static boolean_t
1194 mlxcx_check_issi(mlxcx_t *mlxp)
1195 {
1196 	uint32_t issi;
1197 
1198 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1199 		mlxcx_warn(mlxp, "failed to get ISSI");
1200 		return (B_FALSE);
1201 	}
1202 
1203 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1204 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1205 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1206 		return (B_FALSE);
1207 	}
1208 
1209 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1210 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1211 		    MLXCX_CURRENT_ISSI);
1212 		return (B_FALSE);
1213 	}
1214 
1215 	return (B_TRUE);
1216 }
1217 
1218 boolean_t
1219 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages)
1220 {
1221 	ddi_device_acc_attr_t acc;
1222 	ddi_dma_attr_t attr;
1223 	int32_t i;
1224 	list_t plist;
1225 	mlxcx_dev_page_t *mdp;
1226 	const ddi_dma_cookie_t *ck;
1227 
1228 	/*
1229 	 * If there are no pages required, then we're done here.
1230 	 */
1231 	if (npages <= 0) {
1232 		return (B_TRUE);
1233 	}
1234 
1235 	list_create(&plist, sizeof (mlxcx_dev_page_t),
1236 	    offsetof(mlxcx_dev_page_t, mxdp_list));
1237 
1238 	for (i = 0; i < npages; i++) {
1239 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
1240 		mlxcx_dma_acc_attr(mlxp, &acc);
1241 		mlxcx_dma_page_attr(mlxp, &attr);
1242 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
1243 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
1244 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
1245 			    npages);
1246 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1247 			goto cleanup_npages;
1248 		}
1249 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
1250 		mdp->mxdp_pa = ck->dmac_laddress;
1251 
1252 		list_insert_tail(&plist, mdp);
1253 	}
1254 
1255 	/*
 1256 	 * Now that all of the pages have been allocated, give them to hardware
1257 	 * in chunks.
1258 	 */
1259 	while (npages > 0) {
1260 		mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
1261 		int32_t togive = MIN(MLXCX_MANAGE_PAGES_MAX_PAGES, npages);
1262 
1263 		for (i = 0; i < togive; i++) {
1264 			pages[i] = list_remove_head(&plist);
1265 		}
1266 
1267 		if (!mlxcx_cmd_give_pages(mlxp,
1268 		    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
1269 			mlxcx_warn(mlxp, "!hardware refused our gift of %u "
1270 			    "pages!", togive);
1271 			for (i = 0; i < togive; i++) {
1272 				list_insert_tail(&plist, pages[i]);
1273 			}
1274 			goto cleanup_npages;
1275 		}
1276 
1277 		mutex_enter(&mlxp->mlx_pagemtx);
1278 		for (i = 0; i < togive; i++) {
1279 			avl_add(&mlxp->mlx_pages, pages[i]);
1280 		}
1281 		mlxp->mlx_npages += togive;
1282 		mutex_exit(&mlxp->mlx_pagemtx);
1283 		npages -= togive;
1284 	}
1285 
1286 	list_destroy(&plist);
1287 
1288 	return (B_TRUE);
1289 
1290 cleanup_npages:
1291 	while ((mdp = list_remove_head(&plist)) != NULL) {
1292 		mlxcx_dma_free(&mdp->mxdp_dma);
1293 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1294 	}
1295 	list_destroy(&plist);
1296 	return (B_FALSE);
1297 }
1298 
1299 static boolean_t
1300 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1301 {
1302 	int32_t npages;
1303 
1304 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1305 		mlxcx_warn(mlxp, "failed to determine boot pages");
1306 		return (B_FALSE);
1307 	}
1308 
1309 	return (mlxcx_give_pages(mlxp, npages));
1310 }
1311 
1312 static int
1313 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1314 {
1315 	mlxcx_t *mlxp = cookie;
1316 	mlxcx_buffer_t *b = arg;
1317 
1318 	bzero(b, sizeof (mlxcx_buffer_t));
1319 	b->mlb_mlx = mlxp;
1320 	b->mlb_state = MLXCX_BUFFER_INIT;
1321 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1322 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1323 
1324 	return (0);
1325 }
1326 
1327 static void
1328 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1329 {
1330 	mlxcx_t *mlxp = cookie;
1331 	mlxcx_buffer_t *b = arg;
1332 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1333 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1334 	list_destroy(&b->mlb_tx_chain);
1335 }
1336 
1337 mlxcx_buf_shard_t *
1338 mlxcx_mlbs_create(mlxcx_t *mlxp)
1339 {
1340 	mlxcx_buf_shard_t *s;
1341 
1342 	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1343 
1344 	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1345 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1346 	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1347 	    offsetof(mlxcx_buffer_t, mlb_entry));
1348 	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1349 	    offsetof(mlxcx_buffer_t, mlb_entry));
1350 	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
1351 	    offsetof(mlxcx_buffer_t, mlb_entry));
1352 	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1353 
1354 	list_insert_tail(&mlxp->mlx_buf_shards, s);
1355 
1356 	return (s);
1357 }
1358 
1359 static boolean_t
1360 mlxcx_setup_bufs(mlxcx_t *mlxp)
1361 {
1362 	char namebuf[KSTAT_STRLEN];
1363 
1364 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1365 	    ddi_get_instance(mlxp->mlx_dip));
1366 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1367 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1368 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1369 	    NULL, mlxp, NULL, 0);
1370 
1371 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1372 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1373 
1374 	return (B_TRUE);
1375 }
1376 
1377 static void
1378 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
1379     const char *state, uint8_t statenum)
1380 {
1381 	uint64_t ena;
1382 	char buf[FM_MAX_CLASS];
1383 
1384 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1385 		return;
1386 
1387 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1388 	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
1389 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1390 
1391 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1392 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1393 	    "state", DATA_TYPE_STRING, state,
1394 	    "state_num", DATA_TYPE_UINT8, statenum,
1395 	    "qtype", DATA_TYPE_STRING, qtype,
1396 	    "qnum", DATA_TYPE_UINT32, qnum,
1397 	    NULL);
1398 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1399 }
1400 
1401 static void
1402 mlxcx_eq_check(void *arg)
1403 {
1404 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1405 	mlxcx_event_queue_t *eq;
1406 	mlxcx_eventq_ctx_t ctx;
1407 	const char *str;
1408 
1409 	uint_t i;
1410 
1411 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1412 		eq = &mlxp->mlx_eqs[i];
1413 		if (!(eq->mleq_state & MLXCX_EQ_CREATED) ||
1414 		    (eq->mleq_state & MLXCX_EQ_DESTROYED))
1415 			continue;
1416 		mutex_enter(&eq->mleq_mtx);
1417 		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) {
1418 			mutex_exit(&eq->mleq_mtx);
1419 			continue;
1420 		}
1421 
1422 		str = "???";
1423 		switch (ctx.mleqc_status) {
1424 		case MLXCX_EQ_STATUS_OK:
1425 			break;
1426 		case MLXCX_EQ_STATUS_WRITE_FAILURE:
1427 			str = "WRITE_FAILURE";
1428 			break;
1429 		}
1430 		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
1431 			mlxcx_fm_qstate_ereport(mlxp, "event",
1432 			    eq->mleq_num, str, ctx.mleqc_status);
1433 			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
1434 			    eq->mleq_intr_index, ctx.mleqc_status, str);
1435 		}
1436 
1437 		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
1438 		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
1439 			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
1440 			    ++eq->mleq_check_disarm_cnt >= 3) {
1441 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1442 				mlxcx_warn(mlxp, "EQ %u isn't armed",
1443 				    eq->mleq_intr_index);
1444 			}
1445 			eq->mleq_check_disarm_cc = eq->mleq_cc;
1446 		} else {
1447 			eq->mleq_check_disarm_cc = 0;
1448 			eq->mleq_check_disarm_cnt = 0;
1449 		}
1450 
1451 		mutex_exit(&eq->mleq_mtx);
1452 	}
1453 }
1454 
1455 static void
1456 mlxcx_cq_check(void *arg)
1457 {
1458 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1459 	mlxcx_completion_queue_t *cq;
1460 	mlxcx_completionq_ctx_t ctx;
1461 	const char *str, *type;
1462 	uint_t v;
1463 
1464 	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
1465 	    cq = list_next(&mlxp->mlx_cqs, cq)) {
1466 		mutex_enter(&cq->mlcq_mtx);
1467 		if (!(cq->mlcq_state & MLXCX_CQ_CREATED) ||
1468 		    (cq->mlcq_state & MLXCX_CQ_DESTROYED) ||
1469 		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
1470 			mutex_exit(&cq->mlcq_mtx);
1471 			continue;
1472 		}
1473 		if (cq->mlcq_fm_repd_qstate) {
1474 			mutex_exit(&cq->mlcq_mtx);
1475 			continue;
1476 		}
1477 		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) {
1478 			mutex_exit(&cq->mlcq_mtx);
1479 			continue;
1480 		}
1481 		if (cq->mlcq_wq != NULL) {
1482 			mlxcx_work_queue_t *wq = cq->mlcq_wq;
1483 			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
1484 				type = "rx ";
1485 			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
1486 				type = "tx ";
1487 			else
1488 				type = "";
1489 		} else {
1490 			type = "";
1491 		}
1492 
1493 		str = "???";
1494 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
1495 		switch (v) {
1496 		case MLXCX_CQC_STATUS_OK:
1497 			break;
1498 		case MLXCX_CQC_STATUS_OVERFLOW:
1499 			str = "OVERFLOW";
1500 			break;
1501 		case MLXCX_CQC_STATUS_WRITE_FAIL:
1502 			str = "WRITE_FAIL";
1503 			break;
1504 		case MLXCX_CQC_STATUS_INVALID:
1505 			str = "INVALID";
1506 			break;
1507 		}
1508 		if (v != MLXCX_CQC_STATUS_OK) {
1509 			mlxcx_fm_qstate_ereport(mlxp, "completion",
1510 			    cq->mlcq_num, str, v);
1511 			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
1512 			    type, cq->mlcq_num, v, str);
1513 			cq->mlcq_fm_repd_qstate = B_TRUE;
1514 		}
1515 
1516 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
1517 		if (v != MLXCX_CQC_STATE_ARMED &&
1518 		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
1519 		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
1520 			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
1521 			    ++cq->mlcq_check_disarm_cnt >= 3) {
1522 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1523 				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
1524 				    type, cq->mlcq_num, cq);
1525 			}
1526 			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
1527 		} else {
1528 			cq->mlcq_check_disarm_cnt = 0;
1529 			cq->mlcq_check_disarm_cc = 0;
1530 		}
1531 		mutex_exit(&cq->mlcq_mtx);
1532 	}
1533 }
1534 
1535 void
1536 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
1537 {
1538 	mlxcx_sq_ctx_t ctx;
1539 	mlxcx_sq_state_t state;
1540 
1541 	ASSERT(mutex_owned(&sq->mlwq_mtx));
1542 
1543 	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
1544 		return;
1545 
1546 	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
1547 	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
1548 	switch (state) {
1549 	case MLXCX_SQ_STATE_RST:
1550 		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1551 			mlxcx_fm_qstate_ereport(mlxp, "send",
1552 			    sq->mlwq_num, "RST", state);
1553 			sq->mlwq_fm_repd_qstate = B_TRUE;
1554 		}
1555 		break;
1556 	case MLXCX_SQ_STATE_RDY:
1557 		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
1558 			mlxcx_fm_qstate_ereport(mlxp, "send",
1559 			    sq->mlwq_num, "RDY", state);
1560 			sq->mlwq_fm_repd_qstate = B_TRUE;
1561 		}
1562 		break;
1563 	case MLXCX_SQ_STATE_ERR:
1564 		mlxcx_fm_qstate_ereport(mlxp, "send",
1565 		    sq->mlwq_num, "ERR", state);
1566 		sq->mlwq_fm_repd_qstate = B_TRUE;
1567 		break;
1568 	default:
1569 		mlxcx_fm_qstate_ereport(mlxp, "send",
1570 		    sq->mlwq_num, "???", state);
1571 		sq->mlwq_fm_repd_qstate = B_TRUE;
1572 		break;
1573 	}
1574 }
1575 
1576 void
1577 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1578 {
1579 	mlxcx_rq_ctx_t ctx;
1580 	mlxcx_rq_state_t state;
1581 
1582 	ASSERT(mutex_owned(&rq->mlwq_mtx));
1583 
1584 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1585 		return;
1586 
1587 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1588 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1589 	switch (state) {
1590 	case MLXCX_RQ_STATE_RST:
1591 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1592 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1593 			    rq->mlwq_num, "RST", state);
1594 			rq->mlwq_fm_repd_qstate = B_TRUE;
1595 		}
1596 		break;
1597 	case MLXCX_RQ_STATE_RDY:
1598 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1599 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1600 			    rq->mlwq_num, "RDY", state);
1601 			rq->mlwq_fm_repd_qstate = B_TRUE;
1602 		}
1603 		break;
1604 	case MLXCX_RQ_STATE_ERR:
1605 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1606 		    rq->mlwq_num, "ERR", state);
1607 		rq->mlwq_fm_repd_qstate = B_TRUE;
1608 		break;
1609 	default:
1610 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1611 		    rq->mlwq_num, "???", state);
1612 		rq->mlwq_fm_repd_qstate = B_TRUE;
1613 		break;
1614 	}
1615 }
1616 
1617 static void
1618 mlxcx_wq_check(void *arg)
1619 {
1620 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1621 	mlxcx_work_queue_t *wq;
1622 
1623 	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
1624 	    wq = list_next(&mlxp->mlx_wqs, wq)) {
1625 		mutex_enter(&wq->mlwq_mtx);
1626 		if (!(wq->mlwq_state & MLXCX_WQ_CREATED) ||
1627 		    (wq->mlwq_state & MLXCX_WQ_DESTROYED) ||
1628 		    (wq->mlwq_state & MLXCX_WQ_TEARDOWN)) {
1629 			mutex_exit(&wq->mlwq_mtx);
1630 			continue;
1631 		}
1632 		if (wq->mlwq_fm_repd_qstate) {
1633 			mutex_exit(&wq->mlwq_mtx);
1634 			continue;
1635 		}
1636 		switch (wq->mlwq_type) {
1637 		case MLXCX_WQ_TYPE_SENDQ:
1638 			mlxcx_check_sq(mlxp, wq);
1639 			break;
1640 		case MLXCX_WQ_TYPE_RECVQ:
1641 			mlxcx_check_rq(mlxp, wq);
1642 			break;
1643 		}
1644 		mutex_exit(&wq->mlwq_mtx);
1645 	}
1646 }
1647 
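/*
 * Start the periodic fault-check timers which drive mlxcx_eq_check(),
 * mlxcx_cq_check() and mlxcx_wq_check() above. Each interval comes from the
 * corresponding driver property (in seconds, converted to nanoseconds for
 * ddi_periodic_add()); an interval of zero leaves that check disabled.
 */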
1648 static boolean_t
1649 mlxcx_setup_checktimers(mlxcx_t *mlxp)
1650 {
1651 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
1652 		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
1653 		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
1654 		    DDI_IPL_0);
1655 	}
1656 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
1657 		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
1658 		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
1659 		    DDI_IPL_0);
1660 	}
1661 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
1662 		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
1663 		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
1664 		    DDI_IPL_0);
1665 	}
1666 	return (B_TRUE);
1667 }
1668 
1669 int
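/*
 * AVL comparators: mlxcx_dmac_fe_compare() orders flow entries by
 * destination MAC and then VLAN ID (the per-port mlp_dmac_fe tree),
 * mlxcx_grmac_compare() orders mlxcx_group_mac_t entries by MAC address
 * (e.g. a group's mlg_rx_macs tree), and mlxcx_page_compare() orders
 * device pages by physical address (the mlx_pages tree).
 */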
1670 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1671 {
1672 	const mlxcx_flow_entry_t *left = arg0;
1673 	const mlxcx_flow_entry_t *right = arg1;
1674 	int bcmpr;
1675 
1676 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1677 	    sizeof (left->mlfe_dmac));
1678 	if (bcmpr < 0)
1679 		return (-1);
1680 	if (bcmpr > 0)
1681 		return (1);
1682 	if (left->mlfe_vid < right->mlfe_vid)
1683 		return (-1);
1684 	if (left->mlfe_vid > right->mlfe_vid)
1685 		return (1);
1686 	return (0);
1687 }
1688 
1689 int
1690 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1691 {
1692 	const mlxcx_group_mac_t *left = arg0;
1693 	const mlxcx_group_mac_t *right = arg1;
1694 	int bcmpr;
1695 
1696 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1697 	    sizeof (left->mlgm_mac));
1698 	if (bcmpr < 0)
1699 		return (-1);
1700 	if (bcmpr > 0)
1701 		return (1);
1702 	return (0);
1703 }
1704 
1705 int
1706 mlxcx_page_compare(const void *arg0, const void *arg1)
1707 {
1708 	const mlxcx_dev_page_t *p0 = arg0;
1709 	const mlxcx_dev_page_t *p1 = arg1;
1710 
1711 	if (p0->mxdp_pa < p1->mxdp_pa)
1712 		return (-1);
1713 	if (p0->mxdp_pa > p1->mxdp_pa)
1714 		return (1);
1715 	return (0);
1716 }
1717 
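/*
 * Query the state of every physical port and build its root receive flow
 * table. Each root table gets three flow groups: a single broadcast entry,
 * a block of unicast/multicast MAC entries, and a single catch-all entry
 * used for promiscuous mode (see the comments in the body below).
 */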
1718 static boolean_t
1719 mlxcx_setup_ports(mlxcx_t *mlxp)
1720 {
1721 	uint_t i, j;
1722 	mlxcx_port_t *p;
1723 	mlxcx_flow_table_t *ft;
1724 	mlxcx_flow_group_t *fg;
1725 	mlxcx_flow_entry_t *fe;
1726 
1727 	VERIFY3U(mlxp->mlx_nports, >, 0);
1728 	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
1729 	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);
1730 
1731 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1732 		p = &mlxp->mlx_ports[i];
1733 		p->mlp_num = i;
1734 		p->mlp_init |= MLXCX_PORT_INIT;
1735 		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
1736 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1737 		mutex_enter(&p->mlp_mtx);
1738 		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
1739 			mutex_exit(&p->mlp_mtx);
1740 			goto err;
1741 		}
1742 		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
1743 			mutex_exit(&p->mlp_mtx);
1744 			goto err;
1745 		}
1746 		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
1747 			mutex_exit(&p->mlp_mtx);
1748 			goto err;
1749 		}
1750 		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
1751 			mutex_exit(&p->mlp_mtx);
1752 			goto err;
1753 		}
1754 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
1755 		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
1756 			mutex_exit(&p->mlp_mtx);
1757 			goto err;
1758 		}
1759 		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
1760 			mutex_exit(&p->mlp_mtx);
1761 			goto err;
1762 		}
1763 		p->mlp_fec_requested = LINK_FEC_AUTO;
1764 
1765 		mutex_exit(&p->mlp_mtx);
1766 	}
1767 
1768 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1769 		p = &mlxp->mlx_ports[i];
1770 		mutex_enter(&p->mlp_mtx);
1771 		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1772 		    KM_SLEEP));
1773 		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1774 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1775 
1776 		mutex_enter(&ft->mlft_mtx);
1777 
1778 		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1779 		ft->mlft_port = p;
1780 		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
1781 		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
1782 			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
1783 		ft->mlft_nents = (1 << ft->mlft_entshift);
1784 		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1785 		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1786 		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1787 		    offsetof(mlxcx_flow_group_t, mlfg_entry));
1788 
1789 		for (j = 0; j < ft->mlft_nents; ++j) {
1790 			ft->mlft_ent[j].mlfe_table = ft;
1791 			ft->mlft_ent[j].mlfe_index = j;
1792 		}
1793 
1794 		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1795 			mutex_exit(&ft->mlft_mtx);
1796 			mutex_exit(&p->mlp_mtx);
1797 			goto err;
1798 		}
1799 
1800 		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
1801 			mutex_exit(&ft->mlft_mtx);
1802 			mutex_exit(&p->mlp_mtx);
1803 			goto err;
1804 		}
1805 
1806 		/*
1807 		 * We match broadcast at the top of the root flow table, then
1808 		 * all multicast/unicast MACs, and finally the promisc entry
1809 		 * at the very bottom.
1810 		 *
1811 		 * This way when promisc is on, that entry simply catches any
1812 		 * remaining traffic that earlier flows haven't matched.
1813 		 */
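		/*
		 * For example, with an mlft_nents of 256 the layout ends up
		 * roughly as:
		 *
		 *   entry 0		broadcast (ff:ff:ff:ff:ff:ff)
		 *   entries 1-254	unicast/multicast MACs
		 *   entry 255		promisc catch-all
		 */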
1814 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1815 		list_insert_tail(&ft->mlft_groups, fg);
1816 		fg->mlfg_table = ft;
1817 		fg->mlfg_size = 1;
1818 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1819 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1820 			mutex_exit(&ft->mlft_mtx);
1821 			mutex_exit(&p->mlp_mtx);
1822 			goto err;
1823 		}
1824 		p->mlp_bcast = fg;
1825 		fe = list_head(&fg->mlfg_entries);
1826 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1827 		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
1828 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1829 
1830 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1831 		list_insert_tail(&ft->mlft_groups, fg);
1832 		fg->mlfg_table = ft;
1833 		fg->mlfg_size = ft->mlft_nents - 2;
1834 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1835 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1836 			mutex_exit(&ft->mlft_mtx);
1837 			mutex_exit(&p->mlp_mtx);
1838 			goto err;
1839 		}
1840 		p->mlp_umcast = fg;
1841 
1842 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1843 		list_insert_tail(&ft->mlft_groups, fg);
1844 		fg->mlfg_table = ft;
1845 		fg->mlfg_size = 1;
1846 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1847 			mutex_exit(&ft->mlft_mtx);
1848 			mutex_exit(&p->mlp_mtx);
1849 			goto err;
1850 		}
1851 		p->mlp_promisc = fg;
1852 		fe = list_head(&fg->mlfg_entries);
1853 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1854 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1855 
1856 		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
1857 		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
1858 		    mlfe_dmac_entry));
1859 
1860 		mutex_exit(&ft->mlft_mtx);
1861 		mutex_exit(&p->mlp_mtx);
1862 	}
1863 
1864 	return (B_TRUE);
1865 
1866 err:
1867 	mlxcx_teardown_ports(mlxp);
1868 	return (B_FALSE);
1869 }
1870 
1871 void
1872 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1873 {
1874 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1875 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1876 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1877 	mlxcx_flow_entry_t *fe;
1878 	mlxcx_group_vlan_t *v;
1879 
1880 	ASSERT(mutex_owned(&g->mlg_mtx));
1881 
1882 	mutex_enter(&ft->mlft_mtx);
1883 
1884 	if (!list_is_empty(&g->mlg_rx_vlans)) {
1885 		fe = list_head(&dfg->mlfg_entries);
1886 		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
1887 	}
1888 
1889 	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
1890 		fe = v->mlgv_fe;
1891 		ASSERT3P(fe->mlfe_table, ==, ft);
1892 		ASSERT3P(fe->mlfe_group, ==, fg);
1893 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
1894 
1895 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1896 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1897 	}
1898 
1899 	mutex_exit(&ft->mlft_mtx);
1900 }
1901 
1902 boolean_t
1903 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1904     boolean_t tagged, uint16_t vid)
1905 {
1906 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1907 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1908 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1909 	mlxcx_flow_entry_t *fe;
1910 	mlxcx_group_vlan_t *v;
1911 	boolean_t found = B_FALSE;
1912 
1913 	ASSERT(mutex_owned(&g->mlg_mtx));
1914 
1915 	mutex_enter(&ft->mlft_mtx);
1916 
1917 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
1918 	    v = list_next(&g->mlg_rx_vlans, v)) {
1919 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
1920 			found = B_TRUE;
1921 			break;
1922 		}
1923 	}
1924 	if (!found) {
1925 		mutex_exit(&ft->mlft_mtx);
1926 		return (B_FALSE);
1927 	}
1928 
1929 	list_remove(&g->mlg_rx_vlans, v);
1930 
1931 	/*
1932 	 * If this is the last VLAN entry, we have to go back to accepting
1933 	 * any VLAN (which means re-enabling the default entry).
1934 	 *
1935 	 * Do this before we remove the flow entry for the last specific
1936 	 * VLAN so that we don't lose any traffic in the transition.
1937 	 */
1938 	if (list_is_empty(&g->mlg_rx_vlans)) {
1939 		fe = list_head(&dfg->mlfg_entries);
1940 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1941 			list_insert_tail(&g->mlg_rx_vlans, v);
1942 			mutex_exit(&ft->mlft_mtx);
1943 			return (B_FALSE);
1944 		}
1945 	}
1946 
1947 	fe = v->mlgv_fe;
1948 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
1949 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
1950 	ASSERT3P(fe->mlfe_table, ==, ft);
1951 	ASSERT3P(fe->mlfe_group, ==, fg);
1952 
1953 	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
1954 		list_insert_tail(&g->mlg_rx_vlans, v);
1955 		fe = list_head(&dfg->mlfg_entries);
1956 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
1957 			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1958 		}
1959 		mutex_exit(&ft->mlft_mtx);
1960 		return (B_FALSE);
1961 	}
1962 
1963 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1964 
1965 	kmem_free(v, sizeof (mlxcx_group_vlan_t));
1966 
1967 	mutex_exit(&ft->mlft_mtx);
1968 	return (B_TRUE);
1969 }
1970 
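/*
 * Each ring group has its own VLAN flow table, containing a flow group of
 * entries matching specific (tagged, vid) pairs and a separate "default"
 * group whose single entry accepts any VLAN. Adding the first specific VLAN
 * removes the default entry so that only the listed VLANs get through;
 * removing the last specific VLAN re-installs it (see
 * mlxcx_remove_vlan_entry() above).
 */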
1971 boolean_t
1972 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
1973     uint16_t vid)
1974 {
1975 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1976 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1977 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1978 	mlxcx_flow_entry_t *fe;
1979 	mlxcx_group_vlan_t *v;
1980 	boolean_t found = B_FALSE;
1981 	boolean_t first = B_FALSE;
1982 
1983 	ASSERT(mutex_owned(&g->mlg_mtx));
1984 
1985 	mutex_enter(&ft->mlft_mtx);
1986 
1987 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
1988 	    v = list_next(&g->mlg_rx_vlans, v)) {
1989 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
1990 			mutex_exit(&ft->mlft_mtx);
1991 			return (B_TRUE);
1992 		}
1993 	}
1994 	if (list_is_empty(&g->mlg_rx_vlans))
1995 		first = B_TRUE;
1996 
1997 	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
1998 	    fe = list_next(&fg->mlfg_entries, fe)) {
1999 		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2000 			found = B_TRUE;
2001 			break;
2002 		}
2003 	}
2004 	if (!found) {
2005 		mutex_exit(&ft->mlft_mtx);
2006 		return (B_FALSE);
2007 	}
2008 
2009 	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
2010 	v->mlgv_fe = fe;
2011 	v->mlgv_tagged = tagged;
2012 	v->mlgv_vid = vid;
2013 
2014 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2015 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2016 	fe->mlfe_vid = vid;
2017 	if (tagged) {
2018 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
2019 	} else {
2020 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
2021 	}
2022 
2023 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2024 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2025 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2026 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
2027 		mutex_exit(&ft->mlft_mtx);
2028 		return (B_FALSE);
2029 	}
2030 
2031 	list_insert_tail(&g->mlg_rx_vlans, v);
2032 
2033 	/*
2034 	 * If the vlan list was empty for this group before adding this one,
2035 	 * then we no longer want the "default" entry to allow all VLANs
2036 	 * through.
2037 	 */
2038 	if (first) {
2039 		fe = list_head(&dfg->mlfg_entries);
2040 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2041 	}
2042 
2043 	mutex_exit(&ft->mlft_mtx);
2044 	return (B_TRUE);
2045 }
2046 
2047 void
2048 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
2049     mlxcx_ring_group_t *group)
2050 {
2051 	mlxcx_flow_entry_t *fe;
2052 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2053 	mlxcx_group_mac_t *gm, *ngm;
2054 
2055 	ASSERT(mutex_owned(&port->mlp_mtx));
2056 	ASSERT(mutex_owned(&group->mlg_mtx));
2057 
2058 	mutex_enter(&ft->mlft_mtx);
2059 
2060 	gm = avl_first(&group->mlg_rx_macs);
2061 	for (; gm != NULL; gm = ngm) {
2062 		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);
2063 
2064 		ASSERT3P(gm->mlgm_group, ==, group);
2065 		fe = gm->mlgm_fe;
2066 		ASSERT3P(fe->mlfe_table, ==, ft);
2067 
2068 		avl_remove(&group->mlg_rx_macs, gm);
2069 		list_remove(&fe->mlfe_ring_groups, gm);
2070 		kmem_free(gm, sizeof (mlxcx_group_mac_t));
2071 
2072 		fe->mlfe_ndest = 0;
2073 		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2074 		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2075 			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2076 			    gm->mlgm_group->mlg_rx_vlan_ft;
2077 		}
2078 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2079 
2080 		if (fe->mlfe_ndest > 0) {
2081 			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2082 			continue;
2083 		}
2084 
2085 		/*
2086 		 * There are no more ring groups left for this MAC (it wasn't
2087 		 * attached to any other groups since ndest == 0), so clean up
2088 		 * its flow entry.
2089 		 */
2090 		avl_remove(&port->mlp_dmac_fe, fe);
2091 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2092 		list_destroy(&fe->mlfe_ring_groups);
2093 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2094 	}
2095 
2096 	mutex_exit(&ft->mlft_mtx);
2097 }
2098 
2099 boolean_t
2100 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2101     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2102 {
2103 	mlxcx_flow_entry_t *fe;
2104 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2105 	mlxcx_group_mac_t *gm, probe;
2106 
2107 	ASSERT(mutex_owned(&port->mlp_mtx));
2108 	ASSERT(mutex_owned(&group->mlg_mtx));
2109 
2110 	bzero(&probe, sizeof (probe));
2111 	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));
2112 
2113 	mutex_enter(&ft->mlft_mtx);
2114 
2115 	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
2116 	if (gm == NULL) {
2117 		mutex_exit(&ft->mlft_mtx);
2118 		return (B_FALSE);
2119 	}
2120 	ASSERT3P(gm->mlgm_group, ==, group);
2121 	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));
2122 
2123 	fe = gm->mlgm_fe;
2124 	ASSERT3P(fe->mlfe_table, ==, ft);
2125 	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));
2126 
2127 	list_remove(&fe->mlfe_ring_groups, gm);
2128 	avl_remove(&group->mlg_rx_macs, gm);
2129 	kmem_free(gm, sizeof (mlxcx_group_mac_t));
2130 
2131 	fe->mlfe_ndest = 0;
2132 	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2133 	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2134 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2135 		    gm->mlgm_group->mlg_rx_vlan_ft;
2136 	}
2137 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2138 
2139 	if (fe->mlfe_ndest > 0) {
2140 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2141 			mutex_exit(&ft->mlft_mtx);
2142 			return (B_FALSE);
2143 		}
2144 		mutex_exit(&ft->mlft_mtx);
2145 		return (B_TRUE);
2146 	}
2147 
2148 	/*
2149 	 * There are no more ring groups left for this MAC (it wasn't attached
2150 	 * to any other groups since ndest == 0), so clean up its flow entry.
2151 	 */
2152 	avl_remove(&port->mlp_dmac_fe, fe);
2153 	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2154 	list_destroy(&fe->mlfe_ring_groups);
2155 
2156 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2157 
2158 	mutex_exit(&ft->mlft_mtx);
2159 
2160 	return (B_TRUE);
2161 }
2162 
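/*
 * Add a unicast/multicast MAC address on behalf of a ring group. Only one
 * flow entry exists per destination MAC on the port (tracked in the
 * mlp_dmac_fe tree); it forwards to the VLAN flow table of every ring group
 * that has programmed that MAC. So we either find the existing entry or
 * reserve a fresh one from the port's umcast flow group, and then append
 * this group's VLAN table as another destination.
 */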
2163 boolean_t
2164 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2165     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2166 {
2167 	mlxcx_flow_group_t *fg;
2168 	mlxcx_flow_entry_t *fe, probe;
2169 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2170 	mlxcx_group_mac_t *gm;
2171 	boolean_t found = B_FALSE;
2172 
2173 	ASSERT(mutex_owned(&port->mlp_mtx));
2174 	ASSERT(mutex_owned(&group->mlg_mtx));
2175 
2176 	bzero(&probe, sizeof (probe));
2177 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2178 
2179 	mutex_enter(&ft->mlft_mtx);
2180 
2181 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2182 
2183 	if (fe == NULL) {
2184 		fg = port->mlp_umcast;
2185 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2186 		    fe = list_next(&fg->mlfg_entries, fe)) {
2187 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2188 				found = B_TRUE;
2189 				break;
2190 			}
2191 		}
2192 		if (!found) {
2193 			mutex_exit(&ft->mlft_mtx);
2194 			return (B_FALSE);
2195 		}
2196 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2197 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2198 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2199 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2200 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2201 
2202 		avl_add(&port->mlp_dmac_fe, fe);
2203 	}
2204 
2205 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2206 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2207 
2208 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2209 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2210 		if (--fe->mlfe_ndest == 0) {
2211 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2212 		}
2213 		mutex_exit(&ft->mlft_mtx);
2214 		return (B_FALSE);
2215 	}
2216 
2217 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2218 	gm->mlgm_group = group;
2219 	gm->mlgm_fe = fe;
2220 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2221 	avl_add(&group->mlg_rx_macs, gm);
2222 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2223 
2224 	mutex_exit(&ft->mlft_mtx);
2225 
2226 	return (B_TRUE);
2227 }
2228 
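/*
 * Reserve a contiguous run of entries in a flow table for a flow group and
 * create the group in hardware. Groups are carved out of the table in
 * order, starting at mlft_next_ent; this fails if fewer than mlfg_size
 * entries remain unallocated.
 */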
2229 boolean_t
2230 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2231     mlxcx_flow_group_t *fg)
2232 {
2233 	mlxcx_flow_entry_t *fe;
2234 	uint_t i, idx;
2235 
2236 	ASSERT(mutex_owned(&ft->mlft_mtx));
2237 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2238 	ASSERT3P(fg->mlfg_table, ==, ft);
2239 
2240 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2241 		return (B_FALSE);
2242 	fg->mlfg_start_idx = ft->mlft_next_ent;
2243 
2244 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2245 		return (B_FALSE);
2246 	}
2247 
2248 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2249 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2250 	for (i = 0; i < fg->mlfg_size; ++i) {
2251 		idx = fg->mlfg_start_idx + i;
2252 		fe = &ft->mlft_ent[idx];
2253 		fe->mlfe_group = fg;
2254 		list_insert_tail(&fg->mlfg_entries, fe);
2255 	}
2256 	fg->mlfg_avail = fg->mlfg_size;
2257 	ft->mlft_next_ent += fg->mlfg_size;
2258 
2259 	return (B_TRUE);
2260 }
2261 
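/*
 * Event queue 0 is dedicated to control and async events (page requests,
 * port and module state changes, queue and internal errors, and so on), so
 * it is created and its interrupt enabled before any other queues exist.
 */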
2262 static boolean_t
2263 mlxcx_setup_eq0(mlxcx_t *mlxp)
2264 {
2265 	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[0];
2266 
2267 	mutex_enter(&mleq->mleq_mtx);
2268 	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2269 		/* mlxcx_teardown_eqs() will clean this up */
2270 		mutex_exit(&mleq->mleq_mtx);
2271 		return (B_FALSE);
2272 	}
2273 	mleq->mleq_mlx = mlxp;
2274 	mleq->mleq_uar = &mlxp->mlx_uar;
2275 	mleq->mleq_events =
2276 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2277 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2278 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2279 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2280 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2281 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2282 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2283 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2284 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2285 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2286 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2287 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2288 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST);
2289 	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2290 		/* mlxcx_teardown_eqs() will clean this up */
2291 		mutex_exit(&mleq->mleq_mtx);
2292 		return (B_FALSE);
2293 	}
2294 	if (ddi_intr_enable(mlxp->mlx_intr_handles[0]) != DDI_SUCCESS) {
2295 		/*
2296 		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
2297 		 * eq_rele_dma
2298 		 */
2299 		mutex_exit(&mleq->mleq_mtx);
2300 		return (B_FALSE);
2301 	}
2302 	mlxcx_arm_eq(mlxp, mleq);
2303 	mutex_exit(&mleq->mleq_mtx);
2304 	return (B_TRUE);
2305 }
2306 
2307 int
2308 mlxcx_cq_compare(const void *arg0, const void *arg1)
2309 {
2310 	const mlxcx_completion_queue_t *left = arg0;
2311 	const mlxcx_completion_queue_t *right = arg1;
2312 
2313 	if (left->mlcq_num < right->mlcq_num) {
2314 		return (-1);
2315 	}
2316 	if (left->mlcq_num > right->mlcq_num) {
2317 		return (1);
2318 	}
2319 	return (0);
2320 }
2321 
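/*
 * Set up the remaining event queues, one per interrupt vector beyond EQ 0,
 * which service completion events from CQs. If an interrupt moderation
 * period has been configured (mldp_intrmod_period_usec), it is applied to
 * each of these vectors before their interrupts are enabled.
 */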
2322 static boolean_t
2323 mlxcx_setup_eqs(mlxcx_t *mlxp)
2324 {
2325 	uint_t i;
2326 	mlxcx_event_queue_t *mleq;
2327 
2328 	ASSERT3S(mlxp->mlx_intr_count, >, 0);
2329 
2330 	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
2331 		mleq = &mlxp->mlx_eqs[i];
2332 		mutex_enter(&mleq->mleq_mtx);
2333 		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2334 			mutex_exit(&mleq->mleq_mtx);
2335 			return (B_FALSE);
2336 		}
2337 		mleq->mleq_uar = &mlxp->mlx_uar;
2338 		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2339 			/* mlxcx_teardown() will handle calling eq_rele_dma */
2340 			mutex_exit(&mleq->mleq_mtx);
2341 			return (B_FALSE);
2342 		}
2343 		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
2344 		    !mlxcx_cmd_set_int_mod(mlxp, i,
2345 		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
2346 			mutex_exit(&mleq->mleq_mtx);
2347 			return (B_FALSE);
2348 		}
2349 		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
2350 			mutex_exit(&mleq->mleq_mtx);
2351 			return (B_FALSE);
2352 		}
2353 		mlxcx_arm_eq(mlxp, mleq);
2354 		mutex_exit(&mleq->mleq_mtx);
2355 	}
2356 
2357 	mlxp->mlx_next_eq = 1;
2358 
2359 	return (B_TRUE);
2360 }
2361 
2362 /*
2363  * Snapshot all of the hardware capabilities that we care about and then modify
2364  * the HCA capabilities to get things moving.
2365  */
2366 static boolean_t
2367 mlxcx_init_caps(mlxcx_t *mlxp)
2368 {
2369 	mlxcx_caps_t *c;
2370 
2371 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2372 
2373 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2374 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2375 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2376 	}
2377 
2378 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2379 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2380 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2381 	}
2382 
2383 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2384 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2385 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2386 	}
2387 
2388 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2389 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2390 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2391 	}
2392 
2393 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2394 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2395 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2396 	}
2397 
2398 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2399 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2400 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2401 	}
2402 
2403 	/*
2404 	 * Check the caps meet our requirements.
2405 	 */
2406 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2407 
2408 	if (gen->mlcap_general_log_pg_sz != 12) {
2409 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2410 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2411 		goto err;
2412 	}
2413 	if (gen->mlcap_general_cqe_version != 1) {
2414 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2415 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2416 		goto err;
2417 	}
2418 	if (gen->mlcap_general_port_type !=
2419 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2420 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2421 		goto err;
2422 	}
2423 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2424 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2425 
2426 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2427 
2428 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2429 	    MLXCX_ETH_CAP_CSUM_CAP);
2430 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2431 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2432 
2433 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2434 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2435 	if (c->mlc_max_lso_size == 1) {
2436 		c->mlc_max_lso_size = 0;
2437 		c->mlc_lso = B_FALSE;
2438 	} else {
2439 		c->mlc_lso = B_TRUE;
2440 	}
2441 
2442 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2443 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2444 
2445 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2446 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2447 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2448 		goto err;
2449 	}
2450 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2451 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2452 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2453 		    "flow table entries");
2454 		goto err;
2455 	}
2456 
2457 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2458 	    mlcap_flow_prop_log_max_ft_size;
2459 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2460 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2461 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2462 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2463 
2464 	return (B_TRUE);
2465 
2466 err:
2467 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2468 	return (B_FALSE);
2469 }
2470 
2471 static int
2472 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2473 {
2474 	mlxcx_t *mlxp;
2475 
2476 	if (cmd != DDI_DETACH)
2477 		return (DDI_FAILURE);
2478 
2479 	mlxp = ddi_get_driver_private(dip);
2480 	if (mlxp == NULL) {
2481 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2482 		    "private data");
2483 		return (DDI_FAILURE);
2484 	}
2485 
2486 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2487 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2488 			return (DDI_FAILURE);
2489 		}
2490 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2491 	}
2492 
2493 	mlxcx_teardown(mlxp);
2494 	return (DDI_SUCCESS);
2495 }
2496 
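/*
 * Work out how many rx ring groups we can support. The number requested via
 * the driver properties is clamped by three hardware limits: the number of
 * TIRs available (each group consumes MLXCX_TIRS_PER_GROUP of them), the
 * maximum size of an RX flow table, and the total number of RX flow entries.
 * The final loop shrinks the count until the estimated flow usage (taken
 * here as 16 entries per group plus a couple of fixed entries) fits.
 */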
2497 static size_t
2498 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2499 {
2500 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2501 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2502 	size_t tirlim, flowlim, gflowlim;
2503 
2504 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2505 	if (tirlim < ngroups) {
2506 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2507 		    "on number of TIRs available", tirlim);
2508 		ngroups = tirlim;
2509 	}
2510 
2511 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2512 	if (flowlim < ngroups) {
2513 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2514 		    "on max size of RX flow tables", flowlim);
2515 		ngroups = flowlim;
2516 	}
2517 
2518 	do {
2519 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2520 		if (gflowlim < ngroups) {
2521 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2522 			    "based on max total RX flows", gflowlim);
2523 			--ngroups;
2524 		}
2525 	} while (gflowlim < ngroups);
2526 
2527 	return (ngroups);
2528 }
2529 
2530 static int
2531 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2532 {
2533 	mlxcx_t *mlxp;
2534 	uint_t i;
2535 	int inst, ret;
2536 
2537 	if (cmd != DDI_ATTACH)
2538 		return (DDI_FAILURE);
2539 
2540 	inst = ddi_get_instance(dip);
2541 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2542 	if (ret != 0)
2543 		return (ret);
2544 
2545 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2546 	if (mlxp == NULL)
2547 		return (DDI_FAILURE);
2548 	mlxp->mlx_dip = dip;
2549 	mlxp->mlx_inst = inst;
2550 	ddi_set_driver_private(dip, mlxp);
2551 
2552 	mlxcx_load_props(mlxp);
2553 
2554 	mlxcx_fm_init(mlxp);
2555 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2556 
2557 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2558 	    DDI_SUCCESS) {
2559 		mlxcx_warn(mlxp, "failed to initial PCI config space");
2560 		mlxcx_warn(mlxp, "failed to initialize PCI config space");
2561 	}
2562 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2563 
2564 	if (!mlxcx_regs_map(mlxp)) {
2565 		goto err;
2566 	}
2567 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2568 
2569 	if (!mlxcx_cmd_queue_init(mlxp)) {
2570 		goto err;
2571 	}
2572 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2573 
2574 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2575 		goto err;
2576 	}
2577 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2578 
2579 	if (!mlxcx_check_issi(mlxp)) {
2580 		goto err;
2581 	}
2582 
2583 	/*
2584 	 * We have to get our interrupts now so we know what priority to
2585 	 * create pagemtx with.
2586 	 */
2587 	if (!mlxcx_intr_setup(mlxp)) {
2588 		goto err;
2589 	}
2590 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2591 
2592 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2593 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2594 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2595 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2596 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2597 
2598 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2599 		goto err;
2600 	}
2601 
2602 	if (!mlxcx_init_caps(mlxp)) {
2603 		goto err;
2604 	}
2605 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2606 
2607 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2608 		goto err;
2609 	}
2610 
2611 	if (!mlxcx_cmd_init_hca(mlxp)) {
2612 		goto err;
2613 	}
2614 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2615 
2616 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2617 		goto err;
2618 	}
2619 
2620 	/*
2621 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2622 	 * doorbells.
2623 	 */
2624 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2625 		goto err;
2626 	}
2627 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2628 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2629 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2630 	}
2631 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2632 
2633 	/*
2634 	 * Set up event queue #0 -- it's special and only handles control
2635 	 * type events, like PAGE_REQUEST (which we will probably get during
2636 	 * the commands below).
2637 	 *
2638 	 * This will enable and arm the interrupt on EQ 0, too.
2639 	 */
2640 	if (!mlxcx_setup_eq0(mlxp)) {
2641 		goto err;
2642 	}
2643 
2644 	/*
2645 	 * Allocate a protection and transport domain. These don't really do
2646 	 * anything for us (they're IB concepts), but we need to supply their
2647 	 * ID numbers in other commands.
2648 	 */
2649 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2650 		goto err;
2651 	}
2652 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2653 		goto err;
2654 	}
2655 	/*
2656 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2657 	 * work queue entries, rather than having to mess with the NIC's
2658 	 * internal MMU.
2659 	 */
2660 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2661 		goto err;
2662 	}
2663 
2664 	/*
2665 	 * Query our port information and current state, populate the
2666 	 * mlxcx_port_t structs.
2667 	 *
2668 	 * This also sets up the root flow tables and flow groups.
2669 	 */
2670 	if (!mlxcx_setup_ports(mlxp)) {
2671 		goto err;
2672 	}
2673 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2674 
2675 	mlxcx_load_model_props(mlxp);
2676 
2677 	/*
2678 	 * Set up, enable and arm the rest of the interrupt EQs which will
2679 	 * service events from CQs.
2680 	 *
2681 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2682 	 * cleaned up.
2683 	 */
2684 	if (!mlxcx_setup_eqs(mlxp)) {
2685 		goto err;
2686 	}
2687 
2688 	/* Completion queues */
2689 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2690 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2691 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2692 
2693 	/* Work queues (send queues, receive queues) */
2694 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2695 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2696 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2697 
2698 	/* Set up periodic fault check timers which check the queue states */
2699 	if (!mlxcx_setup_checktimers(mlxp)) {
2700 		goto err;
2701 	}
2702 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2703 
2704 	/*
2705 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2706 	 * "groups" we advertise to MAC.
2707 	 */
2708 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2709 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2710 	    sizeof (mlxcx_ring_group_t);
2711 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2712 
2713 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2714 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2715 	    sizeof (mlxcx_ring_group_t);
2716 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2717 
2718 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2719 
2720 	/*
2721 	 * Set up the free/busy buffer lists for keeping track of packet
2722 	 * buffers.
2723 	 */
2724 	if (!mlxcx_setup_bufs(mlxp))
2725 		goto err;
2726 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2727 
2728 	/*
2729 	 * Before we tell MAC about our rings/groups, we need to do enough
2730 	 * setup on them to be sure about the numbers and configuration that
2731 	 * we have. This will do basically everything short of allocating
2732 	 * packet buffers and starting the rings up.
2733 	 */
2734 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2735 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2736 			goto err;
2737 	}
2738 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2739 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2740 			goto err;
2741 	}
2742 
2743 	/*
2744 	 * Finally, tell MAC that we exist!
2745 	 */
2746 	if (!mlxcx_register_mac(mlxp)) {
2747 		goto err;
2748 	}
2749 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
2750 
2751 	return (DDI_SUCCESS);
2752 
2753 err:
2754 	mlxcx_teardown(mlxp);
2755 	return (DDI_FAILURE);
2756 }
2757 
2758 static struct cb_ops mlxcx_cb_ops = {
2759 	.cb_open = nulldev,
2760 	.cb_close = nulldev,
2761 	.cb_strategy = nodev,
2762 	.cb_print = nodev,
2763 	.cb_dump = nodev,
2764 	.cb_read = nodev,
2765 	.cb_write = nodev,
2766 	.cb_ioctl = nodev,
2767 	.cb_devmap = nodev,
2768 	.cb_mmap = nodev,
2769 	.cb_segmap = nodev,
2770 	.cb_chpoll = nochpoll,
2771 	.cb_prop_op = ddi_prop_op,
2772 	.cb_flag = D_MP,
2773 	.cb_rev = CB_REV,
2774 	.cb_aread = nodev,
2775 	.cb_awrite = nodev
2776 };
2777 
2778 static struct dev_ops mlxcx_dev_ops = {
2779 	.devo_rev = DEVO_REV,
2780 	.devo_refcnt = 0,
2781 	.devo_getinfo = NULL,
2782 	.devo_identify = nulldev,
2783 	.devo_probe = nulldev,
2784 	.devo_attach = mlxcx_attach,
2785 	.devo_detach = mlxcx_detach,
2786 	.devo_reset = nodev,
2787 	.devo_power = ddi_power,
2788 	.devo_quiesce = ddi_quiesce_not_supported,
2789 	.devo_cb_ops = &mlxcx_cb_ops
2790 };
2791 
2792 static struct modldrv mlxcx_modldrv = {
2793 	.drv_modops = &mod_driverops,
2794 	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
2795 	.drv_dev_ops = &mlxcx_dev_ops
2796 };
2797 
2798 static struct modlinkage mlxcx_modlinkage = {
2799 	.ml_rev = MODREV_1,
2800 	.ml_linkage = { &mlxcx_modldrv, NULL }
2801 };
2802 
2803 int
2804 _init(void)
2805 {
2806 	int ret;
2807 
2808 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
2809 	if (ret != 0) {
2810 		return (ret);
2811 	}
2812 
2813 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
2814 
2815 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2816 		mac_fini_ops(&mlxcx_dev_ops);
2817 		ddi_soft_state_fini(&mlxcx_softstate);
2818 		return (ret);
2819 	}
2820 
2821 	return (DDI_SUCCESS);
2822 }
2823 
2824 int
2825 _info(struct modinfo *modinfop)
2826 {
2827 	return (mod_info(&mlxcx_modlinkage, modinfop));
2828 }
2829 
2830 int
2831 _fini(void)
2832 {
2833 	int ret;
2834 
2835 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2836 		return (ret);
2837 	}
2838 
2839 	mac_fini_ops(&mlxcx_dev_ops);
2840 
2841 	ddi_soft_state_fini(&mlxcx_softstate);
2842 
2843 	return (DDI_SUCCESS);
2844 }
2845