xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision 5014e1fa)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2023, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2023 RackTop Systems, Inc.
16  * Copyright 2023 MNX Cloud, Inc.
17  */
18 
19 /*
20  * Mellanox Connect-X 4/5/6 driver.
21  */
22 
23 /*
24  * The PRM for this family of parts was freely available at:
25  *
26  * https://www.mellanox.com/related-docs/user_manuals/ \
27  *   Ethernet_Adapters_Programming_Manual.pdf
28  *
29  * but has since disappeared.
30  */
31 /*
32  * ConnectX glossary
33  * -----------------
34  *
35  * WR		Work Request: something we've asked the hardware to do by
36  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
37  *
38  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
39  *
40  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
41  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
42  *		types have different WQE structures, different commands for
43  *		creating and destroying them, etc, but share a common context
44  *		structure, counter setup and state graph.
45  * SQ		Send Queue, a specific type of WQ that sends packets
46  * RQ		Receive Queue, a specific type of WQ that receives packets
47  *
48  * CQ		Completion Queue: completion of WRs from a WQ are reported to
49  *		one of these, as a CQE on its entry ring.
50  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
51  *		info, as well as packet size, the ID of the WQ, and the index
52  *		of the WQE which completed. Does not contain any packet data.
53  *
54  * EQ		Event Queue: a ring of event structs from the hardware informing
55  *		us when particular events happen. Many events can point at
56  *		a particular CQ which we should then go look at.
57  * EQE		Event Queue Entry: an entry on the EQ ring
58  *
59  * UAR		User Access Region, a page of the device's PCI BAR which is
60  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
61  *		ring to arm them for interrupts or wake them up for new work
62  *
63  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
64  *		as a single unit (for e.g. hashing/RSS).
65  *
66  * TIR		Transport Interface Receive, a bucket of resources for the
67  *		reception of packets. TIRs have to point at either a single RQ
68  *		or a table of RQs (RQT). They then serve as a target for flow
69  *		table entries (FEs). TIRs that point at an RQT also contain the
70  *		settings for hashing for RSS.
71  *
72  * TIS		Transport Interface Send, a bucket of resources associated with
73  *		the transmission of packets. In particular, the temporary
74  *		resources used for LSO internally in the card are accounted to
75  *		a TIS.
76  *
77  * FT		Flow Table, a collection of FEs and FGs that can be referred to
78  *		as a single entity (e.g. used as a target from another flow
79  *		entry or set as the "root" table to handle incoming or outgoing
80  *		packets). Packets arriving at a FT are matched against the
81  *		FEs in the table until either one matches with a terminating
82  *		action or all FEs are exhausted (it's first-match-wins but with
83  *		some actions that are non-terminal, like counting actions).
84  *
85  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
86  *		they match on the same attributes of packets coming into the
87  *		flow).
88  *
89  * FE		Flow Entry, an individual set of values to match against
90  *		packets entering the flow table, combined with an action to
91  *		take upon a successful match. The action we use most is
92  *		"forward", which sends the packets to a TIR or another flow
93  *		table and then stops further processing within the FE's FT.
94  *
95  * lkey/mkey	A reference to something similar to a page table but in the
96  *		device's internal onboard MMU. Since Connect-X parts double as
97  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
98  *		features which we try very hard not to use. For our WQEs we use
99  *		the "reserved" lkey, which is a special value which indicates
100  *		that addresses we give are linear addresses and should not be
101  *		translated.
102  *
103  * PD		Protection Domain, an IB concept. We have to allocate one to
104  *		provide as a parameter for new WQs, but we don't do anything
105  *		with it.
106  *
107  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
108  *		provide it as a parameter to TIR/TIS creation, but we don't do
109  *		anything with it.
110  */
111 /*
112  *
113  * Data flow overview
114  * ------------------
115  *
116  * This driver is a MAC ring-enabled driver which maps rings to send and recv
117  * queues in hardware on the device.
118  *
119  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
120  * sufficient space, and simplify the logic needed to work out which buffer
121  * was completed.
122  *
123  * The CQs are then round-robin allocated onto EQs, of which we set up one per
124  * interrupt that the system gives us for the device. Normally this means we
125  * have 8 EQs.
126  *
127  * When we have >= 8 EQs available, we try to allocate only RX or only TX
128  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
129  *
130  * EQ #0 is reserved for all event types other than completion events, and has
131  * no CQs associated with it at any time. EQs #1 and upwards are only used for
132  * handling CQ completion events.
133  *
134  * +------+     +------+           +------+        +---------+
135  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
136  * +------+     +------+     |     +------+        +---------+
137  *                           |
138  * +------+     +------+     |
139  * | SQ 1 |---->| CQ 1 |---+ |     +------+
140  * +------+     +------+   | +---> |      |
141  *                         |       |      |
142  * +------+     +------+   |       | EQ 1 |        +---------+
143  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
144  * +------+     +------+   | +---> |      |        +---------+
145  *                         | |     +------+
146  *                         | |
147  *   ...                   | |
148  *                         | |     +------+
149  * +------+     +------+   +-----> |      |
150  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
151  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
152  *                           |     |      |        +---------+
153  * +------+     +------+     | +-> |      |
154  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
155  * +------+     +------+       |
156  *                             |     ....
157  * +------+     +------+       |
158  * | RQ 2 |---->| CQ 5 |-------+
159  * +------+     +------+
160  *
161  *   ... (note this diagram does not show RX-only or TX-only EQs)
162  *
163  * For TX, we advertise all of the SQs we create as plain rings to MAC with
164  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
165  * and use the rings as it sees fit.
166  *
167  * For RX, we advertise actual groups in order to make use of hardware
168  * classification.
169  *
170  * The hardware classification we use is based around Flow Tables, and we
171  * currently ignore all of the eswitch features of the card. The NIC VPORT
172  * is always set to promisc mode so that the eswitch sends us all of the
173  * traffic that arrives on the NIC, and we use flow entries to manage
174  * everything.
175  *
176  * We use 2 layers of flow tables for classification: traffic arrives at the
177  * root RX flow table which contains MAC address filters. Those then send
178  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
179  * presence and VID filters.
180  *
181  * Since these parts only support doing RSS hashing on a single protocol at a
182  * time, we have to use a third layer of flow tables as well to break traffic
183  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
184  * so that it can be sent to the appropriate TIR for hashing.
185  *
186  * Incoming packets
187  *        +           +---------+      +---------+
188  *        |        +->| group 0 |      | group 0 |
189  *        |        |  | vlan ft |  +-->| hash ft |
190  *        v        |  |   L1    |  |   |   L2    |
191  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
192  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
193  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
194  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
195  *        |        |  |         |  |   +---------+    +-----+    |     +------+
196  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
197  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
198  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
199  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
200  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
201  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
202  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
203  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
204  *   +---------+   |               ^   |  other  |-+
205  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
206  *   +---------+                   |               +->| TIR |--->| RQ0 |
207  *   |  MAC 1  |-+                 |                  +-----+    +-----+
208  *   +---------+ | +---------------+
209  *   |  MAC 2  |-+ |               ^
210  *   +---------+ | |               |
211  *   |  MAC 3  |-+ |  +---------+  |   +---------+
212  *   +---------+ | |  | group 1 |  |   | group 1 |
213  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
214  *   |         |   |  |   L1    |  | | |   L2    |
215  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
216  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
217  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
218  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
219  *                    |         |  |   +---------+    +-----+    |     +------+
220  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
221  *                    |         |  |   +---------+    +-----+    | RQT +------+
222  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
223  *                    |         |  |   +---------+    +-----+    |     |      |
224  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
225  *                    | promisc |--+   +---------+    +-----+    |     |      |
226  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
227  *                                     +---------+    +-----+    +-----+------+
228  *                                     |  other  |-+
229  *                                     +---------+ |
230  *                      .......                    |  +-----+    +-----+
231  *                                                 +->| TIR |--->| RQ3 |
232  *                                                    +-----+    +-----+
233  *
234  * Note that the "promisc" flow entries are only set/enabled when promisc
235  * mode is enabled for the NIC. All promisc flow entries point directly at
236  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
237  * the "default group" in MAC).
238  *
239  * The "default" entry in the L1 VLAN filter flow tables is used when there
240  * are no VLANs set for the group, to accept any traffic regardless of tag. It
241  * is deleted as soon as a VLAN filter is added (and re-instated if the
242  * last VLAN filter is removed).
243  *
244  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
245  * space for packet data (they're a collection of scatter pointers only). TX
246  * descriptors contain some space for "inline headers" (and the card requires
247  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
248  * but all the rest of the data comes from the gather pointers.
249  *
250  * When we get completions back they simply contain the ring index number of
251  * the WR (work request) which completed. So, we manage the buffers for actual
252  * packet data completely independently of the descriptors in this driver. When
253  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
254  * with the WQE index that we put it at, and therefore don't have to look at
255  * the original descriptor at all when handling completions.
256  *
257  * For RX, we create sufficient packet data buffers to fill 150% of the
258  * available descriptors for each ring. These all are pre-set-up for DMA and
259  * have an mblk_t associated with them (with desballoc()).
260  *
261  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
262  * large enough), or we copy it into a pre-allocated buffer set up in the same
263  * way as for RX.
264  */
265 
266 /*
267  * Buffer lifecycle: RX
268  * --------------------
269  *
270  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
271  * straightforward.
272  *
273  * It is created (and has all its memory allocated) at the time of starting up
274  * the RX ring it belongs to. Then it is placed on the "free" list in the
275  * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
276  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
277  * before making a WQE for it.
278  *
279  * After a completion event occurs, the packet is either discarded (and the
280  * buffer_t returned to the free list), or it is readied for loaning to MAC
281  * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
282  *
283  * Once MAC and the rest of the system have finished with the packet, they call
284  * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
285  * the fate of the buffer_t is determined by the state of the
286  * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
287  * will be returned to the free list, potentially to be recycled and used
288  * again. But if the shard is draining (e.g. after a ring stop) there will be
289  * no recycling and the buffer_t is immediately destroyed.
290  *
291  * At detach/teardown time, buffers are only ever destroyed from the free list.
292  *
293  *
294  *                         +
295  *                         |
296  *                         | mlxcx_buf_create
297  *                         |
298  *                         v
299  *                    +----+----+
300  *                    | created |
301  *                    +----+----+                        +------+
302  *                         |                             | dead |
303  *                         |                             +------+
304  *                         | mlxcx_buf_return                ^
305  *                         |                                 |
306  *                         v                                 | mlxcx_buf_destroy
307  * mlxcx_buf_destroy  +----+----+          +-----------+     |
308  *          +---------|  free   |<------no-| draining? |-yes-+
309  *          |         +----+----+          +-----------+
310  *          |              |                     ^
311  *          |              |                     |
312  *          v              | mlxcx_buf_take      | mlxcx_buf_return
313  *      +---+--+           v                     |
314  *      | dead |       +---+---+                 |
315  *      +------+       | on WQ |- - - - - - - - >O
316  *                     +---+---+                 ^
317  *                         |                     |
318  *                         |                     |
319  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
320  *                         v                     |
321  *                 +-------+--------+            |
322  *                 | on loan to MAC |----------->O
323  *                 +----------------+  freemsg()
324  *
325  */
326 
327 /*
328  * Buffer lifecycle: TX
329  * --------------------
330  *
331  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
332  * "foreign" buffers.
333  *
334  * The former have their memory allocated and DMA bound by this driver, while
335  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
336  * not owned by us, though we do DMA bind it (and take responsibility for
337  * un-binding it when we're done with them).
338  *
339  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
340  * SQ. Thus, there is a separate free list and mutex for each kind.
341  *
342  * Since a TX packet might consist of multiple mblks, we translate each mblk
343  * into exactly one buffer_t. The buffer_ts are chained together in the same
344  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
345  *
346  * Each chain of TX buffers may consist of foreign or driver buffers, in any
347  * mixture.
348  *
349  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
350  * it from the rest of the chain buffers.
351  *
352  * TX buffer chains are always returned to the free list by
353  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
354  * freeing all of the members.
355  *
356  * We only call freemsg() once, on the head of the TX buffer chain's original
357  * mblk. This is true whether we copied it or bound it in a foreign buffer.
358  */
359 
360 /*
361  * Startup and command interface
362  * -----------------------------
363  *
364  * The command interface is the primary way in which we give control orders to
365  * the hardware (e.g. actions like "create this queue" or "delete this flow
366  * entry"). The command interface is never used to transmit or receive packets
367  * -- that takes place only on the queues that are set up through it.
368  *
369  * In mlxcx_cmd.c we implement our use of the command interface on top of a
370  * simple taskq. As commands are submitted from the taskq they choose a
371  * "slot", if there are no free slots then execution of the command will
372  * be paused until one is free. The hardware permits up to 32 independent
373  * slots for concurrent command execution.
374  *
375  * Before interrupts are enabled, command completion is polled; once
376  * interrupts are up, command completions become asynchronous and are
377  * wired to EQ 0. A caveat to this is commands can not be submitted
378  * directly from EQ 0's completion handler, and any processing resulting from
379  * an asynchronous event which requires further use of the command interface
380  * is posted through a taskq.
381  *
382  * The startup/attach process for this card involves a bunch of different steps
383  * which are summarised pretty well in the PRM. We have to send a number of
384  * commands which do different things to start the card up, give it some pages
385  * of our own memory for it to use, then start creating all the entities that
386  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
387  * and TDoms.
388  */
389 
390 /*
391  * UARs
392  * ----
393  *
394  * The pages of the PCI BAR other than the first few are reserved for use as
395  * "UAR" sections in this device. Each UAR section can be used as a set of
396  * doorbells for our queues.
397  *
398  * Currently we just make one single UAR for all of our queues. It doesn't
399  * seem to be a major limitation yet.
400  *
401  * When we're sending packets through an SQ, the PRM is not awfully clear about
402  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
403  * (it's clear on the pattern of alternation you're expected to use between
404  * even and odd for Blueflame sends, but not for regular doorbells).
405  *
406  * Currently we don't do the even-odd alternating pattern for ordinary
407  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
408  * least on Connect-X4 Lx.
409  */
410 
411 /*
412  * Lock ordering
413  * -------------
414  *
415  * Interrupt side:
416  *
417  *  - mleq_mtx
418  *    - mlcq_arm_mtx
419  *      - mlcq_mtx
420  *        - mlcq_bufbmtx
421  *        - mlwq_mtx
422  *          - mlbs_mtx
423  *    - mlp_mtx
424  *
425  * GLD side:
426  *
427  *  - mlp_mtx
428  *    - mlg_mtx
429  *      - mlg_*.mlft_mtx
430  *    - mlp_*.mlft_mtx
431  *    - mlwq_mtx
432  *      - mlbs_mtx
433  *      - mlcq_bufbmtx
434  *  - mleq_mtx
435  *    - mlcq_arm_mtx
436  *      - mlcq_mtx
437  *
438  */
439 
440 #include <sys/modctl.h>
441 #include <sys/conf.h>
442 #include <sys/devops.h>
443 #include <sys/sysmacros.h>
444 #include <sys/time.h>
445 #include <sys/pci.h>
446 #include <sys/mac_provider.h>
447 
448 #include <mlxcx.h>
449 
450 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
451 
452 #define	MLXCX_MODULE_NAME	"mlxcx"
453 /*
454  * We give this to the firmware, so it has to be in a fixed format that it
455  * understands.
456  */
457 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
458 
459 /*
460  * Firmware may take a while to reclaim pages. Try a set number of times.
461  */
462 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
463 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
464 
465 static void *mlxcx_softstate;
466 
467 /*
468  * Fault detection thresholds.
469  */
470 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
471 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
472 
473 static void
mlxcx_load_prop_defaults(mlxcx_t * mlxp)474 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
475 {
476 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
477 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
478 
479 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
480 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
481 
482 	/*
483 	 * Currently we have different queue size defaults for two
484 	 * categories of queues. One set for devices which support a
485 	 * maximum speed of 10Gb/s, and another for those above that.
486 	 */
487 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
488 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 ||
489 	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G |
490 	    MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G |
491 	    MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) {
492 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
493 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
494 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
495 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
496 	    MLXCX_PROTO_10G)) != 0 ||
497 	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M |
498 	    MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) {
499 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
500 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
501 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
502 	} else {
503 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
504 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
505 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
506 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
507 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
508 	}
509 }
510 
511 /*
512  * Properties which may have different defaults based on hardware
513  * characteristics.
514  */
515 static void
mlxcx_load_model_props(mlxcx_t * mlxp)516 mlxcx_load_model_props(mlxcx_t *mlxp)
517 {
518 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
519 
520 	mlxcx_load_prop_defaults(mlxp);
521 
522 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
523 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
524 	    p->mldp_cq_size_shift_default);
525 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
526 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
527 	    p->mldp_sq_size_shift_default);
528 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
529 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
530 	    p->mldp_rq_size_shift_default);
531 }
532 
533 static void
mlxcx_load_props(mlxcx_t * mlxp)534 mlxcx_load_props(mlxcx_t *mlxp)
535 {
536 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
537 
538 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
539 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
540 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
541 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
542 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
543 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
544 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
545 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
546 	    MLXCX_CQEMOD_COUNT_DFLT);
547 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
548 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
549 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
550 
551 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
552 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
553 	    MLXCX_TX_NGROUPS_DFLT);
554 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
555 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
556 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
557 
558 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
559 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
560 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
561 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
562 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
563 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
564 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
565 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
566 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
567 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
568 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
569 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
570 
571 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
572 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
573 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
574 
575 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
576 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
577 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
578 
579 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
580 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
581 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
582 
583 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
584 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
585 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
586 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
587 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
588 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
589 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
590 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
591 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
592 
593 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
594 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
595 	    MLXCX_RX_PER_CQ_DEFAULT);
596 
597 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
598 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
599 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
600 		    "out of range. Defaulting to: %d. Valid values are from "
601 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
602 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
603 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
604 	}
605 
606 	p->mldp_rx_p50_loan_min_size = ddi_getprop(DDI_DEV_T_ANY,
607 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
608 	    "rx_p50_loan_min_size", MLXCX_P50_LOAN_MIN_SIZE_DFLT);
609 }
610 
611 void
mlxcx_note(mlxcx_t * mlxp,const char * fmt,...)612 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
613 {
614 	va_list ap;
615 
616 	va_start(ap, fmt);
617 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
618 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
619 	} else {
620 		vcmn_err(CE_NOTE, fmt, ap);
621 	}
622 	va_end(ap);
623 }
624 
625 void
mlxcx_warn(mlxcx_t * mlxp,const char * fmt,...)626 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
627 {
628 	va_list ap;
629 
630 	va_start(ap, fmt);
631 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
632 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
633 	} else {
634 		vcmn_err(CE_WARN, fmt, ap);
635 	}
636 	va_end(ap);
637 }
638 
639 void
mlxcx_panic(mlxcx_t * mlxp,const char * fmt,...)640 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
641 {
642 	va_list ap;
643 
644 	va_start(ap, fmt);
645 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
646 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
647 	} else {
648 		vcmn_err(CE_PANIC, fmt, ap);
649 	}
650 	va_end(ap);
651 }
652 
653 uint16_t
mlxcx_get16(mlxcx_t * mlxp,uintptr_t off)654 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
655 {
656 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
657 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
658 }
659 
660 uint32_t
mlxcx_get32(mlxcx_t * mlxp,uintptr_t off)661 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
662 {
663 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
664 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
665 }
666 
667 uint64_t
mlxcx_get64(mlxcx_t * mlxp,uintptr_t off)668 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
669 {
670 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
671 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
672 }
673 
674 void
mlxcx_put32(mlxcx_t * mlxp,uintptr_t off,uint32_t val)675 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
676 {
677 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
678 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
679 }
680 
681 void
mlxcx_put64(mlxcx_t * mlxp,uintptr_t off,uint64_t val)682 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
683 {
684 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
685 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
686 }
687 
688 void
mlxcx_uar_put32(mlxcx_t * mlxp,mlxcx_uar_t * mlu,uintptr_t off,uint32_t val)689 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
690 {
691 	/*
692 	 * The UAR is always inside the first BAR, which we mapped as
693 	 * mlx_regs
694 	 */
695 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
696 	    (uintptr_t)mlxp->mlx_regs_base;
697 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
698 }
699 
700 void
mlxcx_uar_put64(mlxcx_t * mlxp,mlxcx_uar_t * mlu,uintptr_t off,uint64_t val)701 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
702 {
703 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
704 	    (uintptr_t)mlxp->mlx_regs_base;
705 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
706 }
707 
708 static void
mlxcx_fm_fini(mlxcx_t * mlxp)709 mlxcx_fm_fini(mlxcx_t *mlxp)
710 {
711 	if (mlxp->mlx_fm_caps == 0)
712 		return;
713 
714 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
715 		ddi_fm_handler_unregister(mlxp->mlx_dip);
716 
717 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
718 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
719 		pci_ereport_teardown(mlxp->mlx_dip);
720 
721 	ddi_fm_fini(mlxp->mlx_dip);
722 
723 	mlxp->mlx_fm_caps = 0;
724 }
725 
726 void
mlxcx_fm_ereport(mlxcx_t * mlxp,const char * detail)727 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
728 {
729 	uint64_t ena;
730 	char buf[FM_MAX_CLASS];
731 
732 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
733 		return;
734 
735 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
736 	ena = fm_ena_generate(0, FM_ENA_FMT1);
737 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
738 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
739 	    NULL);
740 }
741 
742 static int
mlxcx_fm_errcb(dev_info_t * dip,ddi_fm_error_t * err,const void * arg)743 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
744 {
745 	/*
746 	 * as the driver can always deal with an error in any dma or
747 	 * access handle, we can just return the fme_status value.
748 	 */
749 	pci_ereport_post(dip, err, NULL);
750 	return (err->fme_status);
751 }
752 
753 static void
mlxcx_fm_init(mlxcx_t * mlxp)754 mlxcx_fm_init(mlxcx_t *mlxp)
755 {
756 	ddi_iblock_cookie_t iblk;
757 	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
758 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;
759 
760 	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
761 	    DDI_PROP_DONTPASS, "fm_capable", def);
762 
763 	if (mlxp->mlx_fm_caps < 0) {
764 		mlxp->mlx_fm_caps = 0;
765 	}
766 	mlxp->mlx_fm_caps &= def;
767 
768 	if (mlxp->mlx_fm_caps == 0)
769 		return;
770 
771 	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
772 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
773 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
774 		pci_ereport_setup(mlxp->mlx_dip);
775 	}
776 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
777 		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
778 		    (void *)mlxp);
779 	}
780 }
781 
782 static void
mlxcx_mlbs_teardown(mlxcx_t * mlxp,mlxcx_buf_shard_t * s)783 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
784 {
785 	mlxcx_buffer_t *buf;
786 
787 	mutex_enter(&s->mlbs_mtx);
788 
789 	while (!list_is_empty(&s->mlbs_busy))
790 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
791 
792 	while (!list_is_empty(&s->mlbs_loaned))
793 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
794 
795 	while ((buf = list_head(&s->mlbs_free)) != NULL)
796 		mlxcx_buf_destroy(mlxp, buf);
797 
798 	list_destroy(&s->mlbs_free);
799 	list_destroy(&s->mlbs_busy);
800 	list_destroy(&s->mlbs_loaned);
801 	mutex_exit(&s->mlbs_mtx);
802 
803 	cv_destroy(&s->mlbs_free_nonempty);
804 	mutex_destroy(&s->mlbs_mtx);
805 }
806 
807 static void
mlxcx_teardown_bufs(mlxcx_t * mlxp)808 mlxcx_teardown_bufs(mlxcx_t *mlxp)
809 {
810 	mlxcx_buf_shard_t *s;
811 
812 	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
813 		mlxcx_mlbs_teardown(mlxp, s);
814 		kmem_free(s, sizeof (mlxcx_buf_shard_t));
815 	}
816 	list_destroy(&mlxp->mlx_buf_shards);
817 
818 	kmem_cache_destroy(mlxp->mlx_bufs_cache);
819 }
820 
821 static void
mlxcx_teardown_pages(mlxcx_t * mlxp)822 mlxcx_teardown_pages(mlxcx_t *mlxp)
823 {
824 	uint_t nzeros = 0;
825 	uint64_t *pas;
826 
827 	pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
828 	    KM_SLEEP);
829 
830 	mutex_enter(&mlxp->mlx_pagemtx);
831 
832 	while (mlxp->mlx_npages > 0) {
833 		int32_t req, ret;
834 
835 		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
836 		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
837 
838 		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
839 			mlxcx_warn(mlxp, "hardware refused to return pages, "
840 			    "leaking %u remaining pages", mlxp->mlx_npages);
841 			goto out;
842 		}
843 
844 		for (int32_t i = 0; i < ret; i++) {
845 			mlxcx_dev_page_t *mdp, probe;
846 			bzero(&probe, sizeof (probe));
847 			probe.mxdp_pa = pas[i];
848 
849 			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
850 
851 			if (mdp != NULL) {
852 				avl_remove(&mlxp->mlx_pages, mdp);
853 				mlxp->mlx_npages--;
854 				mlxcx_dma_free(&mdp->mxdp_dma);
855 				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
856 			} else {
857 				mlxcx_panic(mlxp, "hardware returned a page "
858 				    "with PA 0x%" PRIx64 " but we have no "
859 				    "record of giving out such a page", pas[i]);
860 			}
861 		}
862 
863 		/*
864 		 * If no pages were returned, note that fact.
865 		 */
866 		if (ret == 0) {
867 			nzeros++;
868 			if (nzeros > mlxcx_reclaim_tries) {
869 				mlxcx_warn(mlxp, "hardware refused to return "
870 				    "pages, leaking %u remaining pages",
871 				    mlxp->mlx_npages);
872 				goto out;
873 			}
874 			delay(drv_usectohz(mlxcx_reclaim_delay));
875 		}
876 	}
877 
878 	avl_destroy(&mlxp->mlx_pages);
879 
880 out:
881 	mutex_exit(&mlxp->mlx_pagemtx);
882 	mutex_destroy(&mlxp->mlx_pagemtx);
883 
884 	kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
885 }
886 
887 static boolean_t
mlxcx_eq_alloc_dma(mlxcx_t * mlxp,mlxcx_event_queue_t * mleq)888 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
889 {
890 	ddi_device_acc_attr_t acc;
891 	ddi_dma_attr_t attr;
892 	boolean_t ret;
893 	size_t sz, i;
894 
895 	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
896 
897 	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
898 	mleq->mleq_nents = (1 << mleq->mleq_entshift);
899 	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
900 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
901 
902 	mlxcx_dma_acc_attr(mlxp, &acc);
903 	mlxcx_dma_queue_attr(mlxp, &attr);
904 
905 	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
906 	    B_TRUE, sz, B_TRUE);
907 	if (!ret) {
908 		mlxcx_warn(mlxp, "failed to allocate EQ memory");
909 		return (B_FALSE);
910 	}
911 
912 	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;
913 
914 	for (i = 0; i < mleq->mleq_nents; ++i)
915 		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;
916 
917 	mleq->mleq_state |= MLXCX_EQ_ALLOC;
918 
919 	return (B_TRUE);
920 }
921 
922 static void
mlxcx_eq_rele_dma(mlxcx_t * mlxp,mlxcx_event_queue_t * mleq)923 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
924 {
925 	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
926 	if (mleq->mleq_state & MLXCX_EQ_CREATED)
927 		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
928 
929 	mlxcx_dma_free(&mleq->mleq_dma);
930 	mleq->mleq_ent = NULL;
931 
932 	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
933 }
934 
935 void
mlxcx_teardown_flow_table(mlxcx_t * mlxp,mlxcx_flow_table_t * ft)936 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
937 {
938 	mlxcx_flow_group_t *fg;
939 	mlxcx_flow_entry_t *fe;
940 	int i;
941 
942 	ASSERT(mutex_owned(&ft->mlft_mtx));
943 
944 	for (i = ft->mlft_nents - 1; i >= 0; --i) {
945 		fe = &ft->mlft_ent[i];
946 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
947 			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
948 				mlxcx_panic(mlxp, "failed to delete flow "
949 				    "entry %u on table %u", i,
950 				    ft->mlft_num);
951 			}
952 		}
953 	}
954 
955 	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
956 		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
957 		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
958 			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
959 				mlxcx_panic(mlxp, "failed to destroy flow "
960 				    "group %u", fg->mlfg_num);
961 			}
962 		}
963 		kmem_free(fg, sizeof (mlxcx_flow_group_t));
964 	}
965 	list_destroy(&ft->mlft_groups);
966 	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
967 	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
968 		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
969 			mlxcx_panic(mlxp, "failed to destroy flow table %u",
970 			    ft->mlft_num);
971 		}
972 	}
973 	kmem_free(ft->mlft_ent, ft->mlft_entsize);
974 	ft->mlft_ent = NULL;
975 	mutex_exit(&ft->mlft_mtx);
976 	mutex_destroy(&ft->mlft_mtx);
977 	kmem_free(ft, sizeof (mlxcx_flow_table_t));
978 }
979 
980 static void
mlxcx_teardown_ports(mlxcx_t * mlxp)981 mlxcx_teardown_ports(mlxcx_t *mlxp)
982 {
983 	uint_t i;
984 	mlxcx_port_t *p;
985 	mlxcx_flow_table_t *ft;
986 
987 	for (i = 0; i < mlxp->mlx_nports; ++i) {
988 		p = &mlxp->mlx_ports[i];
989 		if (!(p->mlp_init & MLXCX_PORT_INIT))
990 			continue;
991 		mutex_enter(&p->mlp_mtx);
992 		if ((ft = p->mlp_rx_flow) != NULL) {
993 			mutex_enter(&ft->mlft_mtx);
994 			/*
995 			 * teardown_flow_table() will destroy the mutex, so
996 			 * we don't release it here.
997 			 */
998 			mlxcx_teardown_flow_table(mlxp, ft);
999 		}
1000 		mutex_exit(&p->mlp_mtx);
1001 		mutex_destroy(&p->mlp_mtx);
1002 		mutex_destroy(&p->mlx_port_event.mla_mtx);
1003 		p->mlx_port_event.mla_mlx = NULL;
1004 		p->mlx_port_event.mla_port = NULL;
1005 		p->mlp_init &= ~MLXCX_PORT_INIT;
1006 	}
1007 
1008 	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
1009 	mlxp->mlx_ports = NULL;
1010 }
1011 
1012 static void
mlxcx_teardown_wqs(mlxcx_t * mlxp)1013 mlxcx_teardown_wqs(mlxcx_t *mlxp)
1014 {
1015 	mlxcx_work_queue_t *mlwq;
1016 
1017 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
1018 		mlxcx_wq_teardown(mlxp, mlwq);
1019 	}
1020 	list_destroy(&mlxp->mlx_wqs);
1021 }
1022 
1023 static void
mlxcx_teardown_cqs(mlxcx_t * mlxp)1024 mlxcx_teardown_cqs(mlxcx_t *mlxp)
1025 {
1026 	mlxcx_completion_queue_t *mlcq;
1027 
1028 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1029 		mlxcx_cq_teardown(mlxp, mlcq);
1030 	}
1031 	list_destroy(&mlxp->mlx_cqs);
1032 }
1033 
1034 static void
mlxcx_teardown_eqs(mlxcx_t * mlxp)1035 mlxcx_teardown_eqs(mlxcx_t *mlxp)
1036 {
1037 	mlxcx_event_queue_t *mleq;
1038 	uint_t i;
1039 
1040 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1041 		mleq = &mlxp->mlx_eqs[i];
1042 		mutex_enter(&mleq->mleq_mtx);
1043 		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
1044 		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
1045 			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
1046 				mlxcx_warn(mlxp, "failed to destroy "
1047 				    "event queue idx %u eqn %u",
1048 				    i, mleq->mleq_num);
1049 			}
1050 		}
1051 		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
1052 			mlxcx_eq_rele_dma(mlxp, mleq);
1053 		}
1054 		mutex_exit(&mleq->mleq_mtx);
1055 	}
1056 }
1057 
1058 static void
mlxcx_teardown_checktimers(mlxcx_t * mlxp)1059 mlxcx_teardown_checktimers(mlxcx_t *mlxp)
1060 {
1061 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
1062 		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
1063 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
1064 		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
1065 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
1066 		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
1067 }
1068 
1069 static void
mlxcx_teardown(mlxcx_t * mlxp)1070 mlxcx_teardown(mlxcx_t *mlxp)
1071 {
1072 	uint_t i;
1073 	dev_info_t *dip = mlxp->mlx_dip;
1074 
1075 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1076 		/*
1077 		 * Disable interrupts and let any active vectors quiesce.
1078 		 */
1079 		mlxcx_intr_disable(mlxp);
1080 	}
1081 
1082 	if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) {
1083 		mlxcx_teardown_sensors(mlxp);
1084 		mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS;
1085 	}
1086 
1087 	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
1088 		mlxcx_teardown_checktimers(mlxp);
1089 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
1090 	}
1091 
1092 	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
1093 		mlxcx_teardown_groups(mlxp);
1094 		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
1095 	}
1096 
1097 	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
1098 		mlxcx_teardown_wqs(mlxp);
1099 		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
1100 	}
1101 
1102 	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
1103 		mlxcx_teardown_cqs(mlxp);
1104 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
1105 	}
1106 
1107 	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
1108 		mlxcx_teardown_bufs(mlxp);
1109 		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
1110 	}
1111 
1112 	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
1113 		mlxcx_teardown_ports(mlxp);
1114 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
1115 	}
1116 
1117 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1118 		mlxcx_teardown_eqs(mlxp);
1119 		mlxcx_intr_teardown(mlxp);
1120 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
1121 	}
1122 
1123 	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
1124 		if (mlxp->mlx_uar.mlu_allocated) {
1125 			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
1126 				mlxcx_warn(mlxp, "failed to release UAR");
1127 			}
1128 			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
1129 				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
1130 		}
1131 		if (mlxp->mlx_pd.mlpd_allocated &&
1132 		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
1133 			mlxcx_warn(mlxp, "failed to release PD");
1134 		}
1135 		if (mlxp->mlx_tdom.mltd_allocated &&
1136 		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
1137 			mlxcx_warn(mlxp, "failed to release TDOM");
1138 		}
1139 		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
1140 	}
1141 
1142 	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
1143 		if (!mlxcx_cmd_teardown_hca(mlxp)) {
1144 			mlxcx_warn(mlxp, "failed to send teardown HCA "
1145 			    "command during device detach");
1146 		}
1147 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
1148 	}
1149 
1150 	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
1151 		mlxcx_teardown_pages(mlxp);
1152 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
1153 	}
1154 
1155 	if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
1156 		for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
1157 			mlxp->mlx_npages_req[i].mla_mlx = NULL;
1158 			mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
1159 		}
1160 		taskq_destroy(mlxp->mlx_async_tq);
1161 		mlxp->mlx_async_tq = NULL;
1162 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
1163 	}
1164 
1165 	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
1166 		if (!mlxcx_cmd_disable_hca(mlxp)) {
1167 			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
1168 			    "during device detach");
1169 		}
1170 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
1171 	}
1172 
1173 	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
1174 		mlxcx_cmd_queue_fini(mlxp);
1175 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
1176 	}
1177 
1178 	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
1179 		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
1180 		mlxp->mlx_caps = NULL;
1181 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
1182 	}
1183 
1184 	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
1185 		ddi_regs_map_free(&mlxp->mlx_regs_handle);
1186 		mlxp->mlx_regs_handle = NULL;
1187 		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
1188 	}
1189 
1190 	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
1191 		pci_config_teardown(&mlxp->mlx_cfg_handle);
1192 		mlxp->mlx_cfg_handle = NULL;
1193 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
1194 	}
1195 
1196 	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
1197 		mlxcx_fm_fini(mlxp);
1198 		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
1199 	}
1200 
1201 	VERIFY3S(mlxp->mlx_attach, ==, 0);
1202 	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
1203 	ddi_set_driver_private(dip, NULL);
1204 }
1205 
1206 static void
mlxcx_get_model(mlxcx_t * mlxp)1207 mlxcx_get_model(mlxcx_t *mlxp)
1208 {
1209 	uint16_t venid;
1210 	uint16_t devid;
1211 
1212 	venid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_VENID);
1213 	if (venid != MLXCX_VENDOR_ID) {
1214 		/* Currently, all supported cards have a Mellanox vendor id. */
1215 		mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
1216 		return;
1217 	}
1218 
1219 	devid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_DEVID);
1220 	switch (devid) {
1221 	case MLXCX_CX4_DEVID:
1222 	case MLXCX_CX4_VF_DEVID:
1223 	case MLXCX_CX4_LX_VF_DEVID:
1224 		mlxp->mlx_type = MLXCX_DEV_CX4;
1225 		break;
1226 	case MLXCX_CX5_DEVID:
1227 	case MLXCX_CX5_VF_DEVID:
1228 	case MLXCX_CX5_EX_DEVID:
1229 	case MLXCX_CX5_EX_VF_DEVID:
1230 	case MLXCX_CX5_GEN_VF_DEVID:
1231 		mlxp->mlx_type = MLXCX_DEV_CX5;
1232 		break;
1233 	case MLXCX_CX6_DEVID:
1234 	case MLXCX_CX6_VF_DEVID:
1235 	case MLXCX_CX6_DF_DEVID:
1236 	case MLXCX_CX6_LX_DEVID:
1237 		mlxp->mlx_type = MLXCX_DEV_CX6;
1238 		break;
1239 	default:
1240 		mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
1241 	}
1242 }
1243 
1244 static boolean_t
mlxcx_regs_map(mlxcx_t * mlxp)1245 mlxcx_regs_map(mlxcx_t *mlxp)
1246 {
1247 	off_t memsize;
1248 	int ret;
1249 	ddi_device_acc_attr_t da;
1250 
1251 	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
1252 	    DDI_SUCCESS) {
1253 		mlxcx_warn(mlxp, "failed to get register set size");
1254 		return (B_FALSE);
1255 	}
1256 
1257 	/*
1258 	 * All data in the main BAR is kept in big-endian even though it's a PCI
1259 	 * device.
1260 	 */
1261 	bzero(&da, sizeof (ddi_device_acc_attr_t));
1262 	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
1263 	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
1264 	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
1265 	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
1266 		da.devacc_attr_access = DDI_FLAGERR_ACC;
1267 	} else {
1268 		da.devacc_attr_access = DDI_DEFAULT_ACC;
1269 	}
1270 
1271 	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
1272 	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);
1273 
1274 	if (ret != DDI_SUCCESS) {
1275 		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
1276 		return (B_FALSE);
1277 	}
1278 
1279 	return (B_TRUE);
1280 }
1281 
1282 static boolean_t
mlxcx_check_issi(mlxcx_t * mlxp)1283 mlxcx_check_issi(mlxcx_t *mlxp)
1284 {
1285 	uint32_t issi;
1286 
1287 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1288 		mlxcx_warn(mlxp, "failed to get ISSI");
1289 		return (B_FALSE);
1290 	}
1291 
1292 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1293 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1294 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1295 		return (B_FALSE);
1296 	}
1297 
1298 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1299 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1300 		    MLXCX_CURRENT_ISSI);
1301 		return (B_FALSE);
1302 	}
1303 
1304 	return (B_TRUE);
1305 }
1306 
1307 boolean_t
mlxcx_give_pages(mlxcx_t * mlxp,int32_t npages,int32_t * ngiven)1308 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
1309 {
1310 	ddi_device_acc_attr_t acc;
1311 	ddi_dma_attr_t attr;
1312 	int32_t i;
1313 	list_t plist;
1314 	mlxcx_dev_page_t *mdp;
1315 	mlxcx_dev_page_t **pages;
1316 	const ddi_dma_cookie_t *ck;
1317 
1318 	/*
1319 	 * If there are no pages required, then we're done here.
1320 	 */
1321 	if (npages <= 0) {
1322 		*ngiven = 0;
1323 		return (B_TRUE);
1324 	}
1325 
1326 	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
1327 
1328 	pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);
1329 
1330 	list_create(&plist, sizeof (mlxcx_dev_page_t),
1331 	    offsetof(mlxcx_dev_page_t, mxdp_list));
1332 
1333 	for (i = 0; i < npages; i++) {
1334 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
1335 		mlxcx_dma_acc_attr(mlxp, &acc);
1336 		mlxcx_dma_page_attr(mlxp, &attr);
1337 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
1338 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
1339 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
1340 			    npages);
1341 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1342 			goto cleanup_npages;
1343 		}
1344 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
1345 		mdp->mxdp_pa = ck->dmac_laddress;
1346 
1347 		list_insert_tail(&plist, mdp);
1348 	}
1349 
1350 	/*
1351 	 * Now that all of the pages have been allocated, given them to hardware
1352 	 * in chunks.
1353 	 */
1354 	for (i = 0; i < npages; i++) {
1355 		pages[i] = list_remove_head(&plist);
1356 	}
1357 
1358 	if (!mlxcx_cmd_give_pages(mlxp,
1359 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
1360 		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
1361 		    "pages!", npages);
1362 		for (i = 0; i < npages; i++) {
1363 			list_insert_tail(&plist, pages[i]);
1364 		}
1365 		goto cleanup_npages;
1366 	}
1367 
1368 	mutex_enter(&mlxp->mlx_pagemtx);
1369 	for (i = 0; i < npages; i++) {
1370 		avl_add(&mlxp->mlx_pages, pages[i]);
1371 	}
1372 	mlxp->mlx_npages += npages;
1373 	mutex_exit(&mlxp->mlx_pagemtx);
1374 
1375 	list_destroy(&plist);
1376 	kmem_free(pages, sizeof (*pages) * npages);
1377 
1378 	*ngiven = npages;
1379 
1380 	return (B_TRUE);
1381 
1382 cleanup_npages:
1383 	kmem_free(pages, sizeof (*pages) * npages);
1384 	while ((mdp = list_remove_head(&plist)) != NULL) {
1385 		mlxcx_dma_free(&mdp->mxdp_dma);
1386 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1387 	}
1388 	list_destroy(&plist);
1389 	return (B_FALSE);
1390 }
1391 
1392 static boolean_t
mlxcx_init_pages(mlxcx_t * mlxp,uint_t type)1393 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1394 {
1395 	int32_t npages, given;
1396 
1397 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1398 		mlxcx_warn(mlxp, "failed to determine boot pages");
1399 		return (B_FALSE);
1400 	}
1401 
1402 	while (npages > 0) {
1403 		if (!mlxcx_give_pages(mlxp, npages, &given))
1404 			return (B_FALSE);
1405 
1406 		npages -= given;
1407 	}
1408 
1409 	return (B_TRUE);
1410 }
1411 
1412 static int
mlxcx_bufs_cache_constr(void * arg,void * cookie,int kmflags)1413 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1414 {
1415 	mlxcx_t *mlxp = cookie;
1416 	mlxcx_buffer_t *b = arg;
1417 
1418 	bzero(b, sizeof (mlxcx_buffer_t));
1419 	b->mlb_mlx = mlxp;
1420 	b->mlb_state = MLXCX_BUFFER_INIT;
1421 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1422 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1423 
1424 	return (0);
1425 }
1426 
1427 static void
mlxcx_bufs_cache_destr(void * arg,void * cookie)1428 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1429 {
1430 	mlxcx_t *mlxp = cookie;
1431 	mlxcx_buffer_t *b = arg;
1432 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1433 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1434 	list_destroy(&b->mlb_tx_chain);
1435 }
1436 
1437 mlxcx_buf_shard_t *
mlxcx_mlbs_create(mlxcx_t * mlxp)1438 mlxcx_mlbs_create(mlxcx_t *mlxp)
1439 {
1440 	mlxcx_buf_shard_t *s;
1441 
1442 	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1443 
1444 	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1445 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1446 	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1447 	    offsetof(mlxcx_buffer_t, mlb_entry));
1448 	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1449 	    offsetof(mlxcx_buffer_t, mlb_entry));
1450 	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
1451 	    offsetof(mlxcx_buffer_t, mlb_entry));
1452 	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1453 
1454 	list_insert_tail(&mlxp->mlx_buf_shards, s);
1455 
1456 	return (s);
1457 }
1458 
1459 static boolean_t
mlxcx_setup_bufs(mlxcx_t * mlxp)1460 mlxcx_setup_bufs(mlxcx_t *mlxp)
1461 {
1462 	char namebuf[KSTAT_STRLEN];
1463 
1464 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1465 	    ddi_get_instance(mlxp->mlx_dip));
1466 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1467 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1468 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1469 	    NULL, mlxp, NULL, 0);
1470 
1471 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1472 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1473 
1474 	return (B_TRUE);
1475 }
1476 
1477 static void
mlxcx_fm_qstate_ereport(mlxcx_t * mlxp,const char * qtype,uint32_t qnum,const char * state,uint8_t statenum)1478 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
1479     const char *state, uint8_t statenum)
1480 {
1481 	uint64_t ena;
1482 	char buf[FM_MAX_CLASS];
1483 
1484 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1485 		return;
1486 
1487 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1488 	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
1489 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1490 
1491 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1492 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1493 	    "state", DATA_TYPE_STRING, state,
1494 	    "state_num", DATA_TYPE_UINT8, statenum,
1495 	    "qtype", DATA_TYPE_STRING, qtype,
1496 	    "qnum", DATA_TYPE_UINT32, qnum,
1497 	    NULL);
1498 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1499 }
1500 
1501 /*
1502  * The following set of routines are for monitoring the health of
1503  * event, completion and work queues. They run infrequently peeking at
1504  * the structs to catch stalls and inconsistent state.
1505  *
1506  * They peek at the structs *without* acquiring locks - we don't want
1507  * to impede flow of data. Driver start up and shutdown semantics
1508  * guarantee the structs are present and won't disappear underneath
1509  * these routines.
1510  *
1511  * As previously noted, the routines peek at active data in the structs and
1512  * they will store some values for comparison on next invocation. To
1513  * maintain integrity of the saved values, these values are only modified
1514  * within these routines.
1515  */
1516 static void
mlxcx_eq_check(void * arg)1517 mlxcx_eq_check(void *arg)
1518 {
1519 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1520 	mlxcx_event_queue_t *eq;
1521 	mlxcx_eventq_ctx_t ctx;
1522 	const char *str;
1523 
1524 	uint_t i;
1525 
1526 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1527 		eq = &mlxp->mlx_eqs[i];
1528 
1529 		if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
1530 			continue;
1531 
1532 		/*
1533 		 * If the event queue was successfully created in the HCA,
1534 		 * then initialization and shutdown sequences guarantee
1535 		 * the queue exists.
1536 		 */
1537 		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);
1538 
1539 		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
1540 			continue;
1541 
1542 		str = "???";
1543 		switch (ctx.mleqc_status) {
1544 		case MLXCX_EQ_STATUS_OK:
1545 			break;
1546 		case MLXCX_EQ_STATUS_WRITE_FAILURE:
1547 			str = "WRITE_FAILURE";
1548 			break;
1549 		}
1550 
1551 		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
1552 			mlxcx_fm_qstate_ereport(mlxp, "event",
1553 			    eq->mleq_num, str, ctx.mleqc_status);
1554 			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
1555 			    eq->mleq_intr_index, ctx.mleqc_status, str);
1556 		}
1557 
1558 		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
1559 		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
1560 			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
1561 			    ++eq->mleq_check_disarm_cnt >= 3) {
1562 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1563 				mlxcx_warn(mlxp, "EQ %u isn't armed",
1564 				    eq->mleq_intr_index);
1565 			}
1566 			eq->mleq_check_disarm_cc = eq->mleq_cc;
1567 		} else {
1568 			eq->mleq_check_disarm_cc = 0;
1569 			eq->mleq_check_disarm_cnt = 0;
1570 		}
1571 	}
1572 }
1573 
1574 static void
mlxcx_cq_check(void * arg)1575 mlxcx_cq_check(void *arg)
1576 {
1577 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1578 	mlxcx_completion_queue_t *cq;
1579 	mlxcx_completionq_ctx_t ctx;
1580 	const char *str, *type;
1581 	uint_t v;
1582 
1583 	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
1584 	    cq = list_next(&mlxp->mlx_cqs, cq)) {
1585 
1586 		if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
1587 			continue;
1588 
1589 		/*
1590 		 * If the completion queue was successfully created in the HCA,
1591 		 * then initialization and shutdown sequences guarantee
1592 		 * the queue exists.
1593 		 */
1594 		ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
1595 		ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);
1596 
1597 		if (cq->mlcq_fm_repd_qstate)
1598 			continue;
1599 
1600 		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
1601 			continue;
1602 
1603 		if (cq->mlcq_wq != NULL) {
1604 			mlxcx_work_queue_t *wq = cq->mlcq_wq;
1605 			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
1606 				type = "rx ";
1607 			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
1608 				type = "tx ";
1609 			else
1610 				type = "";
1611 		} else {
1612 			type = "";
1613 		}
1614 
1615 		str = "???";
1616 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
1617 		switch (v) {
1618 		case MLXCX_CQC_STATUS_OK:
1619 			break;
1620 		case MLXCX_CQC_STATUS_OVERFLOW:
1621 			str = "OVERFLOW";
1622 			break;
1623 		case MLXCX_CQC_STATUS_WRITE_FAIL:
1624 			str = "WRITE_FAIL";
1625 			break;
1626 		case MLXCX_CQC_STATUS_INVALID:
1627 			str = "INVALID";
1628 			break;
1629 		}
1630 
1631 		if (v != MLXCX_CQC_STATUS_OK) {
1632 			mlxcx_fm_qstate_ereport(mlxp, "completion",
1633 			    cq->mlcq_num, str, v);
1634 			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
1635 			    type, cq->mlcq_num, v, str);
1636 			cq->mlcq_fm_repd_qstate = B_TRUE;
1637 		}
1638 
1639 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
1640 		if (v != MLXCX_CQC_STATE_ARMED &&
1641 		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
1642 		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
1643 			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
1644 			    ++cq->mlcq_check_disarm_cnt >= 3) {
1645 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1646 				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
1647 				    type, cq->mlcq_num, cq);
1648 			}
1649 			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
1650 		} else {
1651 			cq->mlcq_check_disarm_cnt = 0;
1652 			cq->mlcq_check_disarm_cc = 0;
1653 		}
1654 	}
1655 }
1656 
1657 void
mlxcx_check_sq(mlxcx_t * mlxp,mlxcx_work_queue_t * sq)1658 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
1659 {
1660 	mlxcx_sq_ctx_t ctx;
1661 	mlxcx_sq_state_t state;
1662 
1663 	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
1664 		return;
1665 
1666 	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
1667 	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
1668 	switch (state) {
1669 	case MLXCX_SQ_STATE_RST:
1670 		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1671 			mlxcx_fm_qstate_ereport(mlxp, "send",
1672 			    sq->mlwq_num, "RST", state);
1673 			sq->mlwq_fm_repd_qstate = B_TRUE;
1674 		}
1675 		break;
1676 	case MLXCX_SQ_STATE_RDY:
1677 		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
1678 			mlxcx_fm_qstate_ereport(mlxp, "send",
1679 			    sq->mlwq_num, "RDY", state);
1680 			sq->mlwq_fm_repd_qstate = B_TRUE;
1681 		}
1682 		break;
1683 	case MLXCX_SQ_STATE_ERR:
1684 		mlxcx_fm_qstate_ereport(mlxp, "send",
1685 		    sq->mlwq_num, "ERR", state);
1686 		sq->mlwq_fm_repd_qstate = B_TRUE;
1687 		break;
1688 	default:
1689 		mlxcx_fm_qstate_ereport(mlxp, "send",
1690 		    sq->mlwq_num, "???", state);
1691 		sq->mlwq_fm_repd_qstate = B_TRUE;
1692 		break;
1693 	}
1694 }
1695 
1696 void
mlxcx_check_rq(mlxcx_t * mlxp,mlxcx_work_queue_t * rq)1697 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1698 {
1699 	mlxcx_rq_ctx_t ctx;
1700 	mlxcx_rq_state_t state;
1701 
1702 
1703 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1704 		return;
1705 
1706 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1707 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1708 	switch (state) {
1709 	case MLXCX_RQ_STATE_RST:
1710 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1711 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1712 			    rq->mlwq_num, "RST", state);
1713 			rq->mlwq_fm_repd_qstate = B_TRUE;
1714 		}
1715 		break;
1716 	case MLXCX_RQ_STATE_RDY:
1717 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1718 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1719 			    rq->mlwq_num, "RDY", state);
1720 			rq->mlwq_fm_repd_qstate = B_TRUE;
1721 		}
1722 		break;
1723 	case MLXCX_RQ_STATE_ERR:
1724 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1725 		    rq->mlwq_num, "ERR", state);
1726 		rq->mlwq_fm_repd_qstate = B_TRUE;
1727 		break;
1728 	default:
1729 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1730 		    rq->mlwq_num, "???", state);
1731 		rq->mlwq_fm_repd_qstate = B_TRUE;
1732 		break;
1733 	}
1734 }
1735 
1736 static void
mlxcx_wq_check(void * arg)1737 mlxcx_wq_check(void *arg)
1738 {
1739 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1740 	mlxcx_work_queue_t *wq;
1741 
1742 	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
1743 	    wq = list_next(&mlxp->mlx_wqs, wq)) {
1744 
1745 		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
1746 			continue;
1747 
1748 		/*
1749 		 * If the work queue was successfully created in the HCA,
1750 		 * then initialization and shutdown sequences guarantee
1751 		 * the queue exists.
1752 		 */
1753 		ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
1754 		ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);
1755 
1756 		if (wq->mlwq_fm_repd_qstate)
1757 			continue;
1758 
1759 		switch (wq->mlwq_type) {
1760 		case MLXCX_WQ_TYPE_SENDQ:
1761 			mlxcx_check_sq(mlxp, wq);
1762 			break;
1763 		case MLXCX_WQ_TYPE_RECVQ:
1764 			mlxcx_check_rq(mlxp, wq);
1765 			break;
1766 		}
1767 	}
1768 }
1769 
1770 static boolean_t
mlxcx_setup_checktimers(mlxcx_t * mlxp)1771 mlxcx_setup_checktimers(mlxcx_t *mlxp)
1772 {
1773 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
1774 		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
1775 		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
1776 		    DDI_IPL_0);
1777 	}
1778 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
1779 		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
1780 		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
1781 		    DDI_IPL_0);
1782 	}
1783 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
1784 		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
1785 		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
1786 		    DDI_IPL_0);
1787 	}
1788 	return (B_TRUE);
1789 }
1790 
1791 int
mlxcx_dmac_fe_compare(const void * arg0,const void * arg1)1792 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1793 {
1794 	const mlxcx_flow_entry_t *left = arg0;
1795 	const mlxcx_flow_entry_t *right = arg1;
1796 	int bcmpr;
1797 
1798 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1799 	    sizeof (left->mlfe_dmac));
1800 	if (bcmpr < 0)
1801 		return (-1);
1802 	if (bcmpr > 0)
1803 		return (1);
1804 	if (left->mlfe_vid < right->mlfe_vid)
1805 		return (-1);
1806 	if (left->mlfe_vid > right->mlfe_vid)
1807 		return (1);
1808 	return (0);
1809 }
1810 
1811 int
mlxcx_grmac_compare(const void * arg0,const void * arg1)1812 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1813 {
1814 	const mlxcx_group_mac_t *left = arg0;
1815 	const mlxcx_group_mac_t *right = arg1;
1816 	int bcmpr;
1817 
1818 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1819 	    sizeof (left->mlgm_mac));
1820 	if (bcmpr < 0)
1821 		return (-1);
1822 	if (bcmpr > 0)
1823 		return (1);
1824 	return (0);
1825 }
1826 
1827 int
mlxcx_page_compare(const void * arg0,const void * arg1)1828 mlxcx_page_compare(const void *arg0, const void *arg1)
1829 {
1830 	const mlxcx_dev_page_t *p0 = arg0;
1831 	const mlxcx_dev_page_t *p1 = arg1;
1832 
1833 	if (p0->mxdp_pa < p1->mxdp_pa)
1834 		return (-1);
1835 	if (p0->mxdp_pa > p1->mxdp_pa)
1836 		return (1);
1837 	return (0);
1838 }
1839 
1840 static boolean_t
mlxcx_setup_ports(mlxcx_t * mlxp)1841 mlxcx_setup_ports(mlxcx_t *mlxp)
1842 {
1843 	uint_t i, j;
1844 	mlxcx_port_t *p;
1845 	mlxcx_flow_table_t *ft;
1846 	mlxcx_flow_group_t *fg;
1847 	mlxcx_flow_entry_t *fe;
1848 
1849 	VERIFY3U(mlxp->mlx_nports, >, 0);
1850 	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
1851 	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);
1852 
1853 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1854 		p = &mlxp->mlx_ports[i];
1855 		p->mlp_num = i;
1856 		p->mlx_port_event.mla_mlx = mlxp;
1857 		p->mlx_port_event.mla_port = p;
1858 		mutex_init(&p->mlx_port_event.mla_mtx, NULL,
1859 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
1860 		p->mlp_init |= MLXCX_PORT_INIT;
1861 		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
1862 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1863 		mutex_enter(&p->mlp_mtx);
1864 		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
1865 			mutex_exit(&p->mlp_mtx);
1866 			goto err;
1867 		}
1868 		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
1869 			mutex_exit(&p->mlp_mtx);
1870 			goto err;
1871 		}
1872 		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
1873 			mutex_exit(&p->mlp_mtx);
1874 			goto err;
1875 		}
1876 		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
1877 			mutex_exit(&p->mlp_mtx);
1878 			goto err;
1879 		}
1880 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
1881 		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
1882 			mutex_exit(&p->mlp_mtx);
1883 			goto err;
1884 		}
1885 		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
1886 			mutex_exit(&p->mlp_mtx);
1887 			goto err;
1888 		}
1889 		p->mlp_fec_requested = LINK_FEC_AUTO;
1890 
1891 		mutex_exit(&p->mlp_mtx);
1892 	}
1893 
1894 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1895 		p = &mlxp->mlx_ports[i];
1896 		mutex_enter(&p->mlp_mtx);
1897 		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1898 		    KM_SLEEP));
1899 		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1900 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1901 
1902 		mutex_enter(&ft->mlft_mtx);
1903 
1904 		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1905 		ft->mlft_port = p;
1906 		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
1907 		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
1908 			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
1909 		ft->mlft_nents = (1 << ft->mlft_entshift);
1910 		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1911 		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1912 		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1913 		    offsetof(mlxcx_flow_group_t, mlfg_entry));
1914 
1915 		for (j = 0; j < ft->mlft_nents; ++j) {
1916 			ft->mlft_ent[j].mlfe_table = ft;
1917 			ft->mlft_ent[j].mlfe_index = j;
1918 		}
1919 
1920 		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1921 			mutex_exit(&ft->mlft_mtx);
1922 			mutex_exit(&p->mlp_mtx);
1923 			goto err;
1924 		}
1925 
1926 		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
1927 			mutex_exit(&ft->mlft_mtx);
1928 			mutex_exit(&p->mlp_mtx);
1929 			goto err;
1930 		}
1931 
1932 		/*
1933 		 * We match broadcast at the top of the root flow table, then
1934 		 * all multicast/unicast MACs, then the promisc entry is down
1935 		 * the very bottom.
1936 		 *
1937 		 * This way when promisc is on, that entry simply catches any
1938 		 * remaining traffic that earlier flows haven't matched.
1939 		 */
1940 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1941 		list_insert_tail(&ft->mlft_groups, fg);
1942 		fg->mlfg_table = ft;
1943 		fg->mlfg_size = 1;
1944 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1945 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1946 			mutex_exit(&ft->mlft_mtx);
1947 			mutex_exit(&p->mlp_mtx);
1948 			goto err;
1949 		}
1950 		p->mlp_bcast = fg;
1951 		fe = list_head(&fg->mlfg_entries);
1952 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1953 		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
1954 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1955 
1956 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1957 		list_insert_tail(&ft->mlft_groups, fg);
1958 		fg->mlfg_table = ft;
1959 		fg->mlfg_size = ft->mlft_nents - 2;
1960 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1961 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1962 			mutex_exit(&ft->mlft_mtx);
1963 			mutex_exit(&p->mlp_mtx);
1964 			goto err;
1965 		}
1966 		p->mlp_umcast = fg;
1967 
1968 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1969 		list_insert_tail(&ft->mlft_groups, fg);
1970 		fg->mlfg_table = ft;
1971 		fg->mlfg_size = 1;
1972 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1973 			mutex_exit(&ft->mlft_mtx);
1974 			mutex_exit(&p->mlp_mtx);
1975 			goto err;
1976 		}
1977 		p->mlp_promisc = fg;
1978 		fe = list_head(&fg->mlfg_entries);
1979 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1980 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1981 
1982 		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
1983 		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
1984 		    mlfe_dmac_entry));
1985 
1986 		mutex_exit(&ft->mlft_mtx);
1987 		mutex_exit(&p->mlp_mtx);
1988 	}
1989 
1990 	return (B_TRUE);
1991 
1992 err:
1993 	mlxcx_teardown_ports(mlxp);
1994 	return (B_FALSE);
1995 }
1996 
1997 void
mlxcx_remove_all_vlan_entries(mlxcx_t * mlxp,mlxcx_ring_group_t * g)1998 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1999 {
2000 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2001 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2002 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2003 	mlxcx_flow_entry_t *fe;
2004 	mlxcx_group_vlan_t *v;
2005 
2006 	ASSERT(mutex_owned(&g->mlg_mtx));
2007 
2008 	mutex_enter(&ft->mlft_mtx);
2009 
2010 	if (!list_is_empty(&g->mlg_rx_vlans)) {
2011 		fe = list_head(&dfg->mlfg_entries);
2012 		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2013 	}
2014 
2015 	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
2016 		fe = v->mlgv_fe;
2017 		ASSERT3P(fe->mlfe_table, ==, ft);
2018 		ASSERT3P(fe->mlfe_group, ==, fg);
2019 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
2020 
2021 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2022 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2023 	}
2024 
2025 	mutex_exit(&ft->mlft_mtx);
2026 }
2027 
2028 boolean_t
mlxcx_remove_vlan_entry(mlxcx_t * mlxp,mlxcx_ring_group_t * g,boolean_t tagged,uint16_t vid)2029 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
2030     boolean_t tagged, uint16_t vid)
2031 {
2032 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2033 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2034 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2035 	mlxcx_flow_entry_t *fe;
2036 	mlxcx_group_vlan_t *v;
2037 	boolean_t found = B_FALSE;
2038 
2039 	ASSERT(mutex_owned(&g->mlg_mtx));
2040 
2041 	mutex_enter(&ft->mlft_mtx);
2042 
2043 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2044 	    v = list_next(&g->mlg_rx_vlans, v)) {
2045 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2046 			found = B_TRUE;
2047 			break;
2048 		}
2049 	}
2050 	if (!found) {
2051 		mutex_exit(&ft->mlft_mtx);
2052 		return (B_FALSE);
2053 	}
2054 
2055 	list_remove(&g->mlg_rx_vlans, v);
2056 
2057 	/*
2058 	 * If this is the last VLAN entry, we have to go back to accepting
2059 	 * any VLAN (which means re-enabling the default entry).
2060 	 *
2061 	 * Do this before we remove the flow entry for the last specific
2062 	 * VLAN so that we don't lose any traffic in the transition.
2063 	 */
2064 	if (list_is_empty(&g->mlg_rx_vlans)) {
2065 		fe = list_head(&dfg->mlfg_entries);
2066 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2067 			list_insert_tail(&g->mlg_rx_vlans, v);
2068 			mutex_exit(&ft->mlft_mtx);
2069 			return (B_FALSE);
2070 		}
2071 	}
2072 
2073 	fe = v->mlgv_fe;
2074 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
2075 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
2076 	ASSERT3P(fe->mlfe_table, ==, ft);
2077 	ASSERT3P(fe->mlfe_group, ==, fg);
2078 
2079 	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
2080 		list_insert_tail(&g->mlg_rx_vlans, v);
2081 		fe = list_head(&dfg->mlfg_entries);
2082 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
2083 			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2084 		}
2085 		mutex_exit(&ft->mlft_mtx);
2086 		return (B_FALSE);
2087 	}
2088 
2089 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2090 
2091 	kmem_free(v, sizeof (mlxcx_group_vlan_t));
2092 
2093 	mutex_exit(&ft->mlft_mtx);
2094 	return (B_TRUE);
2095 }
2096 
2097 boolean_t
mlxcx_add_vlan_entry(mlxcx_t * mlxp,mlxcx_ring_group_t * g,boolean_t tagged,uint16_t vid)2098 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
2099     uint16_t vid)
2100 {
2101 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2102 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2103 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2104 	mlxcx_flow_entry_t *fe;
2105 	mlxcx_group_vlan_t *v;
2106 	boolean_t found = B_FALSE;
2107 	boolean_t first = B_FALSE;
2108 
2109 	ASSERT(mutex_owned(&g->mlg_mtx));
2110 
2111 	mutex_enter(&ft->mlft_mtx);
2112 
2113 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2114 	    v = list_next(&g->mlg_rx_vlans, v)) {
2115 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2116 			mutex_exit(&ft->mlft_mtx);
2117 			return (B_TRUE);
2118 		}
2119 	}
2120 	if (list_is_empty(&g->mlg_rx_vlans))
2121 		first = B_TRUE;
2122 
2123 	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2124 	    fe = list_next(&fg->mlfg_entries, fe)) {
2125 		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2126 			found = B_TRUE;
2127 			break;
2128 		}
2129 	}
2130 	if (!found) {
2131 		mutex_exit(&ft->mlft_mtx);
2132 		return (B_FALSE);
2133 	}
2134 
2135 	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
2136 	v->mlgv_fe = fe;
2137 	v->mlgv_tagged = tagged;
2138 	v->mlgv_vid = vid;
2139 
2140 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2141 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2142 	fe->mlfe_vid = vid;
2143 	if (tagged) {
2144 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
2145 	} else {
2146 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
2147 	}
2148 
2149 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2150 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2151 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2152 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
2153 		mutex_exit(&ft->mlft_mtx);
2154 		return (B_FALSE);
2155 	}
2156 
2157 	list_insert_tail(&g->mlg_rx_vlans, v);
2158 
2159 	/*
2160 	 * If the vlan list was empty for this group before adding this one,
2161 	 * then we no longer want the "default" entry to allow all VLANs
2162 	 * through.
2163 	 */
2164 	if (first) {
2165 		fe = list_head(&dfg->mlfg_entries);
2166 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2167 	}
2168 
2169 	mutex_exit(&ft->mlft_mtx);
2170 	return (B_TRUE);
2171 }
2172 
2173 void
mlxcx_remove_all_umcast_entries(mlxcx_t * mlxp,mlxcx_port_t * port,mlxcx_ring_group_t * group)2174 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
2175     mlxcx_ring_group_t *group)
2176 {
2177 	mlxcx_flow_entry_t *fe;
2178 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2179 	mlxcx_group_mac_t *gm, *ngm;
2180 
2181 	ASSERT(mutex_owned(&port->mlp_mtx));
2182 	ASSERT(mutex_owned(&group->mlg_mtx));
2183 
2184 	mutex_enter(&ft->mlft_mtx);
2185 
2186 	gm = avl_first(&group->mlg_rx_macs);
2187 	for (; gm != NULL; gm = ngm) {
2188 		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);
2189 
2190 		ASSERT3P(gm->mlgm_group, ==, group);
2191 		fe = gm->mlgm_fe;
2192 		ASSERT3P(fe->mlfe_table, ==, ft);
2193 
2194 		avl_remove(&group->mlg_rx_macs, gm);
2195 		list_remove(&fe->mlfe_ring_groups, gm);
2196 		kmem_free(gm, sizeof (mlxcx_group_mac_t));
2197 
2198 		fe->mlfe_ndest = 0;
2199 		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2200 		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2201 			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2202 			    gm->mlgm_group->mlg_rx_vlan_ft;
2203 		}
2204 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2205 
2206 		if (fe->mlfe_ndest > 0) {
2207 			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2208 			continue;
2209 		}
2210 
2211 		/*
2212 		 * There are no more ring groups left for this MAC (it wasn't
2213 		 * attached to any other groups since ndest == 0), so clean up
2214 		 * its flow entry.
2215 		 */
2216 		avl_remove(&port->mlp_dmac_fe, fe);
2217 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2218 		list_destroy(&fe->mlfe_ring_groups);
2219 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2220 	}
2221 
2222 	mutex_exit(&ft->mlft_mtx);
2223 }
2224 
2225 boolean_t
mlxcx_remove_umcast_entry(mlxcx_t * mlxp,mlxcx_port_t * port,mlxcx_ring_group_t * group,const uint8_t * macaddr)2226 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2227     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2228 {
2229 	mlxcx_flow_entry_t *fe;
2230 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2231 	mlxcx_group_mac_t *gm, probe;
2232 
2233 	ASSERT(mutex_owned(&port->mlp_mtx));
2234 	ASSERT(mutex_owned(&group->mlg_mtx));
2235 
2236 	bzero(&probe, sizeof (probe));
2237 	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));
2238 
2239 	mutex_enter(&ft->mlft_mtx);
2240 
2241 	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
2242 	if (gm == NULL) {
2243 		mutex_exit(&ft->mlft_mtx);
2244 		return (B_FALSE);
2245 	}
2246 	ASSERT3P(gm->mlgm_group, ==, group);
2247 	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));
2248 
2249 	fe = gm->mlgm_fe;
2250 	ASSERT3P(fe->mlfe_table, ==, ft);
2251 	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));
2252 
2253 	list_remove(&fe->mlfe_ring_groups, gm);
2254 	avl_remove(&group->mlg_rx_macs, gm);
2255 	kmem_free(gm, sizeof (mlxcx_group_mac_t));
2256 
2257 	fe->mlfe_ndest = 0;
2258 	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2259 	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2260 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2261 		    gm->mlgm_group->mlg_rx_vlan_ft;
2262 	}
2263 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2264 
2265 	if (fe->mlfe_ndest > 0) {
2266 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2267 			mutex_exit(&ft->mlft_mtx);
2268 			return (B_FALSE);
2269 		}
2270 		mutex_exit(&ft->mlft_mtx);
2271 		return (B_TRUE);
2272 	}
2273 
2274 	/*
2275 	 * There are no more ring groups left for this MAC (it wasn't attached
2276 	 * to any other groups since ndest == 0), so clean up its flow entry.
2277 	 */
2278 	avl_remove(&port->mlp_dmac_fe, fe);
2279 	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2280 	list_destroy(&fe->mlfe_ring_groups);
2281 
2282 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2283 
2284 	mutex_exit(&ft->mlft_mtx);
2285 
2286 	return (B_TRUE);
2287 }
2288 
2289 boolean_t
mlxcx_add_umcast_entry(mlxcx_t * mlxp,mlxcx_port_t * port,mlxcx_ring_group_t * group,const uint8_t * macaddr)2290 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2291     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2292 {
2293 	mlxcx_flow_group_t *fg;
2294 	mlxcx_flow_entry_t *fe, probe;
2295 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2296 	mlxcx_group_mac_t *gm;
2297 	boolean_t found = B_FALSE;
2298 
2299 	ASSERT(mutex_owned(&port->mlp_mtx));
2300 	ASSERT(mutex_owned(&group->mlg_mtx));
2301 
2302 	bzero(&probe, sizeof (probe));
2303 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2304 
2305 	mutex_enter(&ft->mlft_mtx);
2306 
2307 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2308 
2309 	if (fe == NULL) {
2310 		fg = port->mlp_umcast;
2311 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2312 		    fe = list_next(&fg->mlfg_entries, fe)) {
2313 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2314 				found = B_TRUE;
2315 				break;
2316 			}
2317 		}
2318 		if (!found) {
2319 			mutex_exit(&ft->mlft_mtx);
2320 			return (B_FALSE);
2321 		}
2322 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2323 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2324 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2325 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2326 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2327 
2328 		avl_add(&port->mlp_dmac_fe, fe);
2329 	}
2330 
2331 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2332 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2333 
2334 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2335 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2336 		if (--fe->mlfe_ndest == 0) {
2337 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2338 		}
2339 		mutex_exit(&ft->mlft_mtx);
2340 		return (B_FALSE);
2341 	}
2342 
2343 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2344 	gm->mlgm_group = group;
2345 	gm->mlgm_fe = fe;
2346 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2347 	avl_add(&group->mlg_rx_macs, gm);
2348 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2349 
2350 	mutex_exit(&ft->mlft_mtx);
2351 
2352 	return (B_TRUE);
2353 }
2354 
2355 boolean_t
mlxcx_setup_flow_group(mlxcx_t * mlxp,mlxcx_flow_table_t * ft,mlxcx_flow_group_t * fg)2356 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2357     mlxcx_flow_group_t *fg)
2358 {
2359 	mlxcx_flow_entry_t *fe;
2360 	uint_t i, idx;
2361 
2362 	ASSERT(mutex_owned(&ft->mlft_mtx));
2363 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2364 	ASSERT3P(fg->mlfg_table, ==, ft);
2365 
2366 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2367 		return (B_FALSE);
2368 	fg->mlfg_start_idx = ft->mlft_next_ent;
2369 
2370 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2371 		return (B_FALSE);
2372 	}
2373 
2374 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2375 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2376 	for (i = 0; i < fg->mlfg_size; ++i) {
2377 		idx = fg->mlfg_start_idx + i;
2378 		fe = &ft->mlft_ent[idx];
2379 		fe->mlfe_group = fg;
2380 		list_insert_tail(&fg->mlfg_entries, fe);
2381 	}
2382 	fg->mlfg_avail = fg->mlfg_size;
2383 	ft->mlft_next_ent += fg->mlfg_size;
2384 
2385 	return (B_TRUE);
2386 }
2387 
2388 static boolean_t
mlxcx_setup_eq(mlxcx_t * mlxp,uint_t vec,uint64_t events)2389 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
2390 {
2391 	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];
2392 
2393 	mutex_enter(&mleq->mleq_mtx);
2394 	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2395 		/* mlxcx_teardown_eqs() will clean this up */
2396 		mutex_exit(&mleq->mleq_mtx);
2397 		return (B_FALSE);
2398 	}
2399 	mleq->mleq_mlx = mlxp;
2400 	mleq->mleq_uar = &mlxp->mlx_uar;
2401 	mleq->mleq_events = events;
2402 	mleq->mleq_intr_index = vec;
2403 
2404 	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2405 		/* mlxcx_teardown_eqs() will clean this up */
2406 		mutex_exit(&mleq->mleq_mtx);
2407 		return (B_FALSE);
2408 	}
2409 
2410 	if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
2411 		/*
2412 		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
2413 		 * eq_rele_dma
2414 		 */
2415 		mutex_exit(&mleq->mleq_mtx);
2416 		return (B_FALSE);
2417 	}
2418 	mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2419 	mleq->mleq_state |= MLXCX_EQ_ATTACHING;
2420 	mlxcx_arm_eq(mlxp, mleq);
2421 	mutex_exit(&mleq->mleq_mtx);
2422 
2423 	return (B_TRUE);
2424 }
2425 
2426 static void
mlxcx_eq_set_attached(mlxcx_t * mlxp)2427 mlxcx_eq_set_attached(mlxcx_t *mlxp)
2428 {
2429 	uint_t vec;
2430 	mlxcx_event_queue_t *mleq;
2431 
2432 	for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) {
2433 		mleq = &mlxp->mlx_eqs[vec];
2434 
2435 		mutex_enter(&mleq->mleq_mtx);
2436 		mleq->mleq_state &= ~MLXCX_EQ_ATTACHING;
2437 		mutex_exit(&mleq->mleq_mtx);
2438 	}
2439 }
2440 
2441 static boolean_t
mlxcx_setup_async_eqs(mlxcx_t * mlxp)2442 mlxcx_setup_async_eqs(mlxcx_t *mlxp)
2443 {
2444 	boolean_t ret;
2445 
2446 	ret = mlxcx_setup_eq(mlxp, 0,
2447 	    (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
2448 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2449 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2450 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2451 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2452 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2453 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2454 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2455 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2456 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2457 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2458 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2459 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2460 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));
2461 
2462 	if (ret)
2463 		mlxcx_cmd_eq_enable(mlxp);
2464 
2465 	return (ret);
2466 }
2467 
2468 int
mlxcx_cq_compare(const void * arg0,const void * arg1)2469 mlxcx_cq_compare(const void *arg0, const void *arg1)
2470 {
2471 	const mlxcx_completion_queue_t *left = arg0;
2472 	const mlxcx_completion_queue_t *right = arg1;
2473 
2474 	if (left->mlcq_num < right->mlcq_num) {
2475 		return (-1);
2476 	}
2477 	if (left->mlcq_num > right->mlcq_num) {
2478 		return (1);
2479 	}
2480 	return (0);
2481 }
2482 
2483 static boolean_t
mlxcx_setup_eqs(mlxcx_t * mlxp)2484 mlxcx_setup_eqs(mlxcx_t *mlxp)
2485 {
2486 	uint_t i;
2487 	mlxcx_event_queue_t *mleq;
2488 
2489 	ASSERT3S(mlxp->mlx_intr_count, >, 0);
2490 
2491 	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
2492 		mleq = &mlxp->mlx_eqs[i];
2493 		mutex_enter(&mleq->mleq_mtx);
2494 		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2495 			mutex_exit(&mleq->mleq_mtx);
2496 			return (B_FALSE);
2497 		}
2498 		mleq->mleq_uar = &mlxp->mlx_uar;
2499 		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2500 			/* mlxcx_teardown() will handle calling eq_rele_dma */
2501 			mutex_exit(&mleq->mleq_mtx);
2502 			return (B_FALSE);
2503 		}
2504 		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
2505 		    !mlxcx_cmd_set_int_mod(mlxp, i,
2506 		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
2507 			mutex_exit(&mleq->mleq_mtx);
2508 			return (B_FALSE);
2509 		}
2510 		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
2511 			mutex_exit(&mleq->mleq_mtx);
2512 			return (B_FALSE);
2513 		}
2514 		mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2515 		mlxcx_arm_eq(mlxp, mleq);
2516 		mutex_exit(&mleq->mleq_mtx);
2517 	}
2518 
2519 	mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
2520 
2521 	return (B_TRUE);
2522 }
2523 
2524 /*
2525  * A more recent ConnectX part will have the Port CApability Mask register.
2526  * Explore it and note things here.
2527  */
2528 static void
mlxcx_explore_pcam(mlxcx_t * mlxp,mlxcx_caps_t * c)2529 mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c)
2530 {
2531 	mlxcx_register_data_t data;
2532 	mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam;
2533 
2534 	ASSERT(c->mlc_pcam);
2535 	bzero(&data, sizeof (data));
2536 
2537 	/*
2538 	 * Okay, so we have access the the Ports CApability Mask (PCAM).
2539 	 * There are various things we need to check about it.
2540 	 */
2541 
2542 	VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
2543 	    MLXCX_REG_PCAM, &data));
2544 
2545 	/*
2546 	 * NOTE: These ASSERT()s may change in future mlxcx(4D) parts.
2547 	 * As of now, only 0 is valid, and 1-255 are reserved.  A future part
2548 	 * may return non-zero in these fields.
2549 	 */
2550 	ASSERT0(pcam->mlrd_pcam_feature_group);
2551 	ASSERT0(pcam->mlrd_pcam_access_reg_group);
2552 
2553 	c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low,
2554 	    MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED);
2555 }
2556 
2557 /*
2558  * Snapshot all of the hardware capabilities that we care about and then modify
2559  * the HCA capabilities to get things moving.
2560  */
2561 static boolean_t
mlxcx_init_caps(mlxcx_t * mlxp)2562 mlxcx_init_caps(mlxcx_t *mlxp)
2563 {
2564 	mlxcx_caps_t *c;
2565 
2566 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2567 
2568 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2569 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2570 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2571 	}
2572 
2573 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2574 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2575 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2576 	}
2577 
2578 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2579 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2580 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2581 	}
2582 
2583 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2584 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2585 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2586 	}
2587 
2588 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2589 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2590 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2591 	}
2592 
2593 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2594 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2595 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2596 	}
2597 
2598 	/*
2599 	 * Check the caps meet our requirements.
2600 	 */
2601 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2602 
2603 	if (gen->mlcap_general_log_pg_sz != 12) {
2604 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2605 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2606 		goto err;
2607 	}
2608 	if (gen->mlcap_general_cqe_version != 1) {
2609 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2610 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2611 		goto err;
2612 	}
2613 	if (gen->mlcap_general_port_type !=
2614 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2615 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2616 		goto err;
2617 	}
2618 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2619 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2620 
2621 	if (mlxp->mlx_type >= MLXCX_DEV_CX5 &&
2622 	    get_bit16(gen->mlcap_general_flags_c,
2623 	    MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) {
2624 		c->mlc_pcam = B_TRUE;
2625 	}
2626 
2627 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2628 
2629 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2630 	    MLXCX_ETH_CAP_CSUM_CAP);
2631 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2632 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2633 
2634 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2635 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2636 	if (c->mlc_max_lso_size == 1) {
2637 		c->mlc_max_lso_size = 0;
2638 		c->mlc_lso = B_FALSE;
2639 	} else {
2640 		c->mlc_lso = B_TRUE;
2641 	}
2642 
2643 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2644 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2645 
2646 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2647 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2648 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2649 		goto err;
2650 	}
2651 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2652 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2653 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2654 		    "flow table entries");
2655 		goto err;
2656 	}
2657 
2658 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2659 	    mlcap_flow_prop_log_max_ft_size;
2660 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2661 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2662 	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
2663 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
2664 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2665 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2666 
2667 	return (B_TRUE);
2668 
2669 err:
2670 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2671 	return (B_FALSE);
2672 }
2673 
2674 static int
mlxcx_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)2675 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2676 {
2677 	mlxcx_t *mlxp;
2678 
2679 	if (cmd != DDI_DETACH)
2680 		return (DDI_FAILURE);
2681 
2682 	mlxp = ddi_get_driver_private(dip);
2683 	if (mlxp == NULL) {
2684 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2685 		    "private data");
2686 		return (DDI_FAILURE);
2687 	}
2688 
2689 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2690 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2691 			return (DDI_FAILURE);
2692 		}
2693 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2694 	}
2695 
2696 	mlxcx_teardown(mlxp);
2697 	return (DDI_SUCCESS);
2698 }
2699 
2700 static size_t
mlxcx_calc_rx_ngroups(mlxcx_t * mlxp)2701 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2702 {
2703 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2704 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2705 	size_t tirlim, flowlim, gflowlim;
2706 
2707 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2708 	if (tirlim < ngroups) {
2709 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2710 		    "on number of TIRs available", tirlim);
2711 		ngroups = tirlim;
2712 	}
2713 
2714 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2715 	if (flowlim < ngroups) {
2716 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2717 		    "on max size of RX flow tables", flowlim);
2718 		ngroups = flowlim;
2719 	}
2720 
2721 	/*
2722 	 * Restrict the number of groups not to exceed the max flow
2723 	 * table number from the devices capabilities.
2724 	 * There is one root table entry per port and 2 entries per
2725 	 * group.
2726 	 */
2727 	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
2728 	if (flowlim < ngroups) {
2729 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2730 		    "on max number of RX flow tables",
2731 		    flowlim);
2732 		ngroups = flowlim;
2733 	}
2734 
2735 	do {
2736 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2737 		if (gflowlim < ngroups) {
2738 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2739 			    "based on max total RX flows", gflowlim);
2740 			--ngroups;
2741 		}
2742 	} while (gflowlim < ngroups);
2743 
2744 	return (ngroups);
2745 }
2746 
2747 static int
mlxcx_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)2748 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2749 {
2750 	mlxcx_t *mlxp;
2751 	char tq_name[TASKQ_NAMELEN];
2752 	uint_t i;
2753 	int inst, ret;
2754 
2755 	if (cmd != DDI_ATTACH)
2756 		return (DDI_FAILURE);
2757 
2758 	inst = ddi_get_instance(dip);
2759 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2760 	if (ret != 0)
2761 		return (ret);
2762 
2763 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2764 	if (mlxp == NULL)
2765 		return (DDI_FAILURE);
2766 	mlxp->mlx_dip = dip;
2767 	mlxp->mlx_inst = inst;
2768 	ddi_set_driver_private(dip, mlxp);
2769 
2770 	mlxcx_load_props(mlxp);
2771 
2772 	mlxcx_fm_init(mlxp);
2773 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2774 
2775 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2776 	    DDI_SUCCESS) {
2777 		mlxcx_warn(mlxp, "failed to initial PCI config space");
2778 		goto err;
2779 	}
2780 	mlxcx_get_model(mlxp);
2781 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2782 
2783 	if (!mlxcx_regs_map(mlxp)) {
2784 		goto err;
2785 	}
2786 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2787 
2788 	if (!mlxcx_cmd_queue_init(mlxp)) {
2789 		goto err;
2790 	}
2791 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2792 
2793 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2794 		goto err;
2795 	}
2796 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2797 
2798 	if (!mlxcx_check_issi(mlxp)) {
2799 		goto err;
2800 	}
2801 
2802 	/*
2803 	 * We have to get our interrupts now so we know what priority to
2804 	 * create pagemtx with.
2805 	 */
2806 	if (!mlxcx_intr_setup(mlxp)) {
2807 		goto err;
2808 	}
2809 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2810 
2811 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2812 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2813 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2814 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2815 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2816 
2817 	/*
2818 	 * Taskq for asynchronous events which may interact with the HCA
2819 	 * via the command interface. Single threaded FIFO.
2820 	 */
2821 	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
2822 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
2823 	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
2824 	    TASKQ_PREPOPULATE);
2825 	/*
2826 	 * Initialize any pre-allocated taskq param structs.
2827 	 */
2828 	for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
2829 		mlxp->mlx_npages_req[i].mla_mlx = mlxp;
2830 		mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
2831 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
2832 	}
2833 	mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
2834 
2835 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2836 		goto err;
2837 	}
2838 
2839 	if (!mlxcx_init_caps(mlxp)) {
2840 		goto err;
2841 	}
2842 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2843 
2844 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2845 		goto err;
2846 	}
2847 
2848 	if (!mlxcx_cmd_init_hca(mlxp)) {
2849 		goto err;
2850 	}
2851 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2852 
2853 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2854 		goto err;
2855 	}
2856 
2857 	if (mlxp->mlx_caps->mlc_pcam) {
2858 		mlxcx_explore_pcam(mlxp, mlxp->mlx_caps);
2859 	}
2860 
2861 	/*
2862 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2863 	 * doorbells.
2864 	 */
2865 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2866 		goto err;
2867 	}
2868 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2869 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2870 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2871 	}
2872 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2873 
2874 	/*
2875 	 * Set up asynchronous event queue which handles control type events
2876 	 * like PAGE_REQUEST and CMD completion events.
2877 	 *
2878 	 * This will enable and arm the interrupt on EQ 0. Note that only page
2879 	 * reqs and cmd completions will be handled until we call
2880 	 * mlxcx_eq_set_attached further down (this way we don't need an extra
2881 	 * set of locks over the mlxcx_t sub-structs not allocated yet)
2882 	 */
2883 	if (!mlxcx_setup_async_eqs(mlxp)) {
2884 		goto err;
2885 	}
2886 
2887 	/*
2888 	 * Allocate a protection and transport domain. These don't really do
2889 	 * anything for us (they're IB concepts), but we need to give their
2890 	 * ID numbers in other commands.
2891 	 */
2892 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2893 		goto err;
2894 	}
2895 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2896 		goto err;
2897 	}
2898 	/*
2899 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2900 	 * work queue entries, rather than having to mess with the NIC's
2901 	 * internal MMU.
2902 	 */
2903 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2904 		goto err;
2905 	}
2906 
2907 	/*
2908 	 * Query our port information and current state, populate the
2909 	 * mlxcx_port_t structs.
2910 	 *
2911 	 * This also sets up the root flow tables and flow groups.
2912 	 */
2913 	if (!mlxcx_setup_ports(mlxp)) {
2914 		goto err;
2915 	}
2916 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2917 
2918 	mlxcx_load_model_props(mlxp);
2919 
2920 	/*
2921 	 * Set up, enable and arm the rest of the interrupt EQs which will
2922 	 * service events from CQs.
2923 	 *
2924 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2925 	 * cleaned up.
2926 	 */
2927 	if (!mlxcx_setup_eqs(mlxp)) {
2928 		goto err;
2929 	}
2930 
2931 	/* Completion queues */
2932 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2933 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2934 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2935 
2936 	/* Work queues (send queues, receive queues) */
2937 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2938 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2939 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2940 
2941 	/*
2942 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2943 	 * "groups" we advertise to MAC.
2944 	 */
2945 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2946 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2947 	    sizeof (mlxcx_ring_group_t);
2948 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2949 
2950 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2951 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2952 	    sizeof (mlxcx_ring_group_t);
2953 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2954 
2955 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2956 
2957 	/*
2958 	 * Sets up the free/busy buffers list for keeping track of packet
2959 	 * buffers.
2960 	 */
2961 	if (!mlxcx_setup_bufs(mlxp))
2962 		goto err;
2963 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2964 
2965 	/*
2966 	 * Before we tell MAC about our rings/groups, we need to do enough
2967 	 * setup on them to be sure about the numbers and configuration that
2968 	 * we have. This will do basically everything short of allocating
2969 	 * packet buffers and starting the rings up.
2970 	 */
2971 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2972 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2973 			goto err;
2974 	}
2975 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2976 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2977 			goto err;
2978 	}
2979 
2980 	/*
2981 	 * Set up periodic fault check timers which check the queue states,
2982 	 * set up should be after all the queues have been initialized and
2983 	 * consequently the teardown of timers must happen before
2984 	 * queue teardown.
2985 	 */
2986 	if (!mlxcx_setup_checktimers(mlxp)) {
2987 		goto err;
2988 	}
2989 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2990 
2991 	/*
2992 	 * Some devices may not have a working temperature sensor; however,
2993 	 * there isn't a great way for us to know. We shouldn't fail attach if
2994 	 * this doesn't work.
2995 	 */
2996 	if (mlxcx_setup_sensors(mlxp)) {
2997 		mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
2998 	}
2999 
3000 	/*
3001 	 * Finally, tell MAC that we exist!
3002 	 */
3003 	if (!mlxcx_register_mac(mlxp)) {
3004 		goto err;
3005 	}
3006 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
3007 
3008 	/*
3009 	 * This tells the interrupt handlers they can start processing events
3010 	 * other than cmd completions and page requests.
3011 	 */
3012 	mlxcx_eq_set_attached(mlxp);
3013 
3014 	return (DDI_SUCCESS);
3015 
3016 err:
3017 	mlxcx_teardown(mlxp);
3018 	return (DDI_FAILURE);
3019 }
3020 
3021 static struct cb_ops mlxcx_cb_ops = {
3022 	.cb_open = nulldev,
3023 	.cb_close = nulldev,
3024 	.cb_strategy = nodev,
3025 	.cb_print = nodev,
3026 	.cb_dump = nodev,
3027 	.cb_read = nodev,
3028 	.cb_write = nodev,
3029 	.cb_ioctl = nodev,
3030 	.cb_devmap = nodev,
3031 	.cb_mmap = nodev,
3032 	.cb_segmap = nodev,
3033 	.cb_chpoll = nochpoll,
3034 	.cb_prop_op = ddi_prop_op,
3035 	.cb_flag = D_MP,
3036 	.cb_rev = CB_REV,
3037 	.cb_aread = nodev,
3038 	.cb_awrite = nodev
3039 };
3040 
3041 static struct dev_ops mlxcx_dev_ops = {
3042 	.devo_rev = DEVO_REV,
3043 	.devo_refcnt = 0,
3044 	.devo_getinfo = NULL,
3045 	.devo_identify = nulldev,
3046 	.devo_probe = nulldev,
3047 	.devo_attach = mlxcx_attach,
3048 	.devo_detach = mlxcx_detach,
3049 	.devo_reset = nodev,
3050 	.devo_quiesce = ddi_quiesce_not_supported,
3051 	.devo_cb_ops = &mlxcx_cb_ops
3052 };
3053 
3054 static struct modldrv mlxcx_modldrv = {
3055 	.drv_modops = &mod_driverops,
3056 	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
3057 	.drv_dev_ops = &mlxcx_dev_ops
3058 };
3059 
3060 static struct modlinkage mlxcx_modlinkage = {
3061 	.ml_rev = MODREV_1,
3062 	.ml_linkage = { &mlxcx_modldrv, NULL }
3063 };
3064 
3065 int
_init(void)3066 _init(void)
3067 {
3068 	int ret;
3069 
3070 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
3071 	if (ret != 0) {
3072 		return (ret);
3073 	}
3074 
3075 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
3076 
3077 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3078 		mac_fini_ops(&mlxcx_dev_ops);
3079 		ddi_soft_state_fini(&mlxcx_softstate);
3080 		return (ret);
3081 	}
3082 
3083 	return (DDI_SUCCESS);
3084 }
3085 
3086 int
_info(struct modinfo * modinfop)3087 _info(struct modinfo *modinfop)
3088 {
3089 	return (mod_info(&mlxcx_modlinkage, modinfop));
3090 }
3091 
3092 int
_fini(void)3093 _fini(void)
3094 {
3095 	int ret;
3096 
3097 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3098 		return (ret);
3099 	}
3100 
3101 	mac_fini_ops(&mlxcx_dev_ops);
3102 
3103 	ddi_soft_state_fini(&mlxcx_softstate);
3104 
3105 	return (DDI_SUCCESS);
3106 }
3107