1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2021 Oxide Computer Company
14 */
15
16 #include "ena_hw.h"
17 #include "ena.h"
18
19 /*
20 * Elastic Network Adapter (ENA) Driver
21 * ------------------------------------
22 *
23 * The ena driver provides support for the AWS ENA device, also
24 * referred to as their "enhanced networking". This device is present
25 * on "Nitro"-based instances. It presents itself with the following
26 * PCI Vendor/Device IDs
27 *
28 * o 1d0f:0ec2 -- ENA PF
29 * o 1d0f:1ec2 -- ENA PF (Reserved)
30 * o 1d0f:ec20 -- ENA VF
31 * o 1d0f:ec21 -- ENA VF (Reserved)
32 *
33 * This driver provides support for only the essential features needed
34 * to drive traffic on an ENA device. Support for the following
35 * features IS NOT currently implemented.
36 *
37 * o Admin Queue Interrupts: queue completion events are always polled
38 * o AENQ keep alive
39 * o FMA
40 * o Rx checksum offloads
41 * o Tx checksum offloads
42 * o Tx DMA bind (borrow buffers)
43 * o Rx DMA bind (loaned buffers)
44 * o TSO
45 * o RSS
46 * o Low Latency Queues (LLQ)
47  *     o Support for different Tx completion policies
48 * o More controlled Tx recycling and Rx refill
49 *
50 * Even without these features the ena driver should perform
51 * reasonably well.
52 *
53 * Driver vs. Hardware Types
54 * -------------------------
55 *
56 * To properly communicate with the ENA device the driver must
57 * populate memory (registers and buffers) with specific types. These
58 * types are defined by the device and are found under the "common"
59 * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have
60 * simplified this a bit by defining all device-specific types in the
61 * ena_hw.h file. Furthermore, all device-specific types are given an
62 * "enahw" prefix. This makes it clear when we are dealing with a
63 * device type and when we are dealing with a driver type.
64 *
65 * [1]: https://github.com/amzn/amzn-drivers
66 *
67 * Groups, Rings (Queues), and Interrupts
68 * --------------------------------------
69 *
70 * The ENA device presents one mac group. This single mac group
71  * represents the single unicast address assigned to this device in
72  * your AWS instance. The ENA device presents no option for
73 * configuring additional MAC addresses, multicast, or promisc mode --
74 * you receive only what AWS wants you to receive.
75 *
76 * This single mac group may have one or more rings. The ENA driver
77 * refers to rings as queues, for no special reason other than it was
78 * the dominant language in the Linux and FreeBSD drivers, and it
79 * spilled over into this port. The upper bound on number of queues is
80  * presented by the device. However, we don't just go with whatever
81  * number of queues the device reports; rather, we limit the queues
82  * based on other factors such as an absolute maximum, the number of
83  * online CPUs, and the number of available interrupts. The upper
84  * bound is calculated by ena_set_max_io_queues(), and that value is
85  * used and possibly further restricted in ena_attach_intr_alloc().
86  * At that point, ultimately, it is the number of available interrupts
87  * (minus one for the admin queue) that determines the number of
88  * queues: one Tx and one Rx on each I/O interrupt.
89 *
90 * NOTE: Perhaps it is overly restrictive to limit the number of
91 * queues to the number of I/O interrupts. Something worth considering
92  * on larger instances if they present far fewer interrupts than they
93 * do queues + CPUs.
94 *
95 * The ENA device presents MSI-X interrupts only. During attach the
96 * driver queries the number of available interrupts and sets aside
97 * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N).
98 * This means that a Tx/Rx queue at index 0 will map to vector 1, and
99 * so on.
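 *
 * An illustrative sketch of that mapping (simplified; the real
 * assignments are made in ena_attach_alloc_rxqs() and
 * ena_attach_alloc_txqs() further down in this file):
 *
 *	nqueues = nintrs - 1;		(vector 0 is admin/AENQ)
 *	for (i = 0; i < nqueues; i++) {
 *		rxq[i].er_intr_vector = i + 1;
 *		txq[i].et_intr_vector = i + 1;
 *	}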
100 *
101 * NOTE: The ENA driver currently doesn't make use of the Admin Queue
102  * interrupt. This interrupt is used to notify the driver that a
103  * command response is ready. The ENA driver always polls the Admin
104 * Queue for responses.
105 *
106 * Tx Queue Workings
107 * -----------------
108 *
109 * A single Tx queue (ena_txq_t) is made up of one submission queue
110 * (SQ) and its paired completion queue (CQ). These two queues form a
111 * logical descriptor ring which is used to send packets out of the
112 * device -- where each SQ entry describes the packet to be sent
113 * (enahw_tx_desc_t) and each CQ entry describes the result of sending
114 * a packet (enahw_tx_cdesc_t). For this to work the host and device
115 * must agree on which descriptors are currently owned by the host
116 * (free for sending) and which are owned by the device (pending
117 * device completion). This state is tracked on the host side via head
118 * and tail indexes along with a phase value.
119 *
120 * The head and tail values represent the head and tail of the FIFO
121  * queue of pending packets -- the next packet to be sent by the
122  * device is at head, and all descriptors up to tail are ready for
123 * sending. The phase allows the host to determine which CQ
124 * descriptors represent completed events when using per-SQ completion
125 * events (as opposed to queue head pointer updates). As the queues
126 * represent a logical ring buffer, the phase must alternate on
127 * wrap-around. The device initializes the phase to zero, and the host
128 * starts with a phase of 1. The first packet descriptor writes, and
129 * their corresponding completions, are indicated with a phase of 1.
130 *
131 *
132 * For example, the diagram below represents the SQ/CQ state after the
133 * first 6 packets have been sent by the host and 2 of them have been
134 * completed by the device (and these completions have been processed
135 * by the driver). In this state the host could send 4 more packets
136 * before needing to wait on completion events.
137 *
138 *
139 * +---+---+---+---+---+---+---+---+
140 * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | phase = 1
141 * +---+---+---+---+---+---+---+---+
142 * ^
143 * |
144 * tail
145 * head
146 * |
147 * v
148 * +---+---+---+---+---+---+---+---+
149 * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | phase = 1
150 * +---+---+---+---+---+---+---+---+
151 *
152 *
153 * The next diagram shows how the state changes as 5 more packets are
154 * sent (for a total of 11) and 7 more are completed (for a total of
155 * 9). Notice that as the SQ and CQ have wrapped around their phases
156 * have been complemented. In this state the host could send 6 more
157 * packets before needing to wait on completion events.
158 *
159 * +---+---+---+---+---+---+---+---+
160 * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | phase = 0
161 * +---+---+---+---+---+---+---+---+
162 * ^
163 * |
164 * tail
165 * head
166 * |
167 * v
168 * +---+---+---+---+---+---+---+---+
169 * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | phase = 0
170 * +---+---+---+---+---+---+---+---+
171 *
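 * To make the phase handling concrete, a simplified sketch of how the
 * driver might drain a Tx CQ (illustrative only -- the actual field
 * names and logic live in the Tx code and differ in detail):
 *
 *	while (completion_phase(cdesc) == txq->cq_phase) {
 *		process the completion, recycle its TCB
 *		txq->cq_head_idx++;
 *		head_mod = txq->cq_head_idx % txq->cq_num_descs;
 *		if (head_mod == 0)
 *			txq->cq_phase ^= 1;	(flip phase on wrap)
 *		cdesc = &cq_descs[head_mod];
 *	}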
172 *
173 * Currently, all packets are copied for Tx. At ring start we allocate
174  * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a
175  * DMA buffer associated with it, and each buffer is large enough to
176 * hold the MTU. Therefore, Tx descriptors and TCBs currently have a
177 * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to
178 * the TCB's DMA buffer, and a new descriptor is written to the SQ
179 * describing said TCB buffer. If and when we add more advanced
180 * features like DMA binding of mblks and TSO, this 1:1 guarantee will
181 * no longer hold.
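 *
 * A rough sketch of that copy-based send path (names here are
 * illustrative, not the driver's exact identifiers):
 *
 *	tcb = &txq->tcbs[tail_mod];
 *	copy the mblk's data into the TCB's DMA buffer
 *	fill in the enahw_tx_desc_t at sq_descs[tail_mod],
 *	    including the current phase bit
 *	txq->sq_tail_idx++;
 *	write the new tail to the SQ doorbell register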
182 *
183 * Rx Queue Workings
184 * -----------------
185 *
186 * In terms of implementing the logical descriptor ring, the Rx queues
187 * are very much like the Tx queues. There is a paired SQ and CQ for
188 * each logical ring. The difference is that in Rx the SQ is for
189 * handing buffers to the device to fill, and the CQ is for describing
190 * the contents of those buffers for a given received frame. At Rx
191 * ring start we allocate a Rx Control Buffer (RCB) for each
192 * descriptor in the ring. Each RCB has a DMA buffer associated with
193  * it, and each buffer is large enough to hold the MTU. For each
194 * received frame we copy the contents out of the RCB and into its own
195 * mblk, immediately returning the RCB for reuse. As with Tx, this
196 * gives us a simple 1:1 mapping currently, but if more advanced
197 * features are implemented later this could change.
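 *
 * A simplified sketch of that Rx path (again, names are illustrative):
 *
 *	while (completion_phase(cdesc) == rxq->cq_phase) {
 *		rcb = &rxq->rcbs[head_mod];
 *		mp = allocb(len, 0);
 *		bcopy(rcb's DMA buffer, mp->b_wptr, len);
 *		mp->b_wptr += len;
 *		post the RCB's buffer back on the SQ for reuse
 *		advance head_mod, flipping the phase on wrap-around
 *	}
 *	pass the mblk chain up via mac_rx_ring()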
198 *
199 * Asynchronous Event Notification Queue (AENQ)
200 * --------------------------------------------
201 *
202 * Each ENA device comes with a mechanism for sending out-of-band
203 * notifications to the driver. This includes events like link state
204 * changes, fatal errors, and a watchdog/keep alive signal. The AENQ
205 * delivery mechanism is via interrupt, handled by the ena_aenq_work()
206 * function, which dispatches via the eaenq_hdlrs table. If no handler
207 * is registered, the ena_aenq_default_hdlr() handler is used. A given
208 * device may not support all the different event types
209  * (enahw_aenq_groups_t), and the driver may choose to enable a subset
210  * of the supported events. During attach we call ena_setup_aenq() to
211  * negotiate the supported/enabled events. The enabled groups are
212  * stored in ena_aenq_enabled_groups.
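 *
 * For example, the dispatch done by ena_aenq_work() below boils down
 * to (simplified):
 *
 *	hdlr = aenq->eaenq_hdlrs[desc->ead_group];
 *	hdlr(ena, desc);
 *
 * where ena_aenq_set_def_hdlrs() first points every entry of
 * eaenq_hdlrs at ena_aenq_default_hdlr(), and interesting groups
 * (e.g. link change) are then overridden with specific handlers.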
213 *
214 * Queues and Unsigned Wraparound
215 * ------------------------------
216 *
217 * All the queues use a uint16_t value as their head/tail values, e.g.
218 * the Rx queue's er_cq_head_idx value. You might notice that we only
219 * ever increment these values, letting them perform implicit unsigned
220 * integer wraparound. This is intended. This is the same behavior as
221 * the common code, and seems to be what the hardware expects. Of
222 * course, when accessing our own descriptor arrays we must make sure
223 * to first perform a modulo of this value or risk running off into
224 * space.
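 *
 * For example, with a ring of 128 descriptors the head index may run
 * 0, 1, ..., 65535 and wrap back to 0, while the descriptor actually
 * referenced is always:
 *
 *	desc = &descs[head_idx % num_descs];
 *
 * (or the equivalent "head_idx & (num_descs - 1)" when the count is a
 * power of two, as ena_aenq_work() does for the AENQ).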
225 *
226 * Attach Sequencing
227 * -----------------
228 *
229 * Most drivers implement their attach/detach/cleanup functions as a
230 * sequential stream of function calls used to allocate and initialize
231 * resources in an order determined by the device's programming manual
232 * combined with any requirements imposed by the kernel and its
233 * relevant modules. These functions can become quite long. It is
234 * often hard to see the order in which steps are taken, and even
235 * harder to tell if detach/cleanup undoes them in the correct order,
236 * or even if it undoes them at all! The only sure way to understand
237 * the flow is to take good notes while closely inspecting each line
238 * of code. Even then, it's easy for attach and detach to get out of
239 * sync.
240 *
241 * Some more recent drivers have improved on this situation by using a
242 * bit vector to track the sequence of events in attach/detach. Each
243  * bit is declared as an enum value, in the same order it is
244 * expected attach would run, and thus detach would run in the exact
245 * opposite order. This has three main benefits:
246 *
247 * 1. It makes it easier to determine sequence order at a
248 * glance.
249 *
250 * 2. It gives a better idea of what state the device is in during
251 * debugging (the sequence bit vector is kept with the instance
252 * state).
253 *
254 * 3. The detach function can verify that all sequence bits are
255 * cleared, indicating that everything done in attach was
256 * successfully undone.
257 *
258 * These are great improvements. However, the attach/detach functions
259 * can still become unruly, and there is still no guarantee that
260 * detach is done in opposite order of attach (this is not always
261 * strictly required, but is probably the best way to write detach).
262 * There is still a lot of boilerplate and chance for programmer
263 * error.
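 *
 * As a purely hypothetical sketch of that bit vector style (not code
 * from this or any particular driver):
 *
 *	typedef enum {
 *		XX_ATTACH_PCI	= 1 << 0,
 *		XX_ATTACH_REGS	= 1 << 1,
 *		XX_ATTACH_INTR	= 1 << 2,
 *	} xx_attach_state_t;
 *
 *	attach:  state |= XX_ATTACH_REGS;
 *	detach:  if (state & XX_ATTACH_REGS) {
 *			cleanup_regs();
 *			state &= ~XX_ATTACH_REGS;
 *		 }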
264 *
265 * The ena driver takes the sequence idea a bit further, creating a
266 * descriptor table of the attach sequence (ena_attach_tbl). This
267 * table is used by attach/detach to generically, declaratively, and
268  * programmatically enforce the precise sequence order and verify that
269 * anything that is done is undone. This provides several benefits:
270 *
271 * o Correct order is enforced implicitly by the descriptor table.
272  *   It is impossible for the detach sequence to run in any order
273  *   other than the reverse of the attach order.
274 *
275 * o It is obvious what the precise attach sequence is. While the
276  *   bit vector enum helps a lot with this, it doesn't prevent
277  *   programmer error. With the sequence defined as a declarative
278  *   table, it is easy for the programmer to see the order and
279  *   know it's followed exactly.
280 *
281 * o It is impossible to modify the attach sequence without also
282 * specifying a callback for its dual in the detach sequence.
283 *
284 * o Common and repetitive code like error checking, logging, and bit
285 * vector modification is eliminated and centralized, again
286 * reducing the chance of programmer error.
287 *
288 * The ena attach sequence is defined under ena_attach_seq_t. The
289 * descriptor table is defined under ena_attach_tbl.
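 *
 * In rough outline (simplified from ena_attach() and ena_cleanup()
 * below), the table is consumed like so:
 *
 *	attach:
 *		for (i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
 *			if (!ena_attach_tbl[i].ead_attach_fn(ena))
 *				fail, running cleanup of prior steps;
 *			ena->ena_attach_seq++;
 *		}
 *
 *	cleanup:
 *		while (ena->ena_attach_seq > 0) {
 *			idx = ena->ena_attach_seq - 1;
 *			ena_attach_tbl[idx].ead_cleanup_fn(ena);
 *			ena->ena_attach_seq--;
 *		}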
290 */
291
292 /*
293 * These are some basic data layout invariants on which development
294  * assumptions were made.
295 */
296 CTASSERT(sizeof (enahw_aenq_desc_t) == 64);
297 /* TODO: Why doesn't this work? */
298 /* CTASSERT(sizeof (enahw_tx_data_desc_t) == 64); */
299 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t));
300 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t));
301 CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t));
302 /*
303 * We add this here as an extra safety check to make sure that any
304 * addition to the AENQ group enum also updates the groups array num
305 * value.
306 */
307 CTASSERT(ENAHW_AENQ_GROUPS_ARR_NUM == 6);
308
309 /*
310  * Amazon does not specify the endianness of the ENA device. We assume
311 * it's the same as the bus, and we assume the CPU/bus is always
312 * little endian.
313 */
314 #ifdef _BIG_ENDIAN
315 #error "ENA driver is little-endian only"
316 #endif
317
318 /*
319 * These values are used to communicate the driver version to the AWS
320 * hypervisor via the ena_set_host_info() function. We don't know what
321 * exactly AWS does with this info, but it's fairly safe to assume
322 * it's used solely for debug/informational purposes. The Linux driver
323 * updates these values frequently as bugs are fixed and features are
324 * added.
325 */
326 #define ENA_DRV_VER_MAJOR 1
327 #define ENA_DRV_VER_MINOR 0
328 #define ENA_DRV_VER_SUBMINOR 0
329
330 uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT;
331
332 /*
333 * Log an error message. We leave the destination (console or system
334  * log) up to the caller.
335 */
336 void
337 ena_err(const ena_t *ena, const char *fmt, ...)
338 {
339 va_list ap;
340
341 va_start(ap, fmt);
342 if (ena != NULL && ena->ena_dip != NULL) {
343 vdev_err(ena->ena_dip, CE_WARN, fmt, ap);
344 } else {
345 vcmn_err(CE_WARN, fmt, ap);
346 }
347 va_end(ap);
348 }
349
350 /*
351 * Set this to B_TRUE to enable debug messages.
352 */
353 boolean_t ena_debug = B_FALSE;
354
355 /*
356 * Log a debug message. We force all debug messages to go to the
357 * system log.
358 */
359 void
360 ena_dbg(const ena_t *ena, const char *fmt, ...)
361 {
362 va_list ap;
363
364 if (ena_debug) {
365 char msg[1024];
366
367 va_start(ap, fmt);
368 (void) vsnprintf(msg, sizeof (msg), fmt, ap);
369 va_end(ap);
370
371 if (ena != NULL && ena->ena_dip != NULL) {
372 dev_err(ena->ena_dip, CE_NOTE, "!%s", msg);
373 } else {
374 cmn_err(CE_NOTE, "!%s", msg);
375 }
376 }
377 }
378
379 ena_aenq_grpstr_t ena_groups_str[ENAHW_AENQ_GROUPS_ARR_NUM] = {
380 { .eag_type = ENAHW_AENQ_GROUP_LINK_CHANGE, .eag_str = "LINK CHANGE" },
381 { .eag_type = ENAHW_AENQ_GROUP_FATAL_ERROR, .eag_str = "FATAL ERROR" },
382 { .eag_type = ENAHW_AENQ_GROUP_WARNING, .eag_str = "WARNING" },
383 {
384 .eag_type = ENAHW_AENQ_GROUP_NOTIFICATION,
385 .eag_str = "NOTIFICATION"
386 },
387 { .eag_type = ENAHW_AENQ_GROUP_KEEP_ALIVE, .eag_str = "KEEP ALIVE" },
388 {
389 .eag_type = ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES,
390 .eag_str = "REFRESH CAPABILITIES"
391 },
392 };
393
394 void
395 ena_aenq_work(ena_t *ena)
396 {
397 ena_aenq_t *aenq = &ena->ena_aenq;
398 uint16_t head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1);
399 boolean_t processed = B_FALSE;
400 enahw_aenq_desc_t *desc = &aenq->eaenq_descs[head_mod];
401
402 	ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORKERNEL);
403
404 	while (ENAHW_AENQ_DESC_PHASE(desc) == aenq->eaenq_phase) {
405 		ena_aenq_hdlr_t hdlr;
406 		uint64_t ts = ((uint64_t)desc->ead_ts_high << 32) |
407 		    (uint64_t)desc->ead_ts_low;
408
409 ASSERT3U(desc->ead_group, <, ENAHW_AENQ_GROUPS_ARR_NUM);
410 processed = B_TRUE;
411 ena_dbg(ena, "AENQ Group: (0x%x) %s Syndrome: 0x%x ts: %" PRIu64
412 " us", desc->ead_group,
413 ena_groups_str[desc->ead_group].eag_str, desc->ead_syndrome,
414 ts);
415
416 hdlr = ena->ena_aenq.eaenq_hdlrs[desc->ead_group];
417 hdlr(ena, desc);
418
419 aenq->eaenq_head++;
420 head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1);
421
422 if (head_mod == 0) {
423 aenq->eaenq_phase = !aenq->eaenq_phase;
424 }
425
426 desc = &aenq->eaenq_descs[head_mod];
427 }
428
429 if (processed) {
430 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
431 aenq->eaenq_head);
432 }
433 }
434
435 /*
436  * Used for attach sequence entries which perform no resource
437  * allocation (or global state modification) and thus require no
438  * subsequent deallocation.
439 */
440 static void
441 ena_no_cleanup(ena_t *ena)
442 {
443 }
444
445 static boolean_t
446 ena_attach_pci(ena_t *ena)
447 {
448 ddi_acc_handle_t hdl;
449
450 if (pci_config_setup(ena->ena_dip, &hdl) != 0) {
451 return (B_FALSE);
452 }
453
454 ena->ena_pci_hdl = hdl;
455 ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID);
456 ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID);
457 ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID);
458 ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID);
459 ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID);
460 ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x",
461 ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev,
462 ena->ena_pci_svid, ena->ena_pci_sdid);
463
464 return (B_TRUE);
465 }
466
467 static void
468 ena_cleanup_pci(ena_t *ena)
469 {
470 pci_config_teardown(&ena->ena_pci_hdl);
471 }
472
473 static void
474 ena_cleanup_regs_map(ena_t *ena)
475 {
476 ddi_regs_map_free(&ena->ena_reg_hdl);
477 }
478
479 static boolean_t
480 ena_attach_regs_map(ena_t *ena)
481 {
482 int ret = 0;
483
484 if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) !=
485 DDI_SUCCESS) {
486 ena_err(ena, "failed to get register set %d size",
487 ENA_REG_NUMBER);
488 return (B_FALSE);
489 }
490
491 ena_dbg(ena, "register size: %ld", ena->ena_reg_size);
492 bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr));
493 ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
494 ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
495 ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
496
497 /*
498 * This function can return several different failure values,
499 * so we make sure to capture its return value for the purpose
500 * of logging.
501 */
502 ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER,
503 &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr,
504 &ena->ena_reg_hdl);
505
506 if (ret != DDI_SUCCESS) {
507 ena_err(ena, "failed to map register set %d: %d",
508 ENA_REG_NUMBER, ret);
509 return (B_FALSE);
510 }
511
512 ena_dbg(ena, "registers mapped to base: 0x%p",
513 (void *)ena->ena_reg_base);
514
515 return (B_TRUE);
516 }
517
518 /*
519 * Free any resources related to the admin submission queue.
520 */
521 static void
522 ena_admin_sq_free(ena_t *ena)
523 {
524 ena_dma_free(&ena->ena_aq.ea_sq.eas_dma);
525 }
526
527 /*
528 * Initialize the admin submission queue.
529 */
530 static boolean_t
531 ena_admin_sq_init(ena_t *ena)
532 {
533 ena_adminq_t *aq = &ena->ena_aq;
534 ena_dma_buf_t *dma = &aq->ea_sq.eas_dma;
535 size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries);
536 uint32_t addr_low, addr_high, wval;
537 ena_dma_conf_t conf = {
538 .edc_size = size,
539 .edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT,
540 .edc_sgl = 1,
541 .edc_endian = DDI_NEVERSWAP_ACC,
542 .edc_stream = B_FALSE,
543 };
544
545 if (!ena_dma_alloc(ena, dma, &conf, size)) {
546 ena_err(ena, "failed to allocate DMA for Admin SQ");
547 return (B_FALSE);
548 }
549
550 aq->ea_sq.eas_entries = (void *)dma->edb_va;
551 aq->ea_sq.eas_tail = 0;
552 aq->ea_sq.eas_phase = 1;
553 aq->ea_sq.eas_dbaddr =
554 (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB);
555 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
556 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
557 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
558 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low);
559 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high);
560 wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) |
561 ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries));
562 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval);
563 return (B_TRUE);
564 }
565
566 /*
567 * Free any resources related to the admin completion queue.
568 */
569 static void
570 ena_admin_cq_free(ena_t *ena)
571 {
572 ena_dma_free(&ena->ena_aq.ea_cq.eac_dma);
573 }
574
575 /*
576 * Initialize the admin completion queue.
577 */
578 static boolean_t
579 ena_admin_cq_init(ena_t *ena)
580 {
581 ena_adminq_t *aq = &ena->ena_aq;
582 ena_dma_buf_t *dma = &aq->ea_cq.eac_dma;
583 size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries);
584 uint32_t addr_low, addr_high, wval;
585 ena_dma_conf_t conf = {
586 .edc_size = size,
587 .edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT,
588 .edc_sgl = 1,
589 .edc_endian = DDI_NEVERSWAP_ACC,
590 .edc_stream = B_FALSE,
591 };
592
593 if (!ena_dma_alloc(ena, dma, &conf, size)) {
594 ena_err(ena, "failed to allocate DMA for Admin CQ");
595 return (B_FALSE);
596 }
597
598 aq->ea_cq.eac_entries = (void *)dma->edb_va;
599 aq->ea_cq.eac_head = 0;
600 aq->ea_cq.eac_phase = 1;
601 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
602 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
603 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
604 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low);
605 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high);
606 wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) |
607 ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries));
608 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval);
609 return (B_TRUE);
610 }
611
612 static void
613 ena_aenq_default_hdlr(void *data, enahw_aenq_desc_t *desc)
614 {
615 ena_t *ena = data;
616
617 ena->ena_aenq_stat.eaes_default.value.ui64++;
618 ena_dbg(ena, "unimplemented handler for aenq group: %s",
619 ena_groups_str[desc->ead_group].eag_str);
620 }
621
622 static void
623 ena_aenq_link_change_hdlr(void *data, enahw_aenq_desc_t *desc)
624 {
625 ena_t *ena = data;
626 boolean_t is_up = (desc->ead_payload.link_change.flags &
627 ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0;
628
629 /*
630 	 * The interrupts are not enabled until after we register mac,
631 * so the mac handle should be valid.
632 */
633 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_MAC_REGISTER);
634 ena->ena_aenq_stat.eaes_link_change.value.ui64++;
635
636 mutex_enter(&ena->ena_lock);
637
638 /*
639 * Notify mac only on an actual change in status.
640 */
641 if (ena->ena_link_up != is_up) {
642 if (is_up) {
643 mac_link_update(ena->ena_mh, LINK_STATE_UP);
644 } else {
645 mac_link_update(ena->ena_mh, LINK_STATE_DOWN);
646 }
647 }
648
649 ena->ena_link_up = is_up;
650
651 mutex_exit(&ena->ena_lock);
652 }
653
654 /*
655 * Free any resources related to the Async Event Notification Queue.
656 */
657 static void
658 ena_aenq_free(ena_t *ena)
659 {
660 ena_dma_free(&ena->ena_aenq.eaenq_dma);
661 }
662
663 static void
664 ena_aenq_set_def_hdlrs(ena_aenq_t *aenq)
665 {
666 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = ena_aenq_default_hdlr;
667 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_FATAL_ERROR] = ena_aenq_default_hdlr;
668 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_WARNING] = ena_aenq_default_hdlr;
669 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_NOTIFICATION] =
670 ena_aenq_default_hdlr;
671 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_KEEP_ALIVE] = ena_aenq_default_hdlr;
672 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES] =
673 ena_aenq_default_hdlr;
674 }
675 /*
676 * Initialize the Async Event Notification Queue.
677 */
678 static boolean_t
679 ena_aenq_init(ena_t *ena)
680 {
681 ena_aenq_t *aenq = &ena->ena_aenq;
682 size_t size;
683 uint32_t addr_low, addr_high, wval;
684 ena_dma_conf_t conf;
685
686 aenq->eaenq_num_descs = ENA_AENQ_NUM_DESCS;
687 size = aenq->eaenq_num_descs * sizeof (*aenq->eaenq_descs);
688
689 conf = (ena_dma_conf_t) {
690 .edc_size = size,
691 .edc_align = ENAHW_AENQ_DESC_BUF_ALIGNMENT,
692 .edc_sgl = 1,
693 .edc_endian = DDI_NEVERSWAP_ACC,
694 .edc_stream = B_FALSE,
695 };
696
697 if (!ena_dma_alloc(ena, &aenq->eaenq_dma, &conf, size)) {
698 ena_err(ena, "failed to allocate DMA for AENQ");
699 return (B_FALSE);
700 }
701
702 aenq->eaenq_descs = (void *)aenq->eaenq_dma.edb_va;
703 aenq->eaenq_head = 0;
704 aenq->eaenq_phase = 1;
705 bzero(aenq->eaenq_descs, size);
706 ena_aenq_set_def_hdlrs(aenq);
707
708 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] =
709 ena_aenq_link_change_hdlr;
710
711 ENA_DMA_VERIFY_ADDR(ena, aenq->eaenq_dma.edb_cookie->dmac_laddress);
712 addr_low = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress);
713 addr_high = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress >> 32);
714 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_LO, addr_low);
715 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_HI, addr_high);
716 ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORDEV);
717 wval = ENAHW_AENQ_CAPS_DEPTH(aenq->eaenq_num_descs) |
718 ENAHW_AENQ_CAPS_ENTRY_SIZE(sizeof (*aenq->eaenq_descs));
719 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_CAPS, wval);
720 return (B_TRUE);
721 }
722
723 /*
724 * We limit the max number of I/O queues based on several aspects of
725 * the underlying hardware.
726 *
727 * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES,
728 * which comes from the common code and presumably is based on device
729 * constraints.
730 *
731 * 2. Next we latch the number of I/O queues to the number of online
732 * CPUs. The idea being that each queue is a parallel work stream,
733 * and having more queues than CPUs to flush them will not improve
734 * performance. The number of online CPUs can change dynamically,
735 * and that's okay, everything should still work fine, it just
736 * might not be ideal.
737 *
738 * 3. Next we latch the number of I/O queues to the smallest of the
739 * max Tx queues and max Rx queues. We could probably loosen this
740 * restriction in the future, and have separate max I/O queues for
741 * Tx and Rx. This is what Linux does, and seems like a fine place
742 * to start.
743 */
744 static void
745 ena_set_max_io_queues(ena_t *ena)
746 {
747 uint32_t max = ENAHW_MAX_NUM_IO_QUEUES;
748
749 max = MIN(ncpus_online, max);
750 /*
751 * Supposedly a device could present a different number of SQs
752 	 * and CQs. This driver is designed in a way that requires
753 	 * each SQ to have a corresponding and dedicated CQ (how would
754 	 * it work otherwise?). Therefore, we must check both values
755 * and find the minimum between them.
756 */
757 max = MIN(ena->ena_tx_max_sq_num, max);
758 max = MIN(ena->ena_tx_max_cq_num, max);
759 max = MIN(ena->ena_rx_max_sq_num, max);
760 max = MIN(ena->ena_rx_max_cq_num, max);
761
762
763 /* This shouldn't happen, but just in case. */
764 if (max == 0) {
765 max = 1;
766 }
767
768 ena->ena_max_io_queues = max;
769 }
770
771 /*
772 * We require that an Rx or Tx buffer be able to hold the maximum MTU
773 * along with the maximum frame header length. In this case we know
774 * ENA is presenting us an Ethernet frame so we add the size of an
775  * Ethernet VLAN header. Rx buffers have the additional requirement
776  * of extra margin for the sake of IP header alignment.
777 */
778 static void
779 ena_update_buf_sizes(ena_t *ena)
780 {
781 ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header);
782 ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu;
783 ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total,
784 ena->ena_page_sz, uint32_t);
785 ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total +
786 ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t);
787 }
788
789 static boolean_t
790 ena_get_offloads(ena_t *ena)
791 {
792 int ret = 0;
793 enahw_resp_desc_t resp;
794 enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload;
795
796 ena->ena_tx_l3_ipv4_csum = B_FALSE;
797
798 ena->ena_tx_l4_ipv4_part_csum = B_FALSE;
799 ena->ena_tx_l4_ipv4_full_csum = B_FALSE;
800 ena->ena_tx_l4_ipv4_lso = B_FALSE;
801
802 ena->ena_tx_l4_ipv6_part_csum = B_FALSE;
803 ena->ena_tx_l4_ipv6_full_csum = B_FALSE;
804 ena->ena_tx_l4_ipv6_lso = B_FALSE;
805
806 ena->ena_rx_l3_ipv4_csum = B_FALSE;
807 ena->ena_rx_l4_ipv4_csum = B_FALSE;
808 ena->ena_rx_l4_ipv6_csum = B_FALSE;
809 ena->ena_rx_hash = B_FALSE;
810
811 bzero(&resp, sizeof (resp));
812 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG,
813 ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER);
814
815 if (ret == ENOTSUP) {
816 /*
817 * In this case the device does not support querying
818 * for hardware offloads. We take that as a sign that
819 * the device provides no offloads.
820 */
821 return (B_TRUE);
822 } else if (ret != 0) {
823 ena_err(ena, "error getting stateless offload: %d", ret);
824 return (B_FALSE);
825 }
826
827 ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat);
828
829 ena->ena_tx_l4_ipv4_part_csum =
830 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat);
831 ena->ena_tx_l4_ipv4_full_csum =
832 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat);
833 ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat);
834
835 ena->ena_tx_l4_ipv6_part_csum =
836 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat);
837 ena->ena_tx_l4_ipv6_full_csum =
838 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat);
839 ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat);
840
841 ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat);
842 ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat);
843 ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat);
844 return (B_TRUE);
845 }
846
847 static int
848 ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval,
849 const int defval)
850 {
851 int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
852 DDI_PROP_DONTPASS, propname, defval);
853
854 if (value > maxval) {
855 ena_err(ena, "user value %s=%d exceeded maximum, setting to %d",
856 propname, value, maxval);
857 value = maxval;
858 }
859
860 if (value < minval) {
861 ena_err(ena, "user value %s=%d below minimum, setting to %d",
862 propname, value, minval);
863 value = minval;
864 }
865
866 return (value);
867 }
868
869 static boolean_t
870 ena_set_mtu(ena_t *ena)
871 {
872 int ret = 0;
873 enahw_cmd_desc_t cmd;
874 enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu;
875 enahw_resp_desc_t resp;
876
877 bzero(&cmd, sizeof (cmd));
878 bzero(&resp, sizeof (resp));
879 feat->efm_mtu = ena->ena_mtu;
880
881 if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU,
882 ENAHW_FEAT_MTU_VER)) != 0) {
883 ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu,
884 ret);
885 return (B_FALSE);
886 }
887
888 return (B_TRUE);
889 }
890
891 static void
892 ena_get_link_config(ena_t *ena)
893 {
894 enahw_resp_desc_t resp;
895 enahw_feat_link_conf_t *feat =
896 &resp.erd_resp.erd_get_feat.ergf_link_conf;
897 boolean_t full_duplex;
898
899 bzero(&resp, sizeof (resp));
900
901 if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG,
902 ENAHW_FEAT_LINK_CONFIG_VER) != 0) {
903 /*
904 		 * Some ENA devices do not support this feature. In
905 * those cases we report a 1Gbps link, full duplex.
906 * For the most accurate information on bandwidth
907 * limits see the official AWS documentation.
908 */
909 ena->ena_link_speed_mbits = 1 * 1000 * 1000;
910 ena->ena_link_speeds = ENAHW_LINK_SPEED_1G;
911 ena->ena_link_duplex = LINK_DUPLEX_FULL;
912 ena->ena_link_autoneg = B_TRUE;
913 return;
914 }
915
916 ena->ena_link_speed_mbits = feat->eflc_speed;
917 ena->ena_link_speeds = feat->eflc_supported;
918 full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat);
919 ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL :
920 LINK_DUPLEX_HALF;
921 ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat);
922 }
923
924 /*
925 * Retrieve all configuration values which are modifiable via
926 * ena.conf, and set ena_t members accordingly. While the conf values
927 * have priority, they may be implicitly modified by the driver to
928 * meet resource constraints on a given platform. If no value is
929 * specified in the conf file, the driver will attempt to use the
930  * largest value supported. While there should be no supported value
931  * large enough to overflow an int, keep in mind that ena_get_prop()
932  * will cast the values to an int.
933 *
934 * This function should be called after the device is initialized,
935 * admin queue is established, and the hardware features/capabs have
936 * been queried; it should be called before mac registration.
937 */
938 static boolean_t
939 ena_attach_read_conf(ena_t *ena)
940 {
941 uint32_t gcv; /* Greatest Common Value */
942
943 /*
944 * We expect that the queue lengths are the same for both the
945 * CQ and SQ, but technically the device could return
946 * different lengths. For now the driver locks them together.
947 */
948 gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs);
949 ASSERT3U(gcv, <=, INT_MAX);
950 ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS,
951 ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv);
952
953 ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT,
954 ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX,
955 ENA_PROP_RXQ_INTR_LIMIT_DEF);
956
957 gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs);
958 ASSERT3U(gcv, <=, INT_MAX);
959 ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS,
960 ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv);
961
962 return (B_TRUE);
963 }
964
965 /*
966 * Perform any necessary device configuration after the driver.conf
967 * has been read.
968 */
969 static boolean_t
970 ena_attach_dev_cfg(ena_t *ena)
971 {
972 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF);
973
974 if (!ena_set_mtu(ena)) {
975 /*
976 * We don't expect this to fail, but we try a fallback
977 * first before failing the attach sequence.
978 */
979 ena->ena_mtu = 1500;
980 ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu);
981
982 if (!ena_set_mtu(ena)) {
983 return (B_FALSE);
984 }
985 }
986
987 return (B_TRUE);
988 }
989
990 static boolean_t
991 ena_check_versions(ena_t *ena)
992 {
993 uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION);
994 uint32_t ctrl_vsn =
995 ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION);
996
997 ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn);
998 ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn);
999
1000 ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn);
1001 ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn);
1002 ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn);
1003 ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn);
1004
1005 if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) {
1006 ena_err(ena, "unsupported controller version: %u.%u.%u",
1007 ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1008 ena->ena_ctrl_subminor_vsn);
1009 return (B_FALSE);
1010 }
1011
1012 return (B_TRUE);
1013 }
1014
1015 boolean_t
1016 ena_setup_aenq(ena_t *ena)
1017 {
1018 enahw_cmd_desc_t cmd;
1019 enahw_feat_aenq_t *cmd_feat =
1020 &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_aenq;
1021 enahw_resp_desc_t resp;
1022 enahw_feat_aenq_t *resp_feat = &resp.erd_resp.erd_get_feat.ergf_aenq;
1023 enahw_aenq_groups_t to_enable;
1024
1025 bzero(&resp, sizeof (resp));
1026 if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG,
1027 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
1028 return (B_FALSE);
1029 }
1030
1031 to_enable = BIT(ENAHW_AENQ_GROUP_LINK_CHANGE) |
1032 BIT(ENAHW_AENQ_GROUP_FATAL_ERROR) |
1033 BIT(ENAHW_AENQ_GROUP_WARNING) |
1034 BIT(ENAHW_AENQ_GROUP_NOTIFICATION);
1035 to_enable &= resp_feat->efa_supported_groups;
1036
1037 bzero(&cmd, sizeof (cmd));
1038 	bzero(&resp, sizeof (resp));
1039 cmd_feat->efa_enabled_groups = to_enable;
1040
1041 if (ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_AENQ_CONFIG,
1042 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
1043 return (B_FALSE);
1044 }
1045
1046 bzero(&resp, sizeof (resp));
1047 if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG,
1048 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
1049 return (B_FALSE);
1050 }
1051
1052 ena->ena_aenq_supported_groups = resp_feat->efa_supported_groups;
1053 ena->ena_aenq_enabled_groups = resp_feat->efa_enabled_groups;
1054
1055 for (uint_t i = 0; i < ENAHW_AENQ_GROUPS_ARR_NUM; i++) {
1056 ena_aenq_grpstr_t *grpstr = &ena_groups_str[i];
1057 boolean_t supported = BIT(grpstr->eag_type) &
1058 resp_feat->efa_supported_groups;
1059 boolean_t enabled = BIT(grpstr->eag_type) &
1060 resp_feat->efa_enabled_groups;
1061
1062 ena_dbg(ena, "%s supported: %s enabled: %s", grpstr->eag_str,
1063 supported ? "Y" : "N", enabled ? "Y" : "N");
1064 }
1065
1066 return (B_TRUE);
1067 }
1068
1069 /*
1070 * Free all resources allocated as part of ena_device_init().
1071 */
1072 static void
1073 ena_cleanup_device_init(ena_t *ena)
1074 {
1075 ena_adminq_t *aq = &ena->ena_aq;
1076
1077 ena_free_host_info(ena);
1078 mutex_destroy(&aq->ea_sq_lock);
1079 mutex_destroy(&aq->ea_cq_lock);
1080 mutex_destroy(&aq->ea_stat_lock);
1081 list_destroy(&aq->ea_cmd_ctxs_free);
1082 kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen);
1083 ena_admin_sq_free(ena);
1084 ena_admin_cq_free(ena);
1085 ena_aenq_free(ena);
1086 ena_stat_device_basic_cleanup(ena);
1087 ena_stat_device_extended_cleanup(ena);
1088 ena_stat_aenq_cleanup(ena);
1089 }
1090
1091 static boolean_t
1092 ena_attach_device_init(ena_t *ena)
1093 {
1094 ena_adminq_t *aq = &ena->ena_aq;
1095 uint32_t rval, wval;
1096 uint8_t dma_width;
1097 hrtime_t timeout, cmd_timeout;
1098 hrtime_t expired;
1099 enahw_resp_desc_t resp;
1100 enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr;
1101 uint8_t *maddr;
1102 uint32_t supported_features;
1103 int ret = 0;
1104
1105 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
1106 if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) {
1107 ena_err(ena, "device is not ready");
1108 return (B_FALSE);
1109 }
1110
1111 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
1112
1113 /*
1114 * The device stores the reset timeout at 100ms resolution; we
1115 * normalize that to nanoseconds.
1116 */
1117 timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100);
1118
1119 if (timeout == 0) {
1120 ena_err(ena, "device gave invalid reset timeout");
1121 return (B_FALSE);
1122 }
1123
1124 expired = gethrtime() + timeout;
1125
1126 wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
1127 wval |= (ENAHW_RESET_NORMAL << ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
1128 ENAHW_DEV_CTL_RESET_REASON_MASK;
1129 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval);
1130
1131 /*
1132 * Make sure reset is in progress.
1133 */
1134 while (1) {
1135 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
1136
1137 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0) {
1138 break;
1139 }
1140
1141 if (gethrtime() > expired) {
1142 ena_err(ena, "device reset start timed out");
1143 return (B_FALSE);
1144 }
1145
1146 /* Sleep for 100 milliseconds. */
1147 delay(drv_usectohz(100 * 1000));
1148 }
1149
1150 /*
1151 * Reset the timeout counter for the next device request.
1152 */
1153 expired = gethrtime() + timeout;
1154
1155 /*
1156 * Wait for the device reset to finish.
1157 */
1158 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0);
1159 while (1) {
1160 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
1161
1162 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) {
1163 break;
1164 }
1165
1166 if (gethrtime() > expired) {
1167 ena_err(ena, "device reset timed out");
1168 return (B_FALSE);
1169 }
1170
1171 /* Sleep for 100 milliseconds. */
1172 delay(drv_usectohz(100 * 1000));
1173 }
1174
1175 if (!ena_check_versions(ena)) {
1176 return (B_FALSE);
1177 }
1178
1179 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
1180 dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval);
1181 ena->ena_dma_width = dma_width;
1182
1183 /*
1184 * As we are not using an interrupt for admin queue completion
1185 * signaling, we do not need a priority on these mutexes. If
1186 * that changes, we will have to rejigger some code to create
1187 * the admin queue interrupt before this function.
1188 */
1189 mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL);
1190 mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL);
1191 mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL);
1192 aq->ea_qlen = ENA_ADMINQ_DEPTH;
1193 aq->ea_pending_cmds = 0;
1194
1195 aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen,
1196 KM_SLEEP);
1197 list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t),
1198 offsetof(ena_cmd_ctx_t, ectx_node));
1199
1200 for (uint_t i = 0; i < aq->ea_qlen; i++) {
1201 ena_cmd_ctx_t *ctx = &aq->ea_cmd_ctxs[i];
1202
1203 ctx->ectx_id = i;
1204 ctx->ectx_pending = B_FALSE;
1205 ctx->ectx_cmd_opcode = ENAHW_CMD_NONE;
1206 ctx->ectx_resp = NULL;
1207 list_insert_tail(&aq->ea_cmd_ctxs_free, ctx);
1208 }
1209
1210 /*
1211 * The value stored in the device register is in the
1212 * resolution of 100 milliseconds. We normalize that to
1213 * nanoseconds.
1214 */
1215 cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100);
1216 aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns);
1217
1218 if (aq->ea_cmd_timeout_ns == 0) {
1219 aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT;
1220 }
1221
1222 if (!ena_admin_sq_init(ena)) {
1223 return (B_FALSE);
1224 }
1225
1226 if (!ena_admin_cq_init(ena)) {
1227 return (B_FALSE);
1228 }
1229
1230 if (!ena_aenq_init(ena)) {
1231 return (B_FALSE);
1232 }
1233
1234 /*
1235 * While the Linux driver prefers to use interrupts to deliver
1236 * admin queue completions, we just poll -- it seems to work
1237 * just fine.
1238 */
1239 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, 0);
1240 aq->ea_poll_mode = B_TRUE;
1241
1242 bzero(&resp, sizeof (resp));
1243 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES,
1244 ENAHW_FEAT_DEVICE_ATTRIBUTES_VER);
1245
1246 if (ret != 0) {
1247 ena_err(ena, "failed to get device attributes: %d", ret);
1248 return (B_FALSE);
1249 }
1250
1251 ena_dbg(ena, "impl ID: %u", feat->efda_impl_id);
1252 ena_dbg(ena, "device version: %u", feat->efda_device_version);
1253 ena_dbg(ena, "supported features: 0x%x",
1254 feat->efda_supported_features);
1255 ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width);
1256 ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with);
1257 maddr = feat->efda_mac_addr;
1258 ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1],
1259 maddr[2], maddr[3], maddr[4], maddr[5]);
1260 ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu);
1261
1262 bcopy(maddr, ena->ena_mac_addr, ETHERADDRL);
1263 ena->ena_max_mtu = feat->efda_max_mtu;
1264 supported_features = feat->efda_supported_features;
1265 ena->ena_supported_features = supported_features;
1266 feat = NULL;
1267 bzero(&resp, sizeof (resp));
1268
1269 if (supported_features & BIT(ENAHW_FEAT_MAX_QUEUES_EXT)) {
1270 enahw_feat_max_queue_ext_t *feat_mqe =
1271 &resp.erd_resp.erd_get_feat.ergf_max_queue_ext;
1272
1273 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT,
1274 ENAHW_FEAT_MAX_QUEUES_EXT_VER);
1275
1276 if (ret != 0) {
1277 ena_err(ena, "failed to query max queues ext: %d", ret);
1278 return (B_FALSE);
1279 }
1280
1281 ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num;
1282 ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth;
1283 ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num;
1284 ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth;
1285 ena->ena_tx_max_desc_per_pkt =
1286 feat_mqe->efmqe_max_per_packet_tx_descs;
1287 ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size;
1288
1289 ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num;
1290 ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth;
1291 ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num;
1292 ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth;
1293 ena->ena_rx_max_desc_per_pkt =
1294 feat_mqe->efmqe_max_per_packet_rx_descs;
1295
1296 ena_set_max_io_queues(ena);
1297 } else {
1298 enahw_feat_max_queue_t *feat_mq =
1299 &resp.erd_resp.erd_get_feat.ergf_max_queue;
1300
1301 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM,
1302 ENAHW_FEAT_MAX_QUEUES_NUM_VER);
1303
1304 if (ret != 0) {
1305 ena_err(ena, "failed to query max queues: %d", ret);
1306 return (B_FALSE);
1307 }
1308
1309 ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num;
1310 ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1311 ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num;
1312 ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1313 ena->ena_tx_max_desc_per_pkt =
1314 feat_mq->efmq_max_per_packet_tx_descs;
1315 ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size;
1316
1317 ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num;
1318 ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1319 ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num;
1320 ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1321 ena->ena_rx_max_desc_per_pkt =
1322 feat_mq->efmq_max_per_packet_rx_descs;
1323
1324 ena_set_max_io_queues(ena);
1325 }
1326
1327 ena->ena_mtu = ena->ena_max_mtu;
1328 ena_update_buf_sizes(ena);
1329 /*
1330 * We could use ENAHW_FEAT_HW_HINTS to determine actual SGL
1331 * sizes, for now we just force everything to use one
1332 * segment.
1333 */
1334 ena->ena_tx_sgl_max_sz = 1;
1335 ena->ena_rx_sgl_max_sz = 1;
1336
1337 if (!ena_init_host_info(ena)) {
1338 return (B_FALSE);
1339 }
1340
1341 if (!ena_setup_aenq(ena)) {
1342 return (B_FALSE);
1343 }
1344
1345 ena_get_link_config(ena);
1346
1347 if (!ena_get_offloads(ena)) {
1348 return (B_FALSE);
1349 }
1350
1351 if (!ena_stat_device_basic_init(ena)) {
1352 return (B_FALSE);
1353 }
1354
1355 if (!ena_stat_device_extended_init(ena)) {
1356 return (B_FALSE);
1357 }
1358
1359 if (!ena_stat_aenq_init(ena)) {
1360 return (B_FALSE);
1361 }
1362
1363 return (B_TRUE);
1364 }
1365
1366 static void
1367 ena_cleanup_intr_alloc(ena_t *ena)
1368 {
1369 for (int i = 0; i < ena->ena_num_intrs; i++) {
1370 int ret = ddi_intr_free(ena->ena_intr_handles[i]);
1371 if (ret != DDI_SUCCESS) {
1372 ena_err(ena, "failed to free interrupt %d: %d", i, ret);
1373 }
1374 }
1375
1376 if (ena->ena_intr_handles != NULL) {
1377 kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz);
1378 ena->ena_intr_handles = NULL;
1379 ena->ena_intr_handles_sz = 0;
1380 }
1381 }
1382
1383 /*
1384 * The Linux driver supports only MSI-X interrupts. We do the same,
1385 * with the assumption that it's the only type of interrupt the device
1386 * can present.
1387 */
1388 static boolean_t
1389 ena_attach_intr_alloc(ena_t *ena)
1390 {
1391 int ret;
1392 int types;
1393 int min, req, ideal, avail, actual;
1394
1395 ret = ddi_intr_get_supported_types(ena->ena_dip, &types);
1396 if (ret != DDI_SUCCESS) {
1397 ena_err(ena, "failed to get interrupt types: %d", ret);
1398 return (B_FALSE);
1399 }
1400
1401 ena_dbg(ena, "supported interrupt types: 0x%x", types);
1402 if ((types & DDI_INTR_TYPE_MSIX) == 0) {
1403 ena_err(ena, "the ena driver only supports MSI-X interrupts");
1404 return (B_FALSE);
1405 }
1406
1407 /* One for I/O, one for adminq. */
1408 min = 2;
1409 ideal = ena->ena_max_io_queues + 1;
1410 ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1411 if (ret != DDI_SUCCESS) {
1412 ena_err(ena, "failed to get number of MSI-X interrupts: %d",
1413 ret);
1414 return (B_FALSE);
1415 }
1416
1417 if (avail < min) {
1418 ena_err(ena, "number of MSI-X interrupts is %d, but the driver "
1419 "requires a minimum of %d", avail, min);
1420 return (B_FALSE);
1421 }
1422
1423 ena_dbg(ena, "%d MSI-X interrupts available", avail);
1424
1425 ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1426 if (ret != DDI_SUCCESS) {
1427 ena_err(ena, "failed to get available interrupts: %d", ret);
1428 return (B_FALSE);
1429 }
1430
1431 if (avail < min) {
1432 ena_err(ena, "number of available MSI-X interrupts is %d, "
1433 "but the driver requires a minimum of %d", avail, min);
1434 return (B_FALSE);
1435 }
1436
1437 req = MIN(ideal, avail);
1438 ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t);
1439 ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP);
1440
1441 ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles,
1442 DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL);
1443 if (ret != DDI_SUCCESS) {
1444 ena_err(ena, "failed to allocate %d MSI-X interrupts: %d",
1445 req, ret);
1446 return (B_FALSE);
1447 }
1448
1449 if (actual < min) {
1450 ena_err(ena, "number of allocated interrupts is %d, but the "
1451 "driver requires a minimum of %d", actual, min);
1452 return (B_FALSE);
1453 }
1454
1455 ena->ena_num_intrs = actual;
1456
1457 ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps);
1458 if (ret != DDI_SUCCESS) {
1459 ena_err(ena, "failed to get interrupt capability: %d", ret);
1460 return (B_FALSE);
1461 }
1462
1463 ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri);
1464 if (ret != DDI_SUCCESS) {
1465 ena_err(ena, "failed to get interrupt priority: %d", ret);
1466 return (B_FALSE);
1467 }
1468
1469 ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u",
1470 actual, ena->ena_intr_caps, ena->ena_intr_pri);
1471
1472 /*
1473 * The ena_lock should not be held in the datapath, but it is
1474 * held as part of the AENQ handler, which runs in interrupt
1475 	 * context. Therefore, we delay the initialization of this
1476 * mutex until after the interrupts are allocated.
1477 */
1478 mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER,
1479 DDI_INTR_PRI(ena->ena_intr_pri));
1480
1481 return (B_TRUE);
1482 }
1483
1484 /*
1485  * Allocate the parent Rx queue structures. Importantly, this does
1486  * NOT allocate the queue descriptors or data buffers. Those are
1487 * allocated on demand as queues are started.
1488 */
1489 static boolean_t
1490 ena_attach_alloc_rxqs(ena_t *ena)
1491 {
1492 /* We rely on the interrupt priority for initializing the mutexes. */
1493 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1494 ena->ena_num_rxqs = ena->ena_num_intrs - 1;
1495 ASSERT3U(ena->ena_num_rxqs, >, 0);
1496 ena->ena_rxqs = kmem_zalloc(ena->ena_num_rxqs * sizeof (*ena->ena_rxqs),
1497 KM_SLEEP);
1498
1499 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1500 ena_rxq_t *rxq = &ena->ena_rxqs[i];
1501
1502 rxq->er_rxqs_idx = i;
1503 /* The 0th vector is for Admin + AENQ. */
1504 rxq->er_intr_vector = i + 1;
1505 rxq->er_mrh = NULL;
1506
1507 mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER,
1508 DDI_INTR_PRI(ena->ena_intr_pri));
1509 mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER,
1510 DDI_INTR_PRI(ena->ena_intr_pri));
1511
1512 rxq->er_ena = ena;
1513 rxq->er_sq_num_descs = ena->ena_rxq_num_descs;
1514 rxq->er_cq_num_descs = ena->ena_rxq_num_descs;
1515
1516 if (!ena_stat_rxq_init(rxq)) {
1517 return (B_FALSE);
1518 }
1519
1520 if (!ena_alloc_rxq(rxq)) {
1521 return (B_FALSE);
1522 }
1523 }
1524
1525 return (B_TRUE);
1526 }
1527
1528 static void
1529 ena_cleanup_rxqs(ena_t *ena)
1530 {
1531 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1532 ena_rxq_t *rxq = &ena->ena_rxqs[i];
1533
1534 ena_cleanup_rxq(rxq);
1535 mutex_destroy(&rxq->er_lock);
1536 mutex_destroy(&rxq->er_stat_lock);
1537 ena_stat_rxq_cleanup(rxq);
1538 }
1539
1540 kmem_free(ena->ena_rxqs, ena->ena_num_rxqs * sizeof (*ena->ena_rxqs));
1541 }
1542
1543 /*
1544  * Allocate the parent Tx queue structures. Importantly, this does
1545  * NOT allocate the queue descriptors or data buffers. Those are
1546 * allocated on demand as a queue is started.
1547 */
1548 static boolean_t
1549 ena_attach_alloc_txqs(ena_t *ena)
1550 {
1551 /* We rely on the interrupt priority for initializing the mutexes. */
1552 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1553 ena->ena_num_txqs = ena->ena_num_intrs - 1;
1554 ASSERT3U(ena->ena_num_txqs, >, 0);
1555 ena->ena_txqs = kmem_zalloc(ena->ena_num_txqs * sizeof (*ena->ena_txqs),
1556 KM_SLEEP);
1557
1558 for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1559 ena_txq_t *txq = &ena->ena_txqs[i];
1560
1561 txq->et_txqs_idx = i;
1562 /* The 0th vector is for Admin + AENQ. */
1563 txq->et_intr_vector = i + 1;
1564 txq->et_mrh = NULL;
1565
1566 mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER,
1567 DDI_INTR_PRI(ena->ena_intr_pri));
1568 mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER,
1569 DDI_INTR_PRI(ena->ena_intr_pri));
1570
1571 txq->et_ena = ena;
1572 txq->et_sq_num_descs = ena->ena_txq_num_descs;
1573 txq->et_cq_num_descs = ena->ena_txq_num_descs;
1574
1575 if (!ena_stat_txq_init(txq)) {
1576 return (B_FALSE);
1577 }
1578
1579 if (!ena_alloc_txq(txq)) {
1580 return (B_FALSE);
1581 }
1582 }
1583
1584 return (B_TRUE);
1585 }
1586
1587 static void
1588 ena_cleanup_txqs(ena_t *ena)
1589 {
1590 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1591 ena_txq_t *txq = &ena->ena_txqs[i];
1592
1593 ena_cleanup_txq(txq);
1594 mutex_destroy(&txq->et_lock);
1595 mutex_destroy(&txq->et_stat_lock);
1596 ena_stat_txq_cleanup(txq);
1597 }
1598
1599 kmem_free(ena->ena_txqs, ena->ena_num_txqs * sizeof (*ena->ena_txqs));
1600 }
1601
ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = {
	{
		.ead_seq = ENA_ATTACH_PCI,
		.ead_name = "PCI config",
		.ead_attach_fn = ena_attach_pci,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_cleanup_pci,
	},

	{
		.ead_seq = ENA_ATTACH_REGS,
		.ead_name = "BAR mapping",
		.ead_attach_fn = ena_attach_regs_map,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_cleanup_regs_map,
	},

	{
		.ead_seq = ENA_ATTACH_DEV_INIT,
		.ead_name = "device initialization",
		.ead_attach_fn = ena_attach_device_init,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_cleanup_device_init,
	},

	{
		.ead_seq = ENA_ATTACH_READ_CONF,
		.ead_name = "ena.conf",
		.ead_attach_fn = ena_attach_read_conf,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_no_cleanup,
	},

	{
		.ead_seq = ENA_ATTACH_DEV_CFG,
		.ead_name = "device config",
		.ead_attach_fn = ena_attach_dev_cfg,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_no_cleanup,
	},

	{
		.ead_seq = ENA_ATTACH_INTR_ALLOC,
		.ead_name = "interrupt allocation",
		.ead_attach_fn = ena_attach_intr_alloc,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_cleanup_intr_alloc,
	},

	{
		.ead_seq = ENA_ATTACH_INTR_HDLRS,
		.ead_name = "interrupt handlers",
		.ead_attach_fn = ena_intr_add_handlers,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_intr_remove_handlers,
	},

	{
		.ead_seq = ENA_ATTACH_TXQS_ALLOC,
		.ead_name = "Tx queues",
		.ead_attach_fn = ena_attach_alloc_txqs,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_cleanup_txqs,
	},

	{
		.ead_seq = ENA_ATTACH_RXQS_ALLOC,
		.ead_name = "Rx queues",
		.ead_attach_fn = ena_attach_alloc_rxqs,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_cleanup_rxqs,
	},

	/*
	 * The chance of mac_unregister() failure poses a problem for
	 * cleanup. We address interrupt disablement and mac
	 * unregistration explicitly in the attach/detach routines.
	 */
	{
		.ead_seq = ENA_ATTACH_MAC_REGISTER,
		.ead_name = "mac registration",
		.ead_attach_fn = ena_mac_register,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_no_cleanup,
	},

	{
		.ead_seq = ENA_ATTACH_INTRS_ENABLE,
		.ead_name = "enable interrupts",
		.ead_attach_fn = ena_intrs_enable,
		.ead_attach_hard_fail = B_TRUE,
		.ead_cleanup_fn = ena_no_cleanup,
	}
};

/*
 * This function undoes any work done by ena_attach(), either in
 * response to a failed attach or a planned detach. At the end of this
 * function ena_attach_seq should be zero; otherwise it means
 * something has not been freed/uninitialized.
 */
static void
ena_cleanup(ena_t *ena)
{
	if (ena == NULL || ena->ena_attach_seq == 0) {
		return;
	}

	/*
	 * We VERIFY this because if the seq were greater than the
	 * number of entries we would walk off the end of
	 * ena_attach_tbl and execute god knows what.
	 */
	VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES);

	while (ena->ena_attach_seq > 0) {
		int idx = ena->ena_attach_seq - 1;
		ena_attach_desc_t *desc = &ena_attach_tbl[idx];

		ena_dbg(ena, "running cleanup sequence: %s (%d)",
		    desc->ead_name, idx);

		desc->ead_cleanup_fn(ena);
		ena->ena_attach_seq--;
	}

	ASSERT3U(ena->ena_attach_seq, ==, 0);
	mutex_destroy(&ena->ena_lock);
}

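/*
 * Attach the device by running each entry of ena_attach_tbl in order.
 * If any step fails we run that step's cleanup function (or the
 * special-case mac handling below), unwind the previously completed
 * steps via ena_cleanup(), and fail the attach.
 */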
static int
ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ena_t *ena;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
	ena->ena_instance = ddi_get_instance(dip);
	ena->ena_dip = dip;
	ena->ena_page_sz = ddi_ptob(dip, 1);

	for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
		boolean_t success;
		ena_attach_desc_t *desc = &ena_attach_tbl[i];

		ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
		    i);

		if (!(success = desc->ead_attach_fn(ena))) {
			ena_err(ena, "attach sequence failed: %s (%d)",
			    desc->ead_name, i);

			if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
				/*
				 * In this specific case
				 * ENA_ATTACH_INTRS_ENABLE has failed,
				 * and we may or may not be able to
				 * unregister the mac, depending on
				 * whether something in userspace has
				 * created a client on top.
				 *
				 * NOTE: Something that would be nice
				 * to add to mac is the ability to
				 * register a provider separate from
				 * "publishing" it to the rest of the
				 * system. This would allow a driver
				 * to register its mac, do some
				 * additional work that might fail,
				 * and then unregister if that work
				 * fails without concern for any
				 * chance of failure when calling
				 * unregister. This would remove the
				 * complexity of the situation we are
				 * trying to address here, as we would
				 * know that until the mac has been
				 * "published", there is no chance for
				 * mac_unregister() to fail.
				 */
				if (ena_mac_unregister(ena) != 0) {
					return (DDI_FAILURE);
				}

				ena->ena_attach_seq--;
			} else {
				/*
				 * Since the ead_seq is predicated on
				 * successful ead_attach_fn we must
				 * run the specific cleanup handler
				 * before calling the global cleanup
				 * routine. This also means that all
				 * cleanup functions must be able to
				 * deal with partial success of the
				 * corresponding ead_attach_fn.
				 */
				desc->ead_cleanup_fn(ena);
			}

			ena_cleanup(ena);
			kmem_free(ena, sizeof (ena_t));
			return (DDI_FAILURE);
		}

		if (success) {
			ena_dbg(ena, "attach sequence completed: %s (%d)",
			    desc->ead_name, i);
		}

		ena->ena_attach_seq = desc->ead_seq;
	}

	/*
	 * Now that interrupts are enabled make sure to tell the
	 * device that all AENQ descriptors are ready for writing.
	 */
	ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
	    ena->ena_aenq.eaenq_num_descs);

	ddi_set_driver_private(dip, ena);
	return (DDI_SUCCESS);
}

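/*
 * Detach is the mirror image of attach: disable interrupts, attempt
 * to unregister from mac (which may fail if clients still exist), and
 * then run the cleanup sequence in reverse.
 */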
static int
ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	ena_t *ena = ddi_get_driver_private(dip);

	if (ena == NULL) {
		return (DDI_FAILURE);
	}

	/*
	 * Before we can proceed to cleanup we have to handle
	 * mac_unregister() explicitly -- if there are still
	 * outstanding clients, then we can't proceed with detach or
	 * cleanup.
	 */

	/*
	 * Why this would fail I don't know, but if we proceed to mac
	 * unregister, then there is a good chance we will panic in
	 * the Rx interrupt handler when calling mac_rx_ring().
	 */
	if (!ena_intrs_disable(ena)) {
		return (DDI_FAILURE);
	}

	/* We can't detach if clients are actively using the device. */
	if (ena_mac_unregister(ena) != 0) {
		(void) ena_intrs_enable(ena);
		return (DDI_FAILURE);
	}

	/*
	 * At this point we can proceed with the rest of cleanup on a
	 * best-effort basis.
	 */
	ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
	ena_cleanup(ena);
	ddi_set_driver_private(dip, NULL);
	kmem_free(ena, sizeof (ena_t));
	return (DDI_SUCCESS);
}

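/*
 * Standard DDI and module linkage structures. The cb_ops entries are
 * all stubs: traffic and administration flow through the mac(9E)
 * framework rather than through driver-specific device entry points.
 */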
static struct cb_ops ena_cb_ops = {
	.cb_open = nodev,
	.cb_close = nodev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops ena_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = ena_attach,
	.devo_detach = ena_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &ena_cb_ops
};

static struct modldrv ena_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "AWS ENA Ethernet",
	.drv_dev_ops = &ena_dev_ops
};

static struct modlinkage ena_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &ena_modldrv, NULL }
};

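/*
 * Loadable module entry points. _init() wires the mac entry points
 * into ena_dev_ops and installs the module, _fini() undoes both, and
 * _info() reports module information.
 */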
int
_init(void)
{
	int ret;

	mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);

	if ((ret = mod_install(&ena_modlinkage)) != 0) {
		mac_fini_ops(&ena_dev_ops);
		return (ret);
	}

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&ena_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&ena_modlinkage)) != 0) {
		return (ret);
	}

	mac_fini_ops(&ena_dev_ops);
	return (ret);
}