/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

#include "i40e_sw.h"

/*
 * ---------------------------------------------------------
 * Buffer and Memory Management, Receiving, and Transmitting
 * ---------------------------------------------------------
 *
 * Each physical function (PF), which is what we think of as an instance of the
 * device driver, has a series of associated transmit and receive queue pairs:
 * effectively, what MAC thinks of as rings. Each of these has its own ring of
 * descriptors which is used as part of doing DMA activity.
 *
 * The transmit ring consists of 16-byte descriptors which are used to send
 * packets, program filters, etc. The receive ring uses descriptors which are
 * either 16 or 32 bytes each. At the moment, we opt to use the larger
 * descriptor format so that we're in a better position if we ever want to
 * leverage that information later on.
 *
 * However, these rings are just for descriptors; they don't describe how we
 * actually store the memory that we need for DMA or the associated
 * information that we need for keeping track of message blocks. To correspond
 * to the hardware descriptor ring, which is how we communicate with hardware,
 * we introduce a control block which keeps track of our required metadata like
 * DMA mappings.
 *
 * There are two main considerations that dictate how much memory and how many
 * buffers we end up allocating. Those are:
 *
 *   o The size of the ring (controlled through the driver.conf file)
 *
 *   o The maximum size frame we can receive.
 *
 * The size of the rings currently defaults to 1024 descriptors and is stored
 * in i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
 *
 * While the size of the rings is controlled by driver.conf, the maximum frame
 * size is informed primarily through the use of dladm and the setting of the
 * MTU property on the device. From the MTU, we then do some machinations. The
 * first thing we do is add in space for the Ethernet header, potentially a
 * VLAN header, and the FCS. This value is what's stored as
 * i40e_t`i40e_frame_max and is derived any time i40e_t`i40e_sdu changes.
 *
 * This size is then rounded up to the nearest 1k chunk, which represents the
 * actual amount of memory that we'll allocate for a single frame.
 *
 * Note that for RX, we do something that might be unexpected. We always add
 * an extra two bytes to the frame size that we allocate. We then offset the
 * DMA address that we receive a packet into by two bytes. This ensures that
 * the IP header will always be 4-byte aligned because the MAC header is
 * either 14 or 18 bytes in length, depending on the use of 802.1Q tagging,
 * which makes IP's and MAC's lives easier.
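 *
 * As a concrete (illustrative) example, assume the default MTU of 1500 bytes:
 * 1500 + 18 (Ethernet + 802.1Q headers) + 4 (FCS) = 1522 bytes for
 * i40e_t`i40e_frame_max, which rounds up to a 2 KiB RX buffer; the two byte
 * shim then places a 14-byte or 18-byte MAC header at offset 2, so the IP
 * header starts at offset 16 or 20, both multiples of four.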
 *
 * Both the RX and TX descriptor rings (which are what we use to communicate
 * with hardware) are allocated as a single region of DMA memory which is the
 * size of the descriptor (32 bytes and 16 bytes respectively, per the formats
 * described above) times the total number of descriptors for an RX and TX
 * ring.
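 *
 * For example, at the default ring size of 1024 descriptors that works out to
 * roughly 1024 x 32 bytes = 32 KiB of descriptor memory for an RX ring and
 * 1024 x 16 bytes = 16 KiB for a TX ring (the TX allocation also carries one
 * extra descriptor's worth of space for the write-back head described below).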
 *
 * While the RX and TX descriptors are allocated using DMA-based memory, the
 * control blocks for each of them are allocated using normal kernel memory.
 * They aren't special from a DMA perspective. We'll go over the design of both
 * receiving and transmitting separately, as they have slightly different
 * control blocks and different ways that we manage the relationship between
 * control blocks and descriptors.
 *
 * ---------------------------------
 * RX Descriptors and Control Blocks
 * ---------------------------------
 *
 * For every descriptor in the ring that the driver has, we need some associated
 * memory, which means that we need to have the receive specific control block.
 * We have a couple different, but related goals:
 *
 *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
 *     not want to do any additional memory allocations or DMA allocations if
 *     we don't have to.
 *
 *   o We'd like to try and do as much zero-copy as possible, while taking into
 *     account the cost of mapping in DMA resources.
 *
 *   o We'd like to have every receive descriptor available.
 *
 * Now, these rules are a bit in tension with one another. The act of DMA
 * binding is an exercise in trying to find the break-even point between page
 * table updates and bcopy. We currently start by using the same metrics that
 * ixgbe used; however, it should be known that this value has effectively been
 * cargo-culted across to yet another driver, sorry.
 *
 * If we receive a packet which is larger than our copy threshold, we'll create
 * a message block out of the DMA memory via desballoc(9F) and send that up to
 * MAC that way. This will cause us to be notified when the message block is
 * freed because it has been consumed, dropped, or otherwise. Otherwise, if it's
 * less than the threshold, we'll try to use allocb(9F) and bcopy it into the
 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
 * the behavior and always do a bcopy or a DMA bind.
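 *
 * For instance, with a copy threshold of 256 bytes (a hypothetical value; the
 * actual default is a tunable), a 1514-byte received frame would be wrapped
 * with desballoc(9F) and loaned up to MAC, while a 60-byte ARP reply would be
 * copied into a freshly allocated mblk_t so its DMA buffer can be rearmed
 * immediately.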
 *
 * To try and ensure that the device always has blocks that it can receive data
 * into, we maintain two lists of control blocks, a working list and a free
 * list. Each list is sized equal to the number of descriptors in the RX ring.
 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
 * equal to twice the number of descriptors in the ring and we assign them
 * equally to the free list and to the working list. Each control block also
 * has DMA memory allocated and associated with it, which it will use to
 * receive the actual packet data. All of a received frame's data will end up
 * in a single DMA buffer.
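 *
 * With the default ring size of 1024 descriptors, for example, that means
 * 2048 RX control blocks (and therefore 2048 pre-allocated RX DMA buffers)
 * per ring: 1024 on the working list and 1024 on the free list.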
 *
 * During operation, we always maintain the invariant that each RX descriptor
 * has an associated RX control block which lives in the working list. If we
 * feel that we should loan up DMA memory to MAC in the form of a message
 * block, we can only do so if we can maintain this invariant. To do that, we
 * swap in one of the buffers from the free list. If none are available, then
 * we resort to using allocb(9F) and bcopy(9F) on the packet instead,
 * regardless of the size.
 *
 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
 * called on the block, at which point we restore the RX control block to the
 * free list and are able to reuse the DMA memory again. While the scheme may
 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
 * the normal path of operation, even though we may still have to allocate
 * message blocks and copy.
 * The following state machine describes the lifetime of an RX control block.
 * In the diagram we abbreviate the RX ring descriptor entry as rxd and the RX
 * control block entry as rcb.
 *
 *             |                                   |
 *             * ... 1/2 of all initial rcb's  ... *
 *             |                                   |
 *             v                                   v
 *     +------------------+               +------------------+
 *     | rcb on free list |---*---------->| rcb on work list |
 *     +------------------+   .           +------------------+
 *             ^              . moved to          |
 *             |                replace rcb       * . . Frame received,
 *             |                loaned to         |     entry on free list
 *             |                MAC + co.         |     available. rcb's
 *             |                                  |     memory made into mblk_t
 *             * . freemsg(9F)                    |     and sent up to MAC.
 *             |   called on                      |
 *             |   loaned rcb                     |
 *             |   and it is                      v
 *             |   recycled.              +-------------------+
 *             +--------------------<-----| rcb loaned to MAC |
 *                                        +-------------------+
 *
 * Finally, note that every RX control block has a reference count on it. One
 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
 * called. If the GLDv3 mc_stop entry point is called because IP has been
 * unplumbed and no other DLPI consumers remain, then we'll decrement the
 * reference count by one. Whenever we loan up the RX control block and
 * associated buffer to MAC, we bump the reference count again. Even though the
 * device is stopped, there may still be loaned frames in upper levels that
 * we'll want to account for. Our callback from freemsg(9F)/freeb(9F) will take
 * care of making sure that it is cleaned up.
 *
 * --------------------
 * Managing the RX Ring
 * --------------------
 *
 * The receive ring descriptors are arranged in a circular buffer with a head
 * and tail pointer. There are both the conventional head and tail pointers
 * which are used to partition the ring into two portions, a portion that we,
 * the operating system, manage and a portion that is managed by hardware. When
 * hardware owns a descriptor in the ring, it means that it is waiting for data
 * to be filled in. However, when a portion of the ring is owned by the driver,
 * then that means that the descriptor has been consumed and we need to go take
 * a look at it.
 *
 * The initial head is configured to be zero by writing it as such in the
 * receive queue context in the FPM (function private memory from the host). The
 * initial tail is written to be the last descriptor. This is written to via the
 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
 * the only values we ever consult ourselves are the TAIL register and our own
 * state tracking. Effectively, we cache the HEAD register and then update it
 * ourselves based on our work.
 *
 * When we iterate over the RX descriptors and thus the received frames, we are
 * either in an interrupt context or we've been asked by MAC to poll on the
 * ring. If we've been asked to poll on the ring, we have a maximum number of
 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
 * exceed that count, then we do not process it. When in interrupt context, we
 * don't have a strict byte count. However, to ensure liveness, we limit the
 * amount of data based on a configuration value
 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
 * is based on similar numbers that are used for ixgbe. After some additional
 * time in the field, we'll have a sense as to whether or not it should be
 * changed.
 *
 * When processing, we start at our own HEAD pointer
 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
 * processing. Every RX descriptor has what's described as the DD bit. This bit
 * (the LSB of the second 8-byte word) indicates whether or not the descriptor
 * is done.  When we give descriptors to the hardware, this value is always
 * zero. When the hardware has finished a descriptor, it will always be one.
 *
 * The first thing that we check is whether the DD bit indicates that the
 * current HEAD is ready. If it isn't, then we're done. That's the primary
 * invariant of processing a frame. If it's done, then there are a few other
 * things that we want to look at. In the same status word as the DD bit, there
 * are two other important bits:
 *
 *   o End of Packet (EOP)
 *   o Error bits
 *
 * The end of packet indicates that we have reached the last descriptor. Now,
 * you might ask when would there be more than one descriptor. The reason for
 * that might be due to large receive offload (lro) or header splitting
 * functionality, which presently isn't supported in the driver. The error bits
 * in the frame are only valid when EOP is set.
 *
 * If error bits are set on the frame, then we still consume it; however, we
 * will not generate an mblk_t to send up to MAC. If there are no error bits
 * set, then we'll consume the descriptor either using bcopy or DMA binding. See
 * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
 * on how that selection is made.
 *
 * Regardless of whether we construct an mblk_t or encounter an error, we end up
 * resetting the descriptor. This re-arms the descriptor for hardware and in the
 * process, we may end up assigning it a new receive control block. After we do
 * this, we always update our HEAD pointer, no matter what.
 *
 * Finally, once we've consumed as much as we will in a given window, we go and
 * update the TAIL register to indicate all the frames we've consumed. We only
 * do a single bulk write for the ring.
 *
 * ---------------------------------
 * TX Descriptors and Control Blocks
 * ---------------------------------
 *
 * While the transmit path is similar in spirit to the receive path, it works
 * differently due to the fact that all data is originated by the operating
 * system and not by the device.
 *
 * Like RX, there is a descriptor ring that we use to communicate with the
 * hardware and which points to the memory used to transmit a frame.
 * Similarly, there is a corresponding transmit control block; however, the
 * correspondence between descriptors and control blocks is more complex and
 * not necessarily 1-to-1.
 *
 * The driver is asked to process a single frame at a time. That message block
 * may be made up of multiple fragments linked together by the mblk_t`b_cont
 * member. The device has a hard limit of up to 8 buffers being allowed for use
 * for a single non-LSO packet or LSO segment. The number of TX ring entries
 * (and thus TX control blocks) used depends on the fragment sizes and DMA
 * layout, as explained below.
 *
 * We alter our DMA strategy based on a threshold tied to the fragment size.
 * This threshold is configurable via the tx_dma_threshold property. If the
 * fragment is above the threshold, we DMA bind it -- consuming one TCB and
 * potentially several data descriptors. The exact number of descriptors (equal
 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
 * into page, b_wptr offset into page, and the physical layout of the dblk's
 * memory (contiguous or not). Essentially, we are at the mercy of the DMA
 * engine and the dblk's memory allocation. Knowing the exact number of
 * descriptors up front is a task best not taken on by the driver itself.
 * Instead, we attempt to DMA bind the fragment and verify the descriptor
 * layout meets hardware constraints. If the proposed DMA bind does not satisfy
 * the hardware constraints, then we discard it and instead copy the entire
 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
 * larger than the TCB buffer).
 *
 * If the fragment is below or at the threshold, we copy it to the
 * pre-allocated buffer of a TCB. We compress consecutive copy fragments into a
 * single TCB to conserve resources. We are guaranteed that the TCB buffer is
 * made up of only one DMA cookie and therefore consumes only one descriptor on
 * the controller.
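 *
 * For example (illustrative only), an mblk_t chain consisting of a 14-byte
 * Ethernet header fragment followed by a 40-byte TCP/IP header fragment,
 * both under the copy threshold, would be copied back-to-back into one TCB
 * buffer and consume a single data descriptor rather than two.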
 *
 * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
 * filtering, then the TX data descriptors must be preceded by a single TX
 * context descriptor.  Because there is no DMA transfer associated with the
 * context descriptor, we allocate a control block with a special type which
 * indicates to the TX ring recycle code that there are no associated DMA
 * resources to unbind when the control block is freed.
 *
 * If we don't have enough space in the ring or TX control blocks available,
 * then we'll return the unprocessed message block to MAC. This will induce flow
 * control and once we recycle enough entries, we'll once again enable sending
 * on the ring.
 *
 * We size the working list as equal to the number of descriptors in the ring.
 * We size the free list as equal to 1.5 times the number of descriptors in the
 * ring. We'll allocate a number of TX control block entries equal to the number
 * of entries in the free list. By default, all entries are placed in the free
 * list. As we come along and try to send something, we'll allocate entries from
 * the free list and add them to the working list, where they'll stay until the
 * hardware indicates that all of the data has been written back to us. The
 * reason that we start with 1.5x is to help facilitate having more than one TX
 * buffer associated with the DMA activity.
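 *
 * For example, with the default TX ring size of 1024 descriptors, the work
 * list holds 1024 entries, while the free list and the pool of TX control
 * blocks each hold 1536 entries (1.5 x 1024).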
 *
 * --------------------
 * Managing the TX Ring
 * --------------------
 *
 * The transmit descriptor ring is driven by us. We maintain our own notion of a
 * HEAD and TAIL register and we update the hardware with updates to the TAIL
 * register. When the hardware is done writing out data, it updates us by
 * writing back to a specific address, not by updating the individual
 * descriptors. That address is a 4-byte region after the main transmit
 * descriptor ring. This is why the descriptor ring has an extra descriptor's
 * worth allocated to it.
 *
 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
 * points in time, through both interrupts and our own internal checks, we'll
 * sync the write-back head portion of the DMA space. Based on the index it
 * reports back, we'll free everything between our current HEAD and the
 * indicated index and update HEAD to the new index.
 *
 * When a frame comes in, we try to use a number of transmit control blocks,
 * transitioning them from the free list to the work list. Each one is placed
 * at the work list entry that corresponds to its transmit descriptor. Once
 * hardware indicates that the corresponding descriptor has been freed, we'll
 * return the control block to the free list.
 *
 * The transmit control block free list is managed by keeping track of the
 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
 * index into the free list and add things to it. In effect, we always push and
 * pop from the tail and protect it with a single lock,
 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
 * stand up to further performance testing; however, it does allow us to get off
 * the ground with the device driver.
 *
 * The following image describes where a given transmit control block lives in
 * its lifetime:
 *
 *             |
 *             * ... Initial placement for all tcb's
 *             |
 *             v
 *    +------------------+                       +------------------+
 *    | tcb on free list |---*------------------>| tcb on work list |
 *    +------------------+   .                   +------------------+
 *             ^             . N tcbs allocated[1]         |
 *             |               to send frame               v
 *             |               or fragment on              |
 *             |               wire, mblk from             |
 *             |               MAC associated.             |
 *             |                                           |
 *             +------*-------------------------------<----+
 *                    .
 *                    . Hardware indicates
 *                      entry transmitted.
 *                      tcbs recycled, mblk
 *                      from MAC freed.
 *
 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
 *     descriptor plus 1 data descriptor, in the non-DMA-bind case.  In the DMA
 *     bind case, N can be 1 context descriptor plus 1 data descriptor per
 *     b_cont in the mblk.  In this case, the mblk is associated with the first
 *     data descriptor and freed as part of freeing that data descriptor.
 *
 * ------------
 * Blocking MAC
 * ------------
 *
 * When performing transmit, we can run out of ring descriptors or TX control
 * blocks. When such a case happens, we return the mblk_t to MAC to indicate
 * that we've been blocked. At that point in time, MAC becomes blocked and will
 * not transmit anything out that specific ring until we notify MAC. To
 * indicate that we're in such a situation we set the
 * i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
 *
 * When we recycle TX descriptors, we'll end up signaling MAC by calling
 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
 * start sending frames out to us again.
 */

/*
 * We set our DMA alignment requests based on the smallest supported page size
 * of the corresponding platform.
 */
#if	defined(__sparc)
#define	I40E_DMA_ALIGNMENT 0x2000ull
#elif defined(__x86)
#define	I40E_DMA_ALIGNMENT 0x1000ull
#else
#error	"unknown architecture for i40e"
#endif

/*
 * This structure is used to maintain information and flags related to
 * transmitting a frame.  These fields are ultimately used to construct the
 * TX data descriptor(s) and, if necessary, the TX context descriptor.
 */
typedef struct i40e_tx_context {
	enum i40e_tx_desc_cmd_bits	itc_data_cmdflags;
	uint32_t			itc_data_offsets;
	enum i40e_tx_ctx_desc_cmd_bits	itc_ctx_cmdflags;
	uint32_t			itc_ctx_tsolen;
	uint32_t			itc_ctx_mss;
} i40e_tx_context_t;

/*
 * Toggles on debug builds which can be used to override our RX behaviour based
 * on thresholds.
 */
#ifdef	DEBUG
typedef enum {
	I40E_DEBUG_RX_DEFAULT	= 0,
	I40E_DEBUG_RX_BCOPY	= 1,
	I40E_DEBUG_RX_DMABIND	= 2
} i40e_debug_rx_t;

i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* DEBUG */

/*
 * Notes on the following DMA attributes. The first attribute,
 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
 * and the static buffers that we associate with control blocks. For this
 * reason, we force an SGL length of one. While technically the driver supports
 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
 * management here. In addition, when the Intel common code wants to allocate
 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
 * the static dma attr.
 *
 * The latter two sets of attributes are what we use when we're binding a
 * bunch of mblk_t fragments to go out the door. Note that the main difference
 * here is that we're allowed a larger SGL length.  For non-LSO TX, we
 * restrict the SGL length to match the number of TX buffers available to the
 * PF (8).  For the LSO case we can go much larger, with the caveat that each
 * MSS-sized chunk (segment) must not span more than 8 data descriptors and
 * hence must not span more than 8 cookies.
 *
 * Note, we default to setting ourselves to be DMA capable here. However,
 * because we could have multiple instances which have different FMA error
 * checking capabilities, or end up on different buses, we make these static
 * and const and copy them into the i40e_t for the given device with the actual
 * values that reflect the actual capabilities.
 */
static const ddi_dma_attr_t i40e_g_static_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	0x00000000FFFFFFFFull,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	1,				/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_LSO_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

/*
 * Next, we have the attributes for these structures. The descriptor rings are
 * all strictly little endian, while the data buffers are just arrays of bytes
 * representing frames. Because of this, we purposefully simplify the driver
 * programming life by programming the descriptor ring as little endian, while
 * for the buffer data we keep it as unstructured.
 *
 * Note that to keep the Intel common code operating in a reasonable way, when
 * we allocate DMA memory for it, we do not use byte swapping and thus use the
 * standard i40e_buf_acc_attr.
 */
static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,
	DDI_STRICTORDER_ACC
};

static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};

/*
 * The next two functions are designed to be type-safe versions of macros that
 * are used to increment and decrement a descriptor index in the loop. Note,
 * these are marked inline to try and keep the data path hot and they were
 * effectively inlined in their previous life as macros.
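 *
 * For example, with a 1024-entry ring (the default), i40e_next_desc(1022, 4,
 * 1024) wraps around and returns 2, while i40e_prev_desc(1, 3, 1024) returns
 * 1022.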
 */
static inline int
i40e_next_desc(int base, int count, int size)
{
	int out;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	if (base + count < size) {
		out = base + count;
	} else {
		out = base + count - size;
	}

	ASSERT(out >= 0 && out < size);
	return (out);
}

static inline int
i40e_prev_desc(int base, int count, int size)
{
	int out;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	if (base >= count) {
		out = base - count;
	} else {
		out = base - count + size;
	}

	ASSERT(out >= 0 && out < size);
	return (out);
}

/*
 * Free DMA memory that is represented by a i40e_dma_buffer_t.
 */
static void
i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
{
	if (dmap->dmab_dma_address != 0) {
		VERIFY(dmap->dmab_dma_handle != NULL);
		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
		dmap->dmab_dma_address = 0;
		dmap->dmab_size = 0;
	}

	if (dmap->dmab_acc_handle != NULL) {
		ddi_dma_mem_free(&dmap->dmab_acc_handle);
		dmap->dmab_acc_handle = NULL;
		dmap->dmab_address = NULL;
	}

	if (dmap->dmab_dma_handle != NULL) {
		ddi_dma_free_handle(&dmap->dmab_dma_handle);
		dmap->dmab_dma_handle = NULL;
	}

	/*
	 * These should only be set if we have valid handles allocated and
	 * therefore should always be NULLed out due to the above code. This
	 * is here to catch us acting sloppy.
	 */
	ASSERT(dmap->dmab_dma_address == 0);
	ASSERT(dmap->dmab_address == NULL);
	ASSERT(dmap->dmab_size == 0);
	dmap->dmab_len = 0;
}

/*
 * Allocate size bytes of DMA memory based on the passed in attributes. This
 * fills in the information in dmap and is designed for all of our single cookie
 * allocations.
 */
static boolean_t
i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
    ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
    boolean_t zero, size_t size)
{
	int ret;
	uint_t flags;
	size_t len;
	ddi_dma_cookie_t cookie;
	uint_t ncookies;

	if (stream == B_TRUE)
		flags = DDI_DMA_STREAMING;
	else
		flags = DDI_DMA_CONSISTENT;

	/*
	 * Step one: Allocate the DMA handle
	 */
	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
	    NULL, &dmap->dmab_dma_handle);
	if (ret != DDI_SUCCESS) {
		i40e_error(i40e, "failed to allocate dma handle for I/O "
		    "buffers: %d", ret);
		dmap->dmab_dma_handle = NULL;
		return (B_FALSE);
	}

	/*
	 * Step two: Allocate the DMA memory
	 */
	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
	    &dmap->dmab_acc_handle);
	if (ret != DDI_SUCCESS) {
		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
		    "buffers", size);
		dmap->dmab_address = NULL;
		dmap->dmab_acc_handle = NULL;
		i40e_free_dma_buffer(dmap);
		return (B_FALSE);
	}

	/*
	 * Step three: Optionally zero
	 */
	if (zero == B_TRUE)
		bzero(dmap->dmab_address, len);

	/*
	 * Step four: Bind the memory
	 */
	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
	    NULL, &cookie, &ncookies);
	if (ret != DDI_DMA_MAPPED) {
		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
		    "buffers: %d", size, ret);
		i40e_free_dma_buffer(dmap);
		return (B_FALSE);
	}

	VERIFY(ncookies == 1);
	dmap->dmab_dma_address = cookie.dmac_laddress;
	dmap->dmab_size = len;
	dmap->dmab_len = 0;
	return (B_TRUE);
}

/*
 * This function is called once the last pending rcb has been freed by the upper
 * levels of the system.
 */
static void
i40e_free_rx_data(i40e_rx_data_t *rxd)
{
	VERIFY(rxd->rxd_rcb_pending == 0);

	if (rxd->rxd_rcb_area != NULL) {
		kmem_free(rxd->rxd_rcb_area,
		    sizeof (i40e_rx_control_block_t) *
		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
		rxd->rxd_rcb_area = NULL;
	}

	if (rxd->rxd_free_list != NULL) {
		kmem_free(rxd->rxd_free_list,
		    sizeof (i40e_rx_control_block_t *) *
		    rxd->rxd_free_list_size);
		rxd->rxd_free_list = NULL;
	}

	if (rxd->rxd_work_list != NULL) {
		kmem_free(rxd->rxd_work_list,
		    sizeof (i40e_rx_control_block_t *) *
		    rxd->rxd_ring_size);
		rxd->rxd_work_list = NULL;
	}

	kmem_free(rxd, sizeof (i40e_rx_data_t));
}

static boolean_t
i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
{
	i40e_rx_data_t *rxd;

	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
	if (rxd == NULL)
		return (B_FALSE);
	itrq->itrq_rxdata = rxd;
	rxd->rxd_i40e = i40e;

	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;

	rxd->rxd_rcb_free = rxd->rxd_free_list_size;

	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
	    rxd->rxd_ring_size, KM_NOSLEEP);
	if (rxd->rxd_work_list == NULL) {
		i40e_error(i40e, "failed to allocate RX work list for a ring "
		    "of %d entries for ring %d", rxd->rxd_ring_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
	    rxd->rxd_free_list_size, KM_NOSLEEP);
	if (rxd->rxd_free_list == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry RX free list "
		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
		goto cleanup;
	}

	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
	if (rxd->rxd_rcb_area == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	return (B_TRUE);

cleanup:
	i40e_free_rx_data(rxd);
	itrq->itrq_rxdata = NULL;
	return (B_FALSE);
}

/*
 * Free all of the memory that we've allocated for DMA. Note that we may have
 * buffers that we've loaned up to the OS which are still outstanding. We'll
 * always free up the descriptor ring, because we no longer need that. For each
 * rcb, we'll iterate over it and if we drop the reference count to zero, then
 * we'll free the message block and DMA related resources. However, if we don't
 * drop the last reference, then we'll go ahead and keep track that we have
 * pending data and clean it up when we get there.
 */
static void
i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
{
	uint32_t i, count, ref;

	i40e_rx_control_block_t *rcb;
	i40e_t *i40e = rxd->rxd_i40e;

	i40e_free_dma_buffer(&rxd->rxd_desc_area);
	rxd->rxd_desc_ring = NULL;
	rxd->rxd_desc_next = 0;

	mutex_enter(&i40e->i40e_rx_pending_lock);

	rcb = rxd->rxd_rcb_area;
	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;

	for (i = 0; i < count; i++, rcb++) {
		VERIFY(rcb != NULL);

		/*
		 * If we're cleaning up from a failed creation attempt, then an
		 * entry may never have been assembled which would mean that
		 * its reference count is zero. If we find that, we leave it
		 * be, because nothing else should be modifying it at this
		 * point. We're not at the point that any more references can be
		 * added, just removed.
		 */
		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
			continue;

		ref = atomic_dec_32_nv(&rcb->rcb_ref);
		if (ref == 0) {
			freemsg(rcb->rcb_mp);
			rcb->rcb_mp = NULL;
			i40e_free_dma_buffer(&rcb->rcb_dma);
		} else {
			atomic_inc_32(&rxd->rxd_rcb_pending);
			atomic_inc_32(&i40e->i40e_rx_pending);
		}
	}
	mutex_exit(&i40e->i40e_rx_pending_lock);
}

/*
 * Initialize the DMA memory for the descriptor ring and for each frame in the
 * control block list.
 */
static