/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

#include "i40e_sw.h"

/*
 * ---------------------------------------------------------
 * Buffer and Memory Management, Receiving, and Transmitting
 * ---------------------------------------------------------
 *
 * Each physical function (PF), which is what we think of as an instance of the
 * device driver, has a series of associated transmit and receive queue pairs.
 * Effectively, these are what we think of in MAC as rings. Each of these has
 * its own ring of descriptors which is used as part of doing DMA activity.
 *
 * The transmit ring of descriptors is made up of 16-byte entries which are
 * used to send packets, program filters, etc. The receive ring of descriptors
 * uses entries that are either 16 or 32 bytes each. At the moment, we opt to
 * use the larger descriptor format so that we're in a better position if we
 * ever want to leverage that information later on.
 *
 * However, these rings are just for descriptors, they don't talk or deal with
 * how we actually store the memory that we need for DMA or the associated
 * information that we need for keeping track of message blocks. To correspond
 * to the hardware descriptor ring which is how we communicate with hardware, we
 * introduce a control block which keeps track of our required metadata like DMA
 * mappings.
 *
 * There are two main considerations that dictate how much memory and buffers
 * we end up allocating. Those are:
 *
 *   o The size of the ring (controlled through the driver.conf file)
 *
 *   o The maximum size frame we can receive.
 *
 * The size of the rings currently defaults to 1024 descriptors and is stored
 * in i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
 *
 * While the size of the rings is controlled by the driver.conf, the maximum
 * size frame is informed primarily through the use of dladm and the setting of
 * the MTU property on the device. From the MTU, we then go and do some
 * machinations. The first thing we do is add in space for the Ethernet header,
 * potentially a VLAN header, and the FCS. This value is what's stored as
 * i40e_t`i40e_frame_max and is derived any time i40e_t`i40e_sdu changes.
 *
 * This size is then rounded up to the nearest 1k chunk, which represents the
 * actual amount of memory that we'll allocate for a single frame.
 *
 * Note that for RX, we do something that might be unexpected. We always add
 * an extra two bytes to the frame size that we allocate. We then offset the DMA
 * address that we receive a packet into by two bytes. This ensures that the IP
 * header will always be 4 byte aligned because the MAC header is either 14 or
 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
 * and MAC's lives easier.
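 *
 * As a rough sketch of that sizing (the names here are purely illustrative
 * and not the exact symbols the driver uses), the per-frame buffer size is
 * derived along these lines:
 *
 *	frame_max = mtu + ether_header_len + vlan_tag_len + fcs_len;
 *	buf_size = ROUNDUP(frame_max + 2, 1024);
 *
 * where the extra two bytes are the IP header alignment offset described
 * above and the round-up to the next 1k boundary yields the amount of DMA
 * memory allocated per frame.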
 *
 * Both the RX and TX descriptor rings (which are what we use to communicate
 * with hardware) are allocated as a single region of DMA memory which is the
 * size of the descriptor (32 bytes and 16 bytes respectively) times the total
 * number of descriptors for an RX and TX ring.
 *
 * While the RX and TX descriptors are allocated using DMA-based memory, the
 * control blocks for each of them are allocated using normal kernel memory.
 * They aren't special from a DMA perspective. We'll go over the design of both
 * receiving and transmitting separately, as they have slightly different
 * control blocks and different ways that we manage the relationship between
 * control blocks and descriptors.
 *
 * ---------------------------------
 * RX Descriptors and Control Blocks
 * ---------------------------------
 *
 * For every descriptor in the ring that the driver has, we need some associated
 * memory, which means that we need to have the receive specific control block.
 * We have a couple different, but related goals:
 *
 *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
 *     not want to do any additional memory allocations or DMA allocations if
 *     we don't have to.
 *
 *   o We'd like to try and do as much zero-copy as possible, while taking into
 *     account the cost of mapping in DMA resources.
 *
 *   o We'd like to have every receive descriptor available.
 *
 * Now, these rules are a bit in tension with one another. The act of DMA
 * binding is an exercise in trying to find the break-even point between page
 * table updates and bcopy. We currently start by using the same metrics that
 * ixgbe used; however, it should be known that this value has effectively been
 * cargo-culted across to yet another driver, sorry.
 *
 * If we receive a packet which is larger than our copy threshold, we'll create
 * a message block out of the DMA memory via desballoc(9F) and send that up to
 * MAC that way. This will cause us to be notified when the message block is
 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
 * it's less than the threshold, we'll try to use allocb and bcopy it into the
 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
 * the behavior and always do a bcopy or a DMA bind.
 *
 * To try and ensure that the device always has blocks that it can receive data
 * into, we maintain two lists of control blocks, a working list and a free
 * list. Each list is sized equal to the number of descriptors in the RX ring.
 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
 * equal to twice the number of descriptors in the ring and we assign them
 * equally to the free list and to the working list. Each control block also has
 * DMA memory allocated and associated with it, which will be used to receive
 * the actual packet data. All of a received frame's data will end up in a
 * single DMA buffer.
 *
 * During operation, we always maintain the invariant that each RX descriptor
 * has an associated RX control block which lives in the working list. If we
 * feel that we should loan up DMA memory to MAC in the form of a message block,
 * we can only do so if we can maintain this invariant. To do that, we swap in
 * one of the buffers from the free list. If none are available, then we resort
 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
 * size.
 *
 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
 * called on the block, at which point we restore the RX control block to the
 * free list and are able to reuse the DMA memory again. While the scheme may
 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
 * the normal path of operation, even though we may still have to allocate
 * message blocks and copy.
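 *
 * In rough pseudo-code (a sketch only; the real code also has to handle
 * statistics and failure paths), the per-packet decision described above is:
 *
 *	if (frame length > copy threshold && free list is non-empty) {
 *		swap a replacement rcb from the free list into the ring;
 *		loan the old rcb's memory up to MAC via desballoc(9F);
 *	} else {
 *		allocb(9F) a new mblk_t and bcopy(9F) the frame into it,
 *		leaving the rcb in place for immediate reuse;
 *	}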
 *
 * The following state machine describes the life time of an RX control block.
 * In the diagram we abbreviate the RX ring descriptor entry as rxd and the RX
 * control block entry as rcb.
 *
 *             |                                   |
 *             * ... 1/2 of all initial rcb's  ... *
 *             |                                   |
 *             v                                   v
 *     +------------------+               +------------------+
 *     | rcb on free list |---*---------->| rcb on work list |
 *     +------------------+   .           +------------------+
 *             ^              . moved to          |
 *             |                replace rcb       * . . Frame received,
 *             |                loaned to         |     entry on free list
 *             |                MAC + co.         |     available. rcb's
 *             |                                  |     memory made into mblk_t
 *             * . freemsg(9F)                    |     and sent up to MAC.
 *             |   called on                      |
 *             |   loaned rcb                     |
 *             |   and it is                      v
 *             |   recycled.              +-------------------+
 *             +--------------------<-----| rcb loaned to MAC |
 *                                        +-------------------+
 *
 * Finally, note that every RX control block has a reference count on it. One
 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
 * called. When the GLDv3 mc_stop entry point is called, meaning IP has been
 * unplumbed and no other DLPI consumers remain, we'll decrement the reference
 * count by one. Whenever we loan up the RX control block and associated buffer
 * to MAC, then we bump the reference count again. Even though the device is
 * stopped, there may still be loaned frames in upper levels that we'll want to
 * account for. Our callback from freemsg(9F)/freeb(9F) will take care of
 * making sure that it is cleaned up.
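 *
 * A minimal sketch of that accounting (illustrative rather than literal):
 * the driver holds one reference while mc_start is in effect, each loaned
 * rcb holds one more, and the freemsg(9F)/freeb(9F) callback effectively
 * does:
 *
 *	if (atomic_dec_32_nv(&rcb->rcb_ref) == 0)
 *		tear down the rcb's DMA memory (the ring is going away);
 *	else
 *		return the rcb to the free list for reuse;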
 *
 * --------------------
 * Managing the RX Ring
 * --------------------
 *
 * The receive ring descriptors are arranged in a circular buffer with a head
 * and tail pointer. There are both the conventional head and tail pointers
 * which are used to partition the ring into two portions, a portion that we,
 * the operating system, manage and a portion that is managed by hardware. When
 * hardware owns a descriptor in the ring, it means that it is waiting for data
 * to be filled in. However, when a portion of the ring is owned by the driver,
 * then that means that the descriptor has been consumed and we need to go take
 * a look at it.
 *
 * The initial head is configured to be zero by writing it as such in the
 * receive queue context in the FPM (function private memory from the host). The
 * initial tail is written to be the last descriptor. This is written to via the
 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
 * the only values we ever consult ourselves are the TAIL register and our own
 * state tracking. Effectively, we cache the HEAD register and then update it
 * ourselves based on our work.
 *
 * When we iterate over the RX descriptors and thus the received frames, we are
 * either in an interrupt context or we've been asked by MAC to poll on the
 * ring. If we've been asked to poll on the ring, we have a maximum number of
 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
 * exceed that count, then we do not process it. When in interrupt context, we
 * don't have a strict byte count. However, to ensure liveness, we limit the
 * amount of data based on a configuration value
 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
 * is based on similar numbers that are used for ixgbe. After some additional
 * time in the field, we'll have a sense as to whether or not it should be
 * changed.
 *
 * When processing, we start at our own HEAD pointer
 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
 * processing. Every RX descriptor has what's described as the DD bit. This bit
 * (the LSB of the second 8-byte word) indicates whether or not the descriptor
 * is done.  When we give descriptors to the hardware, this value is always
 * zero. When the hardware has finished a descriptor, it will always be one.
 *
 * The first thing that we check is whether the DD bit indicates that the
 * current HEAD is ready. If it isn't, then we're done. That's the primary
 * invariant of processing a frame. If it's done, then there are a few other
 * things that we want to look at. In the same status word as the DD bit, there
 * are two other important bits:
 *
 *   o End of Packet (EOP)
 *   o Error bits
 *
 * The end of packet indicates that we have reached the last descriptor. Now,
 * you might ask when there would be more than one descriptor. The reason for
 * that might be due to large receive offload (lro) or header splitting
 * functionality, which presently isn't supported in the driver. The error bits
 * in the frame are only valid when EOP is set.
 *
 * If error bits are set on the frame, then we still consume it; however, we
 * will not generate an mblk_t to send up to MAC. If there are no error bits
 * set, then we'll consume the descriptor either using bcopy or DMA binding. See
 * the earlier section 'RX Descriptors and Control Blocks' for more information
 * on how that selection is made.
 *
 * Regardless of whether we construct an mblk_t or encounter an error, we end up
 * resetting the descriptor. This re-arms the descriptor for hardware and in the
 * process, we may end up assigning it a new receive control block. After we do
 * this, we always update our HEAD pointer, no matter what.
 *
 * Finally, once we've consumed as much as we will in a given window, we go and
 * update the TAIL register to indicate all the frames we've consumed. We only
 * do a single bulk write for the ring.
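 *
 * Pulling the pieces above together, the per-ring processing is roughly (a
 * sketch of the flow, not the literal code):
 *
 *	while (descriptor at rxd_desc_next has its DD bit set &&
 *	    the poll byte limit / interrupt limit has not been hit) {
 *		if (EOP is set and no error bits are set)
 *			build an mblk_t via bcopy or DMA bind;
 *		re-arm the descriptor, possibly with a new rcb;
 *		advance rxd_desc_next;
 *	}
 *	write the final position to I40E_QRX_TAIL() once, at the end;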
 *
 * ---------------------------------
 * TX Descriptors and Control Blocks
 * ---------------------------------
 *
 * While the transmit path is similar in spirit to the receive path, it works
 * differently due to the fact that all data is originated by the operating
 * system and not by the device.
 *
 * Like RX, there is a descriptor ring that we use to communicate with the
 * hardware and which points to the memory used to transmit a frame.  Similarly,
 * there is a corresponding transmit control block, however, the correspondence
 * between descriptors and control blocks is more complex and not necessarily
 * 1-to-1.
 *
 * The driver is asked to process a single frame at a time. That message block
 * may be made up of multiple fragments linked together by the mblk_t`b_cont
 * member. The device has a hard limit of up to 8 buffers being allowed for use
 * for a single non-LSO packet or LSO segment. The number of TX ring entries
 * (and thus TX control blocks) used depends on the fragment sizes and DMA
 * layout, as explained below.
 *
 * We alter our DMA strategy based on a threshold tied to the fragment size.
 * This threshold is configurable via the tx_dma_threshold property. If the
 * fragment is above the threshold, we DMA bind it -- consuming one TCB and
 * potentially several data descriptors. The exact number of descriptors (equal
 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
 * into page, b_wptr offset into page, and the physical layout of the dblk's
 * memory (contiguous or not). Essentially, we are at the mercy of the DMA
 * engine and the dblk's memory allocation. Knowing the exact number of
 * descriptors up front is a task best not taken on by the driver itself.
 * Instead, we attempt to DMA bind the fragment and verify the descriptor
 * layout meets hardware constraints. If the proposed DMA bind does not satisfy
 * the hardware constraints, then we discard it and instead copy the entire
 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
 * larger than the TCB buffer).
 *
 * If the fragment is below or at the threshold, we copy it to the pre-allocated
 * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
 * conserve resources. We are guaranteed that the TCB buffer is made up of only
 * one DMA cookie and therefore consumes only one descriptor on the controller.
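 *
 * Expressed as a sketch (again illustrative, not the literal code), the
 * per-fragment choice is:
 *
 *	for (each mblk_t fragment in the frame) {
 *		if (fragment length > tx_dma_threshold &&
 *		    a DMA bind fits within the hardware limits)
 *			use the bind: one tcb, one descriptor per cookie;
 *		else
 *			bcopy into a tcb's pre-allocated buffer, appending
 *			to the previous copy tcb where possible;
 *	}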
 *
 * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
 * filtering, then the TX data descriptors must be preceded by a single TX
 * context descriptor.  Because there is no DMA transfer associated with the
 * context descriptor, we allocate a control block with a special type which
 * indicates to the TX ring recycle code that there are no associated DMA
 * resources to unbind when the control block is freed.
 *
 * If we don't have enough space in the ring or TX control blocks available,
 * then we'll return the unprocessed message block to MAC. This will induce flow
 * control and once we recycle enough entries, we'll once again enable sending
 * on the ring.
 *
 * We size the working list as equal to the number of descriptors in the ring.
 * We size the free list as equal to 1.5 times the number of descriptors in the
 * ring. We'll allocate a number of TX control block entries equal to the number
 * of entries in the free list. By default, all entries are placed in the free
 * list. As we come along and try to send something, we'll allocate entries from
 * the free list and add them to the working list, where they'll stay until the
 * hardware indicates that all of the data has been written back to us. The
 * reason that we start with 1.5x is to help facilitate having more than one TX
 * buffer associated with the DMA activity.
 *
 * --------------------
 * Managing the TX Ring
 * --------------------
 *
 * The transmit descriptor ring is driven by us. We maintain our own notion of a
 * HEAD and TAIL register and we update the hardware with updates to the TAIL
 * register. When the hardware is done writing out data, it updates us by
 * writing back to a specific address, not by updating the individual
 * descriptors. That address is a 4-byte region after the main transmit
 * descriptor ring. This is why the descriptor ring has an extra descriptor's
 * worth allocated to it.
 *
 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
 * points in time, through both interrupts and our own internal checks, we'll
 * sync the write-back head portion of the DMA space. Based on the index it
 * reports back, we'll free everything between our current HEAD and the
 * indicated index and update HEAD to the new index.
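 *
 * A sketch of that recycle step (illustrative only):
 *
 *	sync the write-back area for the ring;
 *	wbhead = value written back by hardware;
 *	while (itrq_desc_head != wbhead) {
 *		recycle the tcb tied to that descriptor, unbinding any DMA
 *		and freeing the associated mblk_t on the final one;
 *		advance itrq_desc_head;
 *	}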
 *
 * When a frame comes in, we try to use a number of transmit control blocks and
 * we'll transition them from the free list to the work list. They'll get moved
 * to the entry on the work list that corresponds to the transmit descriptor
 * they are used for. Once we are indicated that the corresponding descriptor
 * has been freed, we'll return it to the list.
 *
 * The transmit control block free list is managed by keeping track of the
 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
 * index into the free list and add things to it. In effect, we always push and
 * pop from the tail and protect it with a single lock,
 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
 * stand up to further performance testing; however, it does allow us to get off
 * the ground with the device driver.
 *
 * The following image describes where a given transmit control block lives in
 * its lifetime:
 *
 *             |
 *             * ... Initial placement for all tcb's
 *             |
 *             v
 *    +------------------+                       +------------------+
 *    | tcb on free list |---*------------------>| tcb on work list |
 *    +------------------+   .                   +------------------+
 *             ^             . N tcbs allocated[1]         |
 *             |               to send frame               v
 *             |               or fragment on              |
 *             |               wire, mblk from             |
 *             |               MAC associated.             |
 *             |                                           |
 *             +------*-------------------------------<----+
 *                    .
 *                    . Hardware indicates
 *                      entry transmitted.
 *                      tcbs recycled, mblk
 *                      from MAC freed.
 *
 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
 *     descriptor plus 1 data descriptor, in the non-DMA-bind case.  In the DMA
 *     bind case, N can be 1 context descriptor plus 1 data descriptor per
 *     b_cont in the mblk.  In this case, the mblk is associated with the first
 *     data descriptor and freed as part of freeing that data descriptor.
 *
 * ------------
 * Blocking MAC
 * ------------
 *
 * When performing transmit, we can run out of descriptors and ring entries.
 * When such a case happens, we return the mblk_t to MAC to indicate that we've
 * been blocked. At that point in time, MAC becomes blocked and will not
 * transmit anything out that specific ring until we notify MAC. To indicate
 * that we're in such a situation we set the i40e_trqpair_t`itrq_tx_blocked
 * member to B_TRUE.
 *
 * When we recycle TX descriptors then we'll end up signaling MAC by calling
 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
 * start sending frames out to us again.
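 *
 * In other words (sketch only):
 *
 *	transmit path:	if not enough descriptors or tcbs are free,
 *			set itrq_tx_blocked = B_TRUE and return the mblk_t;
 *	recycle path:	if itrq_tx_blocked and enough entries are now free,
 *			clear it and call mac_tx_ring_update() for the ring;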
 */

/*
 * We set our DMA alignment requests based on the smallest supported page size
 * of the corresponding platform.
 */
#if	defined(__sparc)
#define	I40E_DMA_ALIGNMENT 0x2000ull
#elif defined(__x86)
#define	I40E_DMA_ALIGNMENT 0x1000ull
#else
#error	"unknown architecture for i40e"
#endif

/*
 * This structure is used to maintain information and flags related to
 * transmitting a frame.  These fields are ultimately used to construct the
 * TX data descriptor(s) and, if necessary, the TX context descriptor.
 */
typedef struct i40e_tx_context {
	enum i40e_tx_desc_cmd_bits	itc_data_cmdflags;
	uint32_t			itc_data_offsets;
	enum i40e_tx_ctx_desc_cmd_bits	itc_ctx_cmdflags;
	uint32_t			itc_ctx_tsolen;
	uint32_t			itc_ctx_mss;
} i40e_tx_context_t;

/*
 * Toggles on debug builds which can be used to override our RX behaviour based
 * on thresholds.
 */
#ifdef	DEBUG
typedef enum {
	I40E_DEBUG_RX_DEFAULT	= 0,
	I40E_DEBUG_RX_BCOPY	= 1,
	I40E_DEBUG_RX_DMABIND	= 2
} i40e_debug_rx_t;

i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* DEBUG */

/*
 * Notes on the following DMA attributes. The first attribute,
 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
 * and the static buffers that we associate with control blocks. For this
 * reason, we force an SGL length of one. While technically the driver supports
 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
 * management here. In addition, when the Intel common code wants to allocate
 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
 * the static dma attr.
 *
 * The latter two sets of attributes are what we use when we're binding a
 * bunch of mblk_t fragments to go out the door. Note that the main difference
 * here is that we're allowed a larger SGL length.  For non-LSO TX, we
 * restrict the SGL length to match the number of TX buffers available to the
 * PF (8).  For the LSO case we can go much larger, with the caveat that each
 * MSS-sized chunk (segment) must not span more than 8 data descriptors and
 * hence must not span more than 8 cookies.
 *
 * Note, we default to setting ourselves to be DMA capable here. However,
 * because we could have multiple instances which have different FMA error
 * checking capabilities, or end up on different buses, we make these static
 * and const and copy them into the i40e_t for the given device with the actual
 * values that reflect the actual capabilities.
 */
static const ddi_dma_attr_t i40e_g_static_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	0x00000000FFFFFFFFull,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	1,				/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_LSO_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

/*
 * Next, we have the attributes for these structures. The descriptor rings are
 * all strictly little endian, while the data buffers are just arrays of bytes
 * representing frames. Because of this, we purposefully simplify the driver
 * programming life by programming the descriptor ring as little endian, while
 * for the buffer data we keep it as unstructured.
 *
 * Note that to keep the Intel common code operating in a reasonable way, when
 * we allocate DMA memory for it, we do not use byte swapping and thus use the
 * standard i40e_buf_acc_attr.
 */
static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,
	DDI_STRICTORDER_ACC
};

static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};

/*
 * The next two functions are designed to be type-safe versions of macros that
 * are used to increment and decrement a descriptor index in the loop. Note,
 * these are marked inline to try and keep the data path hot and they were
 * effectively inlined in their previous life as macros.
 */
static inline int
i40e_next_desc(int base, int count, int size)
{
	int out;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	if (base + count < size) {
		out = base + count;
	} else {
		out = base + count - size;
	}

	ASSERT(out >= 0 && out < size);
	return (out);
}

static inline int
i40e_prev_desc(int base, int count, int size)
{
	int out;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	if (base >= count) {
		out = base - count;
	} else {
		out = base - count + size;
	}

	ASSERT(out >= 0 && out < size);
	return (out);
}
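
/*
 * For example, with a ring of 1024 descriptors, i40e_next_desc(1022, 3, 1024)
 * returns 1 and i40e_prev_desc(1, 3, 1024) returns 1022. Both helpers assume
 * that the count is smaller than the ring size, which the ASSERTs above check
 * via the resulting index.
 */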

/*
 * Free DMA memory that is represented by an i40e_dma_buffer_t.
 */
static void
i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
{
	if (dmap->dmab_dma_address != 0) {
		VERIFY(dmap->dmab_dma_handle != NULL);
		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
		dmap->dmab_dma_address = 0;
		dmap->dmab_size = 0;
	}

	if (dmap->dmab_acc_handle != NULL) {
		ddi_dma_mem_free(&dmap->dmab_acc_handle);
		dmap->dmab_acc_handle = NULL;
		dmap->dmab_address = NULL;
	}

	if (dmap->dmab_dma_handle != NULL) {
		ddi_dma_free_handle(&dmap->dmab_dma_handle);
		dmap->dmab_dma_handle = NULL;
	}

	/*
	 * These should only be set if we have valid handles allocated and
	 * therefore should always be NULLed out due to the above code. This
	 * is here to catch us acting sloppy.
	 */
	ASSERT(dmap->dmab_dma_address == 0);
	ASSERT(dmap->dmab_address == NULL);
	ASSERT(dmap->dmab_size == 0);
	dmap->dmab_len = 0;
}

/*
 * Allocate size bytes of DMA memory based on the passed in attributes. This
 * fills in the information in dmap and is designed for all of our single cookie
 * allocations.
 */
static boolean_t
i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
    ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
    boolean_t zero, size_t size)
{
	int ret;
	uint_t flags;
	size_t len;
	ddi_dma_cookie_t cookie;
	uint_t ncookies;

	if (stream == B_TRUE)
		flags = DDI_DMA_STREAMING;
	else
		flags = DDI_DMA_CONSISTENT;

	/*
	 * Step one: Allocate the DMA handle
	 */
	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
	    NULL, &dmap->dmab_dma_handle);
	if (ret != DDI_SUCCESS) {
		i40e_error(i40e, "failed to allocate dma handle for I/O "
		    "buffers: %d", ret);
		dmap->dmab_dma_handle = NULL;
		return (B_FALSE);
	}

	/*
	 * Step two: Allocate the DMA memory
	 */
	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
	    &dmap->dmab_acc_handle);
	if (ret != DDI_SUCCESS) {
		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
		    "buffers", size);
		dmap->dmab_address = NULL;
		dmap->dmab_acc_handle = NULL;
		i40e_free_dma_buffer(dmap);
		return (B_FALSE);
	}

	/*
	 * Step three: Optionally zero
	 */
	if (zero == B_TRUE)
		bzero(dmap->dmab_address, len);

	/*
	 * Step four: Bind the memory
	 */
	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
	    NULL, &cookie, &ncookies);
	if (ret != DDI_DMA_MAPPED) {
		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
		    "buffers: %d", size, ret);
		i40e_free_dma_buffer(dmap);
		return (B_FALSE);
	}

	VERIFY(ncookies == 1);
	dmap->dmab_dma_address = cookie.dmac_laddress;
	dmap->dmab_size = len;
	dmap->dmab_len = 0;
	return (B_TRUE);
}

/*
 * This function is called once the last pending rcb has been freed by the upper
 * levels of the system.
 */
static void
i40e_free_rx_data(i40e_rx_data_t *rxd)
{
	VERIFY(rxd->rxd_rcb_pending == 0);

	if (rxd->rxd_rcb_area != NULL) {
		kmem_free(rxd->rxd_rcb_area,
		    sizeof (i40e_rx_control_block_t) *
		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
		rxd->rxd_rcb_area = NULL;
	}

	if (rxd->rxd_free_list != NULL) {
		kmem_free(rxd->rxd_free_list,
		    sizeof (i40e_rx_control_block_t *) *
		    rxd->rxd_free_list_size);
		rxd->rxd_free_list = NULL;
	}

	if (rxd->rxd_work_list != NULL) {
		kmem_free(rxd->rxd_work_list,
		    sizeof (i40e_rx_control_block_t *) *
		    rxd->rxd_ring_size);
		rxd->rxd_work_list = NULL;
	}

	kmem_free(rxd, sizeof (i40e_rx_data_t));
}

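/*
 * Allocate the per-ring RX bookkeeping structures: the work list, the free
 * list, and the backing array of receive control blocks. The DMA memory for
 * the descriptor ring and for each rcb's buffer is allocated separately in
 * i40e_alloc_rx_dma().
 */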
static boolean_t
i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
{
	i40e_rx_data_t *rxd;

	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
	if (rxd == NULL)
		return (B_FALSE);
	itrq->itrq_rxdata = rxd;
	rxd->rxd_i40e = i40e;

	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;

	rxd->rxd_rcb_free = rxd->rxd_free_list_size;

	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
	    rxd->rxd_ring_size, KM_NOSLEEP);
	if (rxd->rxd_work_list == NULL) {
		i40e_error(i40e, "failed to allocate RX work list for a ring "
		    "of %d entries for ring %d", rxd->rxd_ring_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
	    rxd->rxd_free_list_size, KM_NOSLEEP);
	if (rxd->rxd_free_list == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry RX free list "
		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
		goto cleanup;
	}

	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
	if (rxd->rxd_rcb_area == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	return (B_TRUE);

cleanup:
	i40e_free_rx_data(rxd);
	itrq->itrq_rxdata = NULL;
	return (B_FALSE);
}
7539d26e4fcSRobert Mustacchi 
7549d26e4fcSRobert Mustacchi /*
7559d26e4fcSRobert Mustacchi  * Free all of the memory that we've allocated for DMA. Note that we may have
7569d26e4fcSRobert Mustacchi  * buffers that we've loaned up to the OS which are still outstanding. We'll
7579d26e4fcSRobert Mustacchi  * always free up the descriptor ring, because we no longer need that. For each
7589d26e4fcSRobert Mustacchi  * rcb, we'll iterate over it and if we send the reference count to zero, then
7599d26e4fcSRobert Mustacchi  * we'll free the message block and DMA related resources. However, if we don't
7609d26e4fcSRobert Mustacchi  * take the last one, then we'll go ahead and keep track that we'll have pending
7619d26e4fcSRobert Mustacchi  * data and clean it up when we get there.
7629d26e4fcSRobert Mustacchi  */
7639d26e4fcSRobert Mustacchi static void
i40e_free_rx_dma(i40e_rx_data_t * rxd,boolean_t failed_init)7649d26e4fcSRobert Mustacchi i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
7659d26e4fcSRobert Mustacchi {
7669d26e4fcSRobert Mustacchi 	uint32_t i, count, ref;
7679d26e4fcSRobert Mustacchi 
7689d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb;
7699d26e4fcSRobert Mustacchi 	i40e_t *i40e = rxd->rxd_i40e;
7709d26e4fcSRobert Mustacchi 
7719d26e4fcSRobert Mustacchi 	i40e_free_dma_buffer(&rxd->rxd_desc_area);
7729d26e4fcSRobert Mustacchi 	rxd->rxd_desc_ring = NULL;
7739d26e4fcSRobert Mustacchi 	rxd->rxd_desc_next = 0;
7749d26e4fcSRobert Mustacchi 
7759d26e4fcSRobert Mustacchi 	mutex_enter(&i40e->i40e_rx_pending_lock);
7769d26e4fcSRobert Mustacchi 
7779d26e4fcSRobert Mustacchi 	rcb = rxd->rxd_rcb_area;
7789d26e4fcSRobert Mustacchi 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
7799d26e4fcSRobert Mustacchi 
7809d26e4fcSRobert Mustacchi 	for (i = 0; i < count; i++, rcb++) {
7819d26e4fcSRobert Mustacchi 		VERIFY(rcb != NULL);
7829d26e4fcSRobert Mustacchi 
7839d26e4fcSRobert Mustacchi 		/*
7849d26e4fcSRobert Mustacchi 		 * If we're cleaning up from a failed creation attempt, then an
7859d26e4fcSRobert Mustacchi 		 * entry may never have been assembled which would mean that
7869d26e4fcSRobert Mustacchi 		 * it's reference count is zero. If we find that, we leave it
7879d26e4fcSRobert Mustacchi 		 * be, because nothing else should be modifying it at this
7889d26e4fcSRobert Mustacchi 		 * point. We're not at the point that any more references can be
7899d26e4fcSRobert Mustacchi 		 * added, just removed.
7909d26e4fcSRobert Mustacchi 		 */
7919d26e4fcSRobert Mustacchi 		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
7929d26e4fcSRobert Mustacchi 			continue;
7939d26e4fcSRobert Mustacchi 
7949d26e4fcSRobert Mustacchi 		ref = atomic_dec_32_nv(&rcb->rcb_ref);
7959d26e4fcSRobert Mustacchi 		if (ref == 0) {
7969d26e4fcSRobert Mustacchi 			freemsg(rcb->rcb_mp);
7979d26e4fcSRobert Mustacchi 			rcb->rcb_mp = NULL;
7989d26e4fcSRobert Mustacchi 			i40e_free_dma_buffer(&rcb->rcb_dma);
7999d26e4fcSRobert Mustacchi 		} else {
8009d26e4fcSRobert Mustacchi 			atomic_inc_32(&rxd->rxd_rcb_pending);
8019d26e4fcSRobert Mustacchi 			atomic_inc_32(&i40e->i40e_rx_pending);
8029d26e4fcSRobert Mustacchi 		}
8039d26e4fcSRobert Mustacchi 	}
8049d26e4fcSRobert Mustacchi 	mutex_exit(&i40e->i40e_rx_pending_lock);
8059d26e4fcSRobert Mustacchi }
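
/*
 * Illustrative sketch only (not driver code): the rcb_ref handoff between
 * the teardown path above and i40e_rx_recycle(). Both sides decrement the
 * count and only the side that takes it to zero frees the buffer:
 *
 *	teardown				recycle callback
 *	--------------------------------	--------------------------------
 *	if (atomic_dec_32_nv(&ref) == 0)	if (atomic_dec_32_nv(&ref) == 0)
 *		freemsg() + free DMA			freemsg() + free DMA
 *	else					(and, if rxd_shutdown is set and
 *		count it as pending		 nothing is pending, free the rxd)
 */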
8069d26e4fcSRobert Mustacchi 
8079d26e4fcSRobert Mustacchi /*
8089d26e4fcSRobert Mustacchi  * Initialize the DMA memory for the descriptor ring and for each frame in the
8099d26e4fcSRobert Mustacchi  * control block list.
8109d26e4fcSRobert Mustacchi  */
8119d26e4fcSRobert Mustacchi static boolean_t
8129d26e4fcSRobert Mustacchi i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
8139d26e4fcSRobert Mustacchi {
8149d26e4fcSRobert Mustacchi 	int i, count;
8159d26e4fcSRobert Mustacchi 	size_t dmasz;
8169d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb;
8179d26e4fcSRobert Mustacchi 	i40e_t *i40e = rxd->rxd_i40e;
8189d26e4fcSRobert Mustacchi 
8199d26e4fcSRobert Mustacchi 	/*
82009aee612SRyan Zezeski 	 * First allocate the RX descriptor ring.
8219d26e4fcSRobert Mustacchi 	 */
8229d26e4fcSRobert Mustacchi 	dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
8239d26e4fcSRobert Mustacchi 	VERIFY(dmasz > 0);
8249d26e4fcSRobert Mustacchi 	if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
8259d26e4fcSRobert Mustacchi 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
8269d26e4fcSRobert Mustacchi 	    B_TRUE, dmasz) == B_FALSE) {
8279d26e4fcSRobert Mustacchi 		i40e_error(i40e, "failed to allocate DMA resources "
82809aee612SRyan Zezeski 		    "for RX descriptor ring");
8299d26e4fcSRobert Mustacchi 		return (B_FALSE);
8309d26e4fcSRobert Mustacchi 	}
8319d26e4fcSRobert Mustacchi 	rxd->rxd_desc_ring =
8329d26e4fcSRobert Mustacchi 	    (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
8339d26e4fcSRobert Mustacchi 	rxd->rxd_desc_next = 0;
8349d26e4fcSRobert Mustacchi 
8359d26e4fcSRobert Mustacchi 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
8369d26e4fcSRobert Mustacchi 	rcb = rxd->rxd_rcb_area;
8379d26e4fcSRobert Mustacchi 
8389d26e4fcSRobert Mustacchi 	dmasz = i40e->i40e_rx_buf_size;
8399d26e4fcSRobert Mustacchi 	VERIFY(dmasz > 0);
8409d26e4fcSRobert Mustacchi 	for (i = 0; i < count; i++, rcb++) {
8419d26e4fcSRobert Mustacchi 		i40e_dma_buffer_t *dmap;
8429d26e4fcSRobert Mustacchi 		VERIFY(rcb != NULL);
8439d26e4fcSRobert Mustacchi 
8449d26e4fcSRobert Mustacchi 		if (i < rxd->rxd_ring_size) {
8459d26e4fcSRobert Mustacchi 			rxd->rxd_work_list[i] = rcb;
8469d26e4fcSRobert Mustacchi 		} else {
8479d26e4fcSRobert Mustacchi 			rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
8489d26e4fcSRobert Mustacchi 		}
8499d26e4fcSRobert Mustacchi 
8509d26e4fcSRobert Mustacchi 		dmap = &rcb->rcb_dma;
8519d26e4fcSRobert Mustacchi 		if (i40e_alloc_dma_buffer(i40e, dmap,
8529d26e4fcSRobert Mustacchi 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
8539d26e4fcSRobert Mustacchi 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
85409aee612SRyan Zezeski 			i40e_error(i40e, "failed to allocate RX dma buffer");
8559d26e4fcSRobert Mustacchi 			return (B_FALSE);
8569d26e4fcSRobert Mustacchi 		}
8579d26e4fcSRobert Mustacchi 
8589d26e4fcSRobert Mustacchi 		/*
8599d26e4fcSRobert Mustacchi 		 * Initialize the control block and offset the DMA address. See
8609d26e4fcSRobert Mustacchi 		 * the note in the big theory statement that explains how this
8619d26e4fcSRobert Mustacchi 		 * helps IP deal with alignment. Note, we don't worry about
8629d26e4fcSRobert Mustacchi 		 * whether or not we successfully get an mblk_t from desballoc,
8639d26e4fcSRobert Mustacchi 		 * it's a common case that we have to handle later on in the
8649d26e4fcSRobert Mustacchi 		 * system.
8659d26e4fcSRobert Mustacchi 		 */
8669d26e4fcSRobert Mustacchi 		dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
8679d26e4fcSRobert Mustacchi 		dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
8689d26e4fcSRobert Mustacchi 		dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
8699d26e4fcSRobert Mustacchi 
8709d26e4fcSRobert Mustacchi 		rcb->rcb_ref = 1;
8719d26e4fcSRobert Mustacchi 		rcb->rcb_rxd = rxd;
8729d26e4fcSRobert Mustacchi 		rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
8739d26e4fcSRobert Mustacchi 		rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
8749d26e4fcSRobert Mustacchi 		rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
8759d26e4fcSRobert Mustacchi 		    dmap->dmab_size, 0, &rcb->rcb_free_rtn);
8769d26e4fcSRobert Mustacchi 	}
8779d26e4fcSRobert Mustacchi 
8789d26e4fcSRobert Mustacchi 	return (B_TRUE);
8799d26e4fcSRobert Mustacchi }
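
/*
 * A minimal sketch with assumed numbers (not taken from the driver): with a
 * ring size of 4 and a free list size of 2, the rcb area set up above is
 * carved as
 *
 *	rcb[0..3] -> rxd_work_list[0..3]	(one buffer per descriptor)
 *	rcb[4..5] -> rxd_free_list[0..1]	(spares for loaned buffers)
 *
 * and each buffer is nudged forward by I40E_BUF_IPHDR_ALIGNMENT (a small
 * constant, conventionally two bytes) so that the IP header following a
 * 14-byte Ethernet header ends up 4-byte aligned:
 *
 *	dmab_address     += I40E_BUF_IPHDR_ALIGNMENT;
 *	dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
 *	dmab_size        -= I40E_BUF_IPHDR_ALIGNMENT;
 */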
8809d26e4fcSRobert Mustacchi 
8819d26e4fcSRobert Mustacchi static void
8829d26e4fcSRobert Mustacchi i40e_free_tx_dma(i40e_trqpair_t *itrq)
8839d26e4fcSRobert Mustacchi {
8849d26e4fcSRobert Mustacchi 	size_t fsz;
8859d26e4fcSRobert Mustacchi 
8869d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_area != NULL) {
8879d26e4fcSRobert Mustacchi 		uint32_t i;
8889d26e4fcSRobert Mustacchi 		i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
8899d26e4fcSRobert Mustacchi 
8909d26e4fcSRobert Mustacchi 		for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
8919d26e4fcSRobert Mustacchi 			i40e_free_dma_buffer(&tcb->tcb_dma);
8929d26e4fcSRobert Mustacchi 			if (tcb->tcb_dma_handle != NULL) {
8939d26e4fcSRobert Mustacchi 				ddi_dma_free_handle(&tcb->tcb_dma_handle);
8949d26e4fcSRobert Mustacchi 				tcb->tcb_dma_handle = NULL;
8959d26e4fcSRobert Mustacchi 			}
89609aee612SRyan Zezeski 			if (tcb->tcb_lso_dma_handle != NULL) {
89709aee612SRyan Zezeski 				ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
89809aee612SRyan Zezeski 				tcb->tcb_lso_dma_handle = NULL;
89909aee612SRyan Zezeski 			}
9009d26e4fcSRobert Mustacchi 		}
9019d26e4fcSRobert Mustacchi 
9029d26e4fcSRobert Mustacchi 		fsz = sizeof (i40e_tx_control_block_t) *
9039d26e4fcSRobert Mustacchi 		    itrq->itrq_tx_free_list_size;
9049d26e4fcSRobert Mustacchi 		kmem_free(itrq->itrq_tcb_area, fsz);
9059d26e4fcSRobert Mustacchi 		itrq->itrq_tcb_area = NULL;
9069d26e4fcSRobert Mustacchi 	}
9079d26e4fcSRobert Mustacchi 
9089d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_free_list != NULL) {
9099d26e4fcSRobert Mustacchi 		fsz = sizeof (i40e_tx_control_block_t *) *
9109d26e4fcSRobert Mustacchi 		    itrq->itrq_tx_free_list_size;
9119d26e4fcSRobert Mustacchi 		kmem_free(itrq->itrq_tcb_free_list, fsz);
9129d26e4fcSRobert Mustacchi 		itrq->itrq_tcb_free_list = NULL;
9139d26e4fcSRobert Mustacchi 	}
9149d26e4fcSRobert Mustacchi 
9159d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_work_list != NULL) {
9169d26e4fcSRobert Mustacchi 		fsz = sizeof (i40e_tx_control_block_t *) *
9179d26e4fcSRobert Mustacchi 		    itrq->itrq_tx_ring_size;
9189d26e4fcSRobert Mustacchi 		kmem_free(itrq->itrq_tcb_work_list, fsz);
9199d26e4fcSRobert Mustacchi 		itrq->itrq_tcb_work_list = NULL;
9209d26e4fcSRobert Mustacchi 	}
9219d26e4fcSRobert Mustacchi 
9229d26e4fcSRobert Mustacchi 	i40e_free_dma_buffer(&itrq->itrq_desc_area);
9239d26e4fcSRobert Mustacchi 	itrq->itrq_desc_ring = NULL;
9249d26e4fcSRobert Mustacchi 
9259d26e4fcSRobert Mustacchi }
9269d26e4fcSRobert Mustacchi 
9279d26e4fcSRobert Mustacchi static boolean_t
9289d26e4fcSRobert Mustacchi i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
9299d26e4fcSRobert Mustacchi {
9309d26e4fcSRobert Mustacchi 	int i, ret;
9319d26e4fcSRobert Mustacchi 	size_t dmasz;
9329d26e4fcSRobert Mustacchi 	i40e_tx_control_block_t *tcb;
9339d26e4fcSRobert Mustacchi 	i40e_t *i40e = itrq->itrq_i40e;
9349d26e4fcSRobert Mustacchi 
9359d26e4fcSRobert Mustacchi 	itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
9369d26e4fcSRobert Mustacchi 	itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
9379d26e4fcSRobert Mustacchi 	    (i40e->i40e_tx_ring_size >> 1);
9389d26e4fcSRobert Mustacchi 
9399d26e4fcSRobert Mustacchi 	/*
94009aee612SRyan Zezeski 	 * Allocate an additional TX descriptor for the writeback head.
9419d26e4fcSRobert Mustacchi 	 */
9429d26e4fcSRobert Mustacchi 	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
9439d26e4fcSRobert Mustacchi 	dmasz += sizeof (i40e_tx_desc_t);
9449d26e4fcSRobert Mustacchi 
9459d26e4fcSRobert Mustacchi 	VERIFY(dmasz > 0);
9469d26e4fcSRobert Mustacchi 	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
9479d26e4fcSRobert Mustacchi 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
9489d26e4fcSRobert Mustacchi 	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
94909aee612SRyan Zezeski 		i40e_error(i40e, "failed to allocate DMA resources for TX "
9509d26e4fcSRobert Mustacchi 		    "descriptor ring");
9519d26e4fcSRobert Mustacchi 		return (B_FALSE);
9529d26e4fcSRobert Mustacchi 	}
9539d26e4fcSRobert Mustacchi 	itrq->itrq_desc_ring =
9549d26e4fcSRobert Mustacchi 	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
9559d26e4fcSRobert Mustacchi 	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
9569d26e4fcSRobert Mustacchi 	    itrq->itrq_tx_ring_size);
9579d26e4fcSRobert Mustacchi 	itrq->itrq_desc_head = 0;
9589d26e4fcSRobert Mustacchi 	itrq->itrq_desc_tail = 0;
9599d26e4fcSRobert Mustacchi 	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
9609d26e4fcSRobert Mustacchi 
9619d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
9629d26e4fcSRobert Mustacchi 	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
9639d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_work_list == NULL) {
96409aee612SRyan Zezeski 		i40e_error(i40e, "failed to allocate a %d entry TX work list "
9659d26e4fcSRobert Mustacchi 		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
9669d26e4fcSRobert Mustacchi 		goto cleanup;
9679d26e4fcSRobert Mustacchi 	}
9689d26e4fcSRobert Mustacchi 
9699d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
9709d26e4fcSRobert Mustacchi 	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
9719d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_free_list == NULL) {
97209aee612SRyan Zezeski 		i40e_error(i40e, "failed to allocate a %d entry TX free list "
9739d26e4fcSRobert Mustacchi 		    "for ring %d", itrq->itrq_tx_free_list_size,
9749d26e4fcSRobert Mustacchi 		    itrq->itrq_index);
9759d26e4fcSRobert Mustacchi 		goto cleanup;
9769d26e4fcSRobert Mustacchi 	}
9779d26e4fcSRobert Mustacchi 
9789d26e4fcSRobert Mustacchi 	/*
97909aee612SRyan Zezeski 	 * We allocate enough TX control blocks to cover the free list.
9809d26e4fcSRobert Mustacchi 	 */
9819d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
9829d26e4fcSRobert Mustacchi 	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
9839d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_area == NULL) {
9849d26e4fcSRobert Mustacchi 		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
9859d26e4fcSRobert Mustacchi 		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
9869d26e4fcSRobert Mustacchi 		goto cleanup;
9879d26e4fcSRobert Mustacchi 	}
9889d26e4fcSRobert Mustacchi 
9899d26e4fcSRobert Mustacchi 	/*
9909d26e4fcSRobert Mustacchi 	 * For each tcb, allocate DMA memory.
9919d26e4fcSRobert Mustacchi 	 */
9929d26e4fcSRobert Mustacchi 	dmasz = i40e->i40e_tx_buf_size;
9939d26e4fcSRobert Mustacchi 	VERIFY(dmasz > 0);
9949d26e4fcSRobert Mustacchi 	tcb = itrq->itrq_tcb_area;
9959d26e4fcSRobert Mustacchi 	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
9969d26e4fcSRobert Mustacchi 		VERIFY(tcb != NULL);
9979d26e4fcSRobert Mustacchi 
9989d26e4fcSRobert Mustacchi 		/*
9999d26e4fcSRobert Mustacchi 		 * Allocate both a DMA buffer, which we'll use when we copy
10009d26e4fcSRobert Mustacchi 		 * packets for transmission, and a DMA handle, which
10019d26e4fcSRobert Mustacchi 		 * we'll use when we bind data.
10029d26e4fcSRobert Mustacchi 		 */
10039d26e4fcSRobert Mustacchi 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
10049d26e4fcSRobert Mustacchi 		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
10059d26e4fcSRobert Mustacchi 		    &tcb->tcb_dma_handle);
10069d26e4fcSRobert Mustacchi 		if (ret != DDI_SUCCESS) {
100709aee612SRyan Zezeski 			i40e_error(i40e, "failed to allocate DMA handle for TX "
10089d26e4fcSRobert Mustacchi 			    "data binding on ring %d: %d", itrq->itrq_index,
10099d26e4fcSRobert Mustacchi 			    ret);
10109d26e4fcSRobert Mustacchi 			tcb->tcb_dma_handle = NULL;
10119d26e4fcSRobert Mustacchi 			goto cleanup;
10129d26e4fcSRobert Mustacchi 		}
10139d26e4fcSRobert Mustacchi 
101409aee612SRyan Zezeski 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
101509aee612SRyan Zezeski 		    &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
101609aee612SRyan Zezeski 		    &tcb->tcb_lso_dma_handle);
101709aee612SRyan Zezeski 		if (ret != DDI_SUCCESS) {
101809aee612SRyan Zezeski 			i40e_error(i40e, "failed to allocate DMA handle for TX "
101909aee612SRyan Zezeski 			    "LSO data binding on ring %d: %d", itrq->itrq_index,
102009aee612SRyan Zezeski 			    ret);
102109aee612SRyan Zezeski 			tcb->tcb_lso_dma_handle = NULL;
102209aee612SRyan Zezeski 			goto cleanup;
102309aee612SRyan Zezeski 		}
102409aee612SRyan Zezeski 
10259d26e4fcSRobert Mustacchi 		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
10269d26e4fcSRobert Mustacchi 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
10279d26e4fcSRobert Mustacchi 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
10289d26e4fcSRobert Mustacchi 			i40e_error(i40e, "failed to allocate %ld bytes of "
102909aee612SRyan Zezeski 			    "DMA for TX data binding on ring %d", dmasz,
10309d26e4fcSRobert Mustacchi 			    itrq->itrq_index);
10319d26e4fcSRobert Mustacchi 			goto cleanup;
10329d26e4fcSRobert Mustacchi 		}
10339d26e4fcSRobert Mustacchi 
10349d26e4fcSRobert Mustacchi 		itrq->itrq_tcb_free_list[i] = tcb;
10359d26e4fcSRobert Mustacchi 	}
10369d26e4fcSRobert Mustacchi 
10379d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
10389d26e4fcSRobert Mustacchi 
10399d26e4fcSRobert Mustacchi 	return (B_TRUE);
10409d26e4fcSRobert Mustacchi 
10419d26e4fcSRobert Mustacchi cleanup:
10429d26e4fcSRobert Mustacchi 	i40e_free_tx_dma(itrq);
10439d26e4fcSRobert Mustacchi 	return (B_FALSE);
10449d26e4fcSRobert Mustacchi }
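
/*
 * Rough layout of itrq_desc_area as set up above (sketch only):
 *
 *	+-----------+-----------+-----+-------------+---------+
 *	| tx desc 0 | tx desc 1 | ... | tx desc N-1 | wb head |
 *	+-----------+-----------+-----+-------------+---------+
 *	^ itrq_desc_ring                            ^ itrq_desc_wbhead
 *
 * The extra descriptor-sized slot at the end is where hardware writes back
 * its head pointer, which is why dmasz is computed as
 * (itrq_tx_ring_size + 1) * sizeof (i40e_tx_desc_t).
 */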
10459d26e4fcSRobert Mustacchi 
10469d26e4fcSRobert Mustacchi /*
1047*aa2a44afSPaul Winder  * Free all memory associated with a ring. Note, this is done as part of
1048*aa2a44afSPaul Winder  * the GLDv3 ring stop routine.
10499d26e4fcSRobert Mustacchi  */
10509d26e4fcSRobert Mustacchi void
1051*aa2a44afSPaul Winder i40e_free_ring_mem(i40e_trqpair_t *itrq, boolean_t failed_init)
10529d26e4fcSRobert Mustacchi {
1053*aa2a44afSPaul Winder 	i40e_t *i40e = itrq->itrq_i40e;
1054*aa2a44afSPaul Winder 	i40e_rx_data_t *rxd = itrq->itrq_rxdata;
105509aee612SRyan Zezeski 
1056*aa2a44afSPaul Winder 	/*
1057*aa2a44afSPaul Winder 	 * In some cases i40e_alloc_rx_data() may have failed
1058*aa2a44afSPaul Winder 	 * and in that case there is no rxd to free.
1059*aa2a44afSPaul Winder 	 */
1060*aa2a44afSPaul Winder 	if (rxd == NULL)
1061*aa2a44afSPaul Winder 		return;
10629d26e4fcSRobert Mustacchi 
1063*aa2a44afSPaul Winder 	/*
1064*aa2a44afSPaul Winder 	 * Clean up our RX data. We have to free DMA resources first and
1065*aa2a44afSPaul Winder 	 * then, if we have no more pending RCBs, we'll go ahead
1066*aa2a44afSPaul Winder 	 * and clean things up. Note, we can't set the stopped flag on
1067*aa2a44afSPaul Winder 	 * the RX data until after we've done the first pass of the
1068*aa2a44afSPaul Winder 	 * pending resources. Otherwise we might race with
1069*aa2a44afSPaul Winder 	 * i40e_rx_recycle on determining who should free the
1070*aa2a44afSPaul Winder 	 * i40e_rx_data_t above.
1071*aa2a44afSPaul Winder 	 */
1072*aa2a44afSPaul Winder 	i40e_free_rx_dma(rxd, failed_init);
10739d26e4fcSRobert Mustacchi 
1074*aa2a44afSPaul Winder 	mutex_enter(&i40e->i40e_rx_pending_lock);
1075*aa2a44afSPaul Winder 	rxd->rxd_shutdown = B_TRUE;
1076*aa2a44afSPaul Winder 	if (rxd->rxd_rcb_pending == 0) {
1077*aa2a44afSPaul Winder 		i40e_free_rx_data(rxd);
1078*aa2a44afSPaul Winder 		itrq->itrq_rxdata = NULL;
10799d26e4fcSRobert Mustacchi 	}
1080*aa2a44afSPaul Winder 	mutex_exit(&i40e->i40e_rx_pending_lock);
1081*aa2a44afSPaul Winder 
1082*aa2a44afSPaul Winder 	i40e_free_tx_dma(itrq);
10839d26e4fcSRobert Mustacchi }
10849d26e4fcSRobert Mustacchi 
10859d26e4fcSRobert Mustacchi /*
1086*aa2a44afSPaul Winder  * Allocate all of the resources associated with a ring.
1087*aa2a44afSPaul Winder  * Note this is done as part of the GLDv3 ring start routine.
1088*aa2a44afSPaul Winder  * This takes care of both DMA and non-DMA related resources.
10899d26e4fcSRobert Mustacchi  */
10909d26e4fcSRobert Mustacchi boolean_t
1091*aa2a44afSPaul Winder i40e_alloc_ring_mem(i40e_trqpair_t *itrq)
10929d26e4fcSRobert Mustacchi {
1093*aa2a44afSPaul Winder 	if (!i40e_alloc_rx_data(itrq->itrq_i40e, itrq))
1094*aa2a44afSPaul Winder 		goto free;
10959d26e4fcSRobert Mustacchi 
1096*aa2a44afSPaul Winder 	if (!i40e_alloc_rx_dma(itrq->itrq_rxdata))
1097*aa2a44afSPaul Winder 		goto free;
10989d26e4fcSRobert Mustacchi 
1099*aa2a44afSPaul Winder 	if (!i40e_alloc_tx_dma(itrq))
1100*aa2a44afSPaul Winder 		goto free;
11019d26e4fcSRobert Mustacchi 
11029d26e4fcSRobert Mustacchi 	return (B_TRUE);
11039d26e4fcSRobert Mustacchi 
1104*aa2a44afSPaul Winder free:
1105*aa2a44afSPaul Winder 	i40e_free_ring_mem(itrq, B_TRUE);
11069d26e4fcSRobert Mustacchi 	return (B_FALSE);
11079d26e4fcSRobert Mustacchi }
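
/*
 * Hedged usage sketch (hypothetical caller, not the actual GLDv3 entry
 * point): a ring start routine would be expected to use this as
 *
 *	if (!i40e_alloc_ring_mem(itrq))
 *		return (ENOMEM);
 *
 * with no further cleanup required on failure, because the free: label
 * above already ran i40e_free_ring_mem(itrq, B_TRUE).
 */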
11089d26e4fcSRobert Mustacchi 
11099d26e4fcSRobert Mustacchi 
11109d26e4fcSRobert Mustacchi /*
11119d26e4fcSRobert Mustacchi  * Because every instance of i40e may have different support for FMA
11129d26e4fcSRobert Mustacchi  * capabilities, we copy the DMA attributes into the i40e_t and set them that
11139d26e4fcSRobert Mustacchi  * way and use them for determining attributes.
11149d26e4fcSRobert Mustacchi  */
11159d26e4fcSRobert Mustacchi void
11169d26e4fcSRobert Mustacchi i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
11179d26e4fcSRobert Mustacchi {
11189d26e4fcSRobert Mustacchi 	bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
11199d26e4fcSRobert Mustacchi 	    sizeof (ddi_dma_attr_t));
11209d26e4fcSRobert Mustacchi 	bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
11219d26e4fcSRobert Mustacchi 	    sizeof (ddi_dma_attr_t));
112209aee612SRyan Zezeski 	bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
112309aee612SRyan Zezeski 	    sizeof (ddi_dma_attr_t));
11249d26e4fcSRobert Mustacchi 	bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
11259d26e4fcSRobert Mustacchi 	    sizeof (ddi_device_acc_attr_t));
11269d26e4fcSRobert Mustacchi 	bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
11279d26e4fcSRobert Mustacchi 	    sizeof (ddi_device_acc_attr_t));
11289d26e4fcSRobert Mustacchi 
11299d26e4fcSRobert Mustacchi 	if (fma == B_TRUE) {
11309d26e4fcSRobert Mustacchi 		i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
11319d26e4fcSRobert Mustacchi 		i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
113209aee612SRyan Zezeski 		i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
113309aee612SRyan Zezeski 		    DDI_DMA_FLAGERR;
11349d26e4fcSRobert Mustacchi 	} else {
11359d26e4fcSRobert Mustacchi 		i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
11369d26e4fcSRobert Mustacchi 		i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
113709aee612SRyan Zezeski 		i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
113809aee612SRyan Zezeski 		    ~DDI_DMA_FLAGERR;
11399d26e4fcSRobert Mustacchi 	}
11409d26e4fcSRobert Mustacchi }
11419d26e4fcSRobert Mustacchi 
11429d26e4fcSRobert Mustacchi static void
11439d26e4fcSRobert Mustacchi i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
11449d26e4fcSRobert Mustacchi {
11459d26e4fcSRobert Mustacchi 	mutex_enter(&rxd->rxd_free_lock);
11469d26e4fcSRobert Mustacchi 	ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
11479d26e4fcSRobert Mustacchi 	ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
11489d26e4fcSRobert Mustacchi 	rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
11499d26e4fcSRobert Mustacchi 	rxd->rxd_rcb_free++;
11509d26e4fcSRobert Mustacchi 	mutex_exit(&rxd->rxd_free_lock);
11519d26e4fcSRobert Mustacchi }
11529d26e4fcSRobert Mustacchi 
11539d26e4fcSRobert Mustacchi static i40e_rx_control_block_t *
11549d26e4fcSRobert Mustacchi i40e_rcb_alloc(i40e_rx_data_t *rxd)
11559d26e4fcSRobert Mustacchi {
11569d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb;
11579d26e4fcSRobert Mustacchi 
11589d26e4fcSRobert Mustacchi 	mutex_enter(&rxd->rxd_free_lock);
11599d26e4fcSRobert Mustacchi 	if (rxd->rxd_rcb_free == 0) {
11609d26e4fcSRobert Mustacchi 		mutex_exit(&rxd->rxd_free_lock);
11619d26e4fcSRobert Mustacchi 		return (NULL);
11629d26e4fcSRobert Mustacchi 	}
11639d26e4fcSRobert Mustacchi 	rxd->rxd_rcb_free--;
11649d26e4fcSRobert Mustacchi 	rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
11659d26e4fcSRobert Mustacchi 	VERIFY(rcb != NULL);
11669d26e4fcSRobert Mustacchi 	rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
11679d26e4fcSRobert Mustacchi 	mutex_exit(&rxd->rxd_free_lock);
11689d26e4fcSRobert Mustacchi 
11699d26e4fcSRobert Mustacchi 	return (rcb);
11709d26e4fcSRobert Mustacchi }
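
/*
 * The free list managed by i40e_rcb_alloc()/i40e_rcb_free() is a simple
 * LIFO stack guarded by rxd_free_lock; rxd_rcb_free is the count of valid
 * entries and doubles as the stack pointer. Sketch of the invariant
 * (illustrative, not driver code):
 *
 *	alloc:	rcb = free_list[--rcb_free]; free_list[rcb_free] = NULL;
 *	free:	free_list[rcb_free++] = rcb;
 *
 * so slots below rcb_free are always non-NULL and slots at or above it are
 * always NULL, which is what the ASSERTs above check.
 */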
11719d26e4fcSRobert Mustacchi 
11729d26e4fcSRobert Mustacchi /*
11739d26e4fcSRobert Mustacchi  * This is the callback that we get from the OS when freemsg(9F) has been called
11749d26e4fcSRobert Mustacchi  * on a loaned descriptor. In addition, if we take the last reference count
117509aee612SRyan Zezeski  * here, then we have to tear down all of the RX data.
11769d26e4fcSRobert Mustacchi  */
11779d26e4fcSRobert Mustacchi void
11789d26e4fcSRobert Mustacchi i40e_rx_recycle(caddr_t arg)
11799d26e4fcSRobert Mustacchi {
11809d26e4fcSRobert Mustacchi 	uint32_t ref;
11819d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb;
11829d26e4fcSRobert Mustacchi 	i40e_rx_data_t *rxd;
11839d26e4fcSRobert Mustacchi 	i40e_t *i40e;
11849d26e4fcSRobert Mustacchi 
11859d26e4fcSRobert Mustacchi 	/* LINTED: E_BAD_PTR_CAST_ALIGN */
11869d26e4fcSRobert Mustacchi 	rcb = (i40e_rx_control_block_t *)arg;
11879d26e4fcSRobert Mustacchi 	rxd = rcb->rcb_rxd;
11889d26e4fcSRobert Mustacchi 	i40e = rxd->rxd_i40e;
11899d26e4fcSRobert Mustacchi 
11909d26e4fcSRobert Mustacchi 	/*
11919d26e4fcSRobert Mustacchi 	 * It's possible for this to be called with a reference count of zero.
11929d26e4fcSRobert Mustacchi 	 * That will happen when we're doing the freemsg after taking the last
11939d26e4fcSRobert Mustacchi 	 * reference because we're tearing down everything and this rcb is not
11949d26e4fcSRobert Mustacchi 	 * outstanding.
11959d26e4fcSRobert Mustacchi 	 */
11969d26e4fcSRobert Mustacchi 	if (rcb->rcb_ref == 0)
11979d26e4fcSRobert Mustacchi 		return;
11989d26e4fcSRobert Mustacchi 
11999d26e4fcSRobert Mustacchi 	/*
12009d26e4fcSRobert Mustacchi 	 * Don't worry about failure of desballoc here. It'll only become fatal
12019d26e4fcSRobert Mustacchi 	 * if we're trying to use it and we can't in i40e_rx_bind().
12029d26e4fcSRobert Mustacchi 	 */
12039d26e4fcSRobert Mustacchi 	rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
12049d26e4fcSRobert Mustacchi 	    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
12059d26e4fcSRobert Mustacchi 	i40e_rcb_free(rxd, rcb);
12069d26e4fcSRobert Mustacchi 
12079d26e4fcSRobert Mustacchi 	/*
12089d26e4fcSRobert Mustacchi 	 * It's possible that the rcb was being used while we are shutting down
12099d26e4fcSRobert Mustacchi 	 * the device. In that case, we'll take the final reference from the
12109d26e4fcSRobert Mustacchi 	 * device here.
12119d26e4fcSRobert Mustacchi 	 */
12129d26e4fcSRobert Mustacchi 	ref = atomic_dec_32_nv(&rcb->rcb_ref);
12139d26e4fcSRobert Mustacchi 	if (ref == 0) {
12149d26e4fcSRobert Mustacchi 		freemsg(rcb->rcb_mp);
12159d26e4fcSRobert Mustacchi 		rcb->rcb_mp = NULL;
12169d26e4fcSRobert Mustacchi 		i40e_free_dma_buffer(&rcb->rcb_dma);
12179d26e4fcSRobert Mustacchi 
12189d26e4fcSRobert Mustacchi 		mutex_enter(&i40e->i40e_rx_pending_lock);
12199d26e4fcSRobert Mustacchi 		atomic_dec_32(&rxd->rxd_rcb_pending);
12209d26e4fcSRobert Mustacchi 		atomic_dec_32(&i40e->i40e_rx_pending);
12219d26e4fcSRobert Mustacchi 
12229d26e4fcSRobert Mustacchi 		/*
12239d26e4fcSRobert Mustacchi 		 * If this was the last block and it's been indicated that we've
12249d26e4fcSRobert Mustacchi 		 * passed the shutdown point, we should clean up.
12259d26e4fcSRobert Mustacchi 		 */
12269d26e4fcSRobert Mustacchi 		if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
12279d26e4fcSRobert Mustacchi 			i40e_free_rx_data(rxd);
12289d26e4fcSRobert Mustacchi 			cv_broadcast(&i40e->i40e_rx_pending_cv);
12299d26e4fcSRobert Mustacchi 		}
12309d26e4fcSRobert Mustacchi 
12319d26e4fcSRobert Mustacchi 		mutex_exit(&i40e->i40e_rx_pending_lock);
12329d26e4fcSRobert Mustacchi 	}
12339d26e4fcSRobert Mustacchi }
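
/*
 * A small sketch of the re-arming pattern above (illustrative only): by
 * calling desballoc(9F) here, at recycle time, the common receive path can
 * usually just do
 *
 *	mp = rcb->rcb_mp;
 *	atomic_inc_32(&rcb->rcb_ref);
 *
 * and only retries desballoc() in i40e_rx_bind() if this earlier attempt
 * came back NULL.
 */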
12349d26e4fcSRobert Mustacchi 
12359d26e4fcSRobert Mustacchi static mblk_t *
12369d26e4fcSRobert Mustacchi i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
12379d26e4fcSRobert Mustacchi     uint32_t plen)
12389d26e4fcSRobert Mustacchi {
12399d26e4fcSRobert Mustacchi 	mblk_t *mp;
12409d26e4fcSRobert Mustacchi 	i40e_t *i40e = rxd->rxd_i40e;
12419d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb, *rep_rcb;
12429d26e4fcSRobert Mustacchi 
12439d26e4fcSRobert Mustacchi 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
12449d26e4fcSRobert Mustacchi 
12459d26e4fcSRobert Mustacchi 	if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
12469d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
12479d26e4fcSRobert Mustacchi 		return (NULL);
12489d26e4fcSRobert Mustacchi 	}
12499d26e4fcSRobert Mustacchi 
12509d26e4fcSRobert Mustacchi 	rcb = rxd->rxd_work_list[index];
12519d26e4fcSRobert Mustacchi 
12529d26e4fcSRobert Mustacchi 	/*
12539d26e4fcSRobert Mustacchi 	 * Check to make sure we have a mblk_t. If we don't, this is our last
12549d26e4fcSRobert Mustacchi 	 * chance to try and get one.
12559d26e4fcSRobert Mustacchi 	 */
12569d26e4fcSRobert Mustacchi 	if (rcb->rcb_mp == NULL) {
12579d26e4fcSRobert Mustacchi 		rcb->rcb_mp =
12589d26e4fcSRobert Mustacchi 		    desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
12599d26e4fcSRobert Mustacchi 		    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
12609d26e4fcSRobert Mustacchi 		if (rcb->rcb_mp == NULL) {
12619d26e4fcSRobert Mustacchi 			itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
12629d26e4fcSRobert Mustacchi 			i40e_rcb_free(rxd, rcb);
12639d26e4fcSRobert Mustacchi 			return (NULL);
12649d26e4fcSRobert Mustacchi 		}
12659d26e4fcSRobert Mustacchi 	}
12669d26e4fcSRobert Mustacchi 
12679d26e4fcSRobert Mustacchi 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
12689d26e4fcSRobert Mustacchi 
12699d26e4fcSRobert Mustacchi 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
12709d26e4fcSRobert Mustacchi 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
12719d26e4fcSRobert Mustacchi 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
12729d26e4fcSRobert Mustacchi 		i40e_rcb_free(rxd, rcb);
12739d26e4fcSRobert Mustacchi 		return (NULL);
12749d26e4fcSRobert Mustacchi 	}
12759d26e4fcSRobert Mustacchi 
12769d26e4fcSRobert Mustacchi 	/*
12779d26e4fcSRobert Mustacchi 	 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
12789d26e4fcSRobert Mustacchi 	 */
12799d26e4fcSRobert Mustacchi 	mp = rcb->rcb_mp;
12809d26e4fcSRobert Mustacchi 	atomic_inc_32(&rcb->rcb_ref);
12819d26e4fcSRobert Mustacchi 	mp->b_wptr = mp->b_rptr + plen;
12829d26e4fcSRobert Mustacchi 	mp->b_next = mp->b_cont = NULL;
12839d26e4fcSRobert Mustacchi 
12849d26e4fcSRobert Mustacchi 	rxd->rxd_work_list[index] = rep_rcb;
12859d26e4fcSRobert Mustacchi 	return (mp);
12869d26e4fcSRobert Mustacchi }
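
/*
 * Sketch of the loan-and-replace dance above (illustrative only):
 *
 *	rep_rcb = i40e_rcb_alloc(rxd);		take a spare off the free list
 *	mp = rcb->rcb_mp; rcb_ref++;		loan the current buffer upstream
 *	rxd_work_list[index] = rep_rcb;		the ring slot gets the spare
 *
 * The loaned rcb returns through i40e_rx_recycle() when the stack frees the
 * mblk, at which point it goes back onto the free list.
 */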
12879d26e4fcSRobert Mustacchi 
12889d26e4fcSRobert Mustacchi /*
12899d26e4fcSRobert Mustacchi  * We're going to allocate a new message block for this frame and attempt to
12909d26e4fcSRobert Mustacchi  * receive it. See the big theory statement for more information on when we copy
12919d26e4fcSRobert Mustacchi  * versus bind.
12929d26e4fcSRobert Mustacchi  */
12939d26e4fcSRobert Mustacchi static mblk_t *
12949d26e4fcSRobert Mustacchi i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
12959d26e4fcSRobert Mustacchi     uint32_t plen)
12969d26e4fcSRobert Mustacchi {
12979d26e4fcSRobert Mustacchi 	i40e_t *i40e = rxd->rxd_i40e;
12989d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb;
12999d26e4fcSRobert Mustacchi 	mblk_t *mp;
13009d26e4fcSRobert Mustacchi 
13019d26e4fcSRobert Mustacchi 	ASSERT(index < rxd->rxd_ring_size);
13029d26e4fcSRobert Mustacchi 	rcb = rxd->rxd_work_list[index];
13039d26e4fcSRobert Mustacchi 
13049d26e4fcSRobert Mustacchi 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
13059d26e4fcSRobert Mustacchi 
13069d26e4fcSRobert Mustacchi 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
13079d26e4fcSRobert Mustacchi 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
13089d26e4fcSRobert Mustacchi 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
13099d26e4fcSRobert Mustacchi 		return (NULL);
13109d26e4fcSRobert Mustacchi 	}
13119d26e4fcSRobert Mustacchi 
13129d26e4fcSRobert Mustacchi 	mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
13139d26e4fcSRobert Mustacchi 	if (mp == NULL) {
13149d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
13159d26e4fcSRobert Mustacchi 		return (NULL);
13169d26e4fcSRobert Mustacchi 	}
13179d26e4fcSRobert Mustacchi 
13189d26e4fcSRobert Mustacchi 	mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
13199d26e4fcSRobert Mustacchi 	bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
13209d26e4fcSRobert Mustacchi 	mp->b_wptr = mp->b_rptr + plen;
13219d26e4fcSRobert Mustacchi 
13229d26e4fcSRobert Mustacchi 	return (mp);
13239d26e4fcSRobert Mustacchi }
13249d26e4fcSRobert Mustacchi 
13259d26e4fcSRobert Mustacchi /*
13269d26e4fcSRobert Mustacchi  * Determine if the device has enabled any checksum flags for us. The level of
13279d26e4fcSRobert Mustacchi  * checksum computed will depend on the type of packet that we have, which is
13289d26e4fcSRobert Mustacchi  * contained in ptype. For example, the checksum logic it performs will vary
13299d26e4fcSRobert Mustacchi  * depending on whether or not the packet is considered tunneled, whether it
13309d26e4fcSRobert Mustacchi  * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
13319d26e4fcSRobert Mustacchi  * valid.
13329d26e4fcSRobert Mustacchi  *
13339d26e4fcSRobert Mustacchi  * There are additional checksums that we could recognize here, but we'd need
13349d26e4fcSRobert Mustacchi  * additional GLDv3 enhancements to be able to properly describe
13359d26e4fcSRobert Mustacchi  * them.
13369d26e4fcSRobert Mustacchi  */
13379d26e4fcSRobert Mustacchi static void
13389d26e4fcSRobert Mustacchi i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
13399d26e4fcSRobert Mustacchi     uint32_t ptype)
13409d26e4fcSRobert Mustacchi {
13419d26e4fcSRobert Mustacchi 	uint32_t cksum;
13429d26e4fcSRobert Mustacchi 	struct i40e_rx_ptype_decoded pinfo;
13439d26e4fcSRobert Mustacchi 
13449d26e4fcSRobert Mustacchi 	ASSERT(ptype <= 255);
13459d26e4fcSRobert Mustacchi 	pinfo = decode_rx_desc_ptype(ptype);
13469d26e4fcSRobert Mustacchi 
13479d26e4fcSRobert Mustacchi 	cksum = 0;
13489d26e4fcSRobert Mustacchi 
13499d26e4fcSRobert Mustacchi 	/*
13509d26e4fcSRobert Mustacchi 	 * If the ptype isn't something that we know in the driver, then we
13519d26e4fcSRobert Mustacchi 	 * shouldn't even consider moving forward.
13529d26e4fcSRobert Mustacchi 	 */
13539d26e4fcSRobert Mustacchi 	if (pinfo.known == 0) {
13549d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
13559d26e4fcSRobert Mustacchi 		return;
13569d26e4fcSRobert Mustacchi 	}
13579d26e4fcSRobert Mustacchi 
13589d26e4fcSRobert Mustacchi 	/*
13599d26e4fcSRobert Mustacchi 	 * If hardware didn't set the L3L4P bit on the frame, then there is no
13609d26e4fcSRobert Mustacchi 	 * checksum offload to consider.
13619d26e4fcSRobert Mustacchi 	 */
13629d26e4fcSRobert Mustacchi 	if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
13639d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
13649d26e4fcSRobert Mustacchi 		return;
13659d26e4fcSRobert Mustacchi 	}
13669d26e4fcSRobert Mustacchi 
13679d26e4fcSRobert Mustacchi 	/*
13689d26e4fcSRobert Mustacchi 	 * The device tells us that IPv6 checksums can't be trusted when a
13699d26e4fcSRobert Mustacchi 	 * Destination Options header or a Routing header is present. Discard
13709d26e4fcSRobert Mustacchi 	 * all checksums in this case.
13719d26e4fcSRobert Mustacchi 	 */
13729d26e4fcSRobert Mustacchi 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
13739d26e4fcSRobert Mustacchi 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
13749d26e4fcSRobert Mustacchi 	    (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
13759d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
13769d26e4fcSRobert Mustacchi 		return;
13779d26e4fcSRobert Mustacchi 	}
13789d26e4fcSRobert Mustacchi 
13799d26e4fcSRobert Mustacchi 	/*
13809d26e4fcSRobert Mustacchi 	 * The hardware denotes three kinds of possible errors. Two are reserved
13819d26e4fcSRobert Mustacchi 	 * for inner and outer IP checksum errors (IPE and EIPE) and the third
13829d26e4fcSRobert Mustacchi 	 * is for L4 checksum errors (L4E). If there is only one IP header, then
13839d26e4fcSRobert Mustacchi 	 * the only thing that we care about is IPE. Note that since we don't
13849d26e4fcSRobert Mustacchi 	 * support inner checksums, we will ignore IPE being set on tunneled
13859d26e4fcSRobert Mustacchi 	 * packets and only care about EIPE.
13869d26e4fcSRobert Mustacchi 	 */
13879d26e4fcSRobert Mustacchi 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
13889d26e4fcSRobert Mustacchi 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
13899d26e4fcSRobert Mustacchi 		if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
13909d26e4fcSRobert Mustacchi 			if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
13919d26e4fcSRobert Mustacchi 				itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
13929d26e4fcSRobert Mustacchi 			} else {
13939d26e4fcSRobert Mustacchi 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
13949d26e4fcSRobert Mustacchi 				cksum |= HCK_IPV4_HDRCKSUM_OK;
13959d26e4fcSRobert Mustacchi 			}
13969d26e4fcSRobert Mustacchi 		} else {
13979d26e4fcSRobert Mustacchi 			if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
13989d26e4fcSRobert Mustacchi 				itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
13999d26e4fcSRobert Mustacchi 			} else {
14009d26e4fcSRobert Mustacchi 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
14019d26e4fcSRobert Mustacchi 				cksum |= HCK_IPV4_HDRCKSUM_OK;
14029d26e4fcSRobert Mustacchi 			}
14039d26e4fcSRobert Mustacchi 		}
14049d26e4fcSRobert Mustacchi 	}
14059d26e4fcSRobert Mustacchi 
14069d26e4fcSRobert Mustacchi 	/*
14079d26e4fcSRobert Mustacchi 	 * We only have meaningful L4 checksums in the case of IP->L4 and
14089d26e4fcSRobert Mustacchi 	 * IP->IP->L4. There is no outer L4 checksum data available in any
14099d26e4fcSRobert Mustacchi 	 * other case. Further, we don't bother reporting the valid checksum in
14109d26e4fcSRobert Mustacchi 	 * the IP->IP->L4 case.
14119d26e4fcSRobert Mustacchi 	 */
14129d26e4fcSRobert Mustacchi 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
14139d26e4fcSRobert Mustacchi 	    pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
14149d26e4fcSRobert Mustacchi 	    (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
14159d26e4fcSRobert Mustacchi 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
14169d26e4fcSRobert Mustacchi 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
14179d26e4fcSRobert Mustacchi 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
14189d26e4fcSRobert Mustacchi 		ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
14199d26e4fcSRobert Mustacchi 		if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
14209d26e4fcSRobert Mustacchi 			itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
14219d26e4fcSRobert Mustacchi 		} else {
14229d26e4fcSRobert Mustacchi 			itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
14239d26e4fcSRobert Mustacchi 			cksum |= HCK_FULLCKSUM_OK;
14249d26e4fcSRobert Mustacchi 		}
14259d26e4fcSRobert Mustacchi 	}
14269d26e4fcSRobert Mustacchi 
14279d26e4fcSRobert Mustacchi 	if (cksum != 0) {
14289d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
14299d26e4fcSRobert Mustacchi 		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
14309d26e4fcSRobert Mustacchi 	} else {
14319d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
14329d26e4fcSRobert Mustacchi 	}
14339d26e4fcSRobert Mustacchi }
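
/*
 * Condensed view of the mapping implemented above (a sketch, not a complete
 * statement of hardware behavior):
 *
 *	IPv4, not tunneled, IPE clear		-> HCK_IPV4_HDRCKSUM_OK
 *	IPv4, tunneled, EIPE clear		-> HCK_IPV4_HDRCKSUM_OK (outer)
 *	IP -> TCP/UDP/ICMP/SCTP, L4E clear	-> HCK_FULLCKSUM_OK
 *
 * Anything else (an unknown ptype, L3L4P clear, IPv6 with extension headers,
 * or an error bit set) leaves the mblk with no checksum flags.
 */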
14349d26e4fcSRobert Mustacchi 
14359d26e4fcSRobert Mustacchi mblk_t *
14369d26e4fcSRobert Mustacchi i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
14379d26e4fcSRobert Mustacchi {
14389d26e4fcSRobert Mustacchi 	i40e_t *i40e;
14399d26e4fcSRobert Mustacchi 	i40e_hw_t *hw;
14409d26e4fcSRobert Mustacchi 	i40e_rx_data_t *rxd;
14419d26e4fcSRobert Mustacchi 	uint32_t cur_head;
14429d26e4fcSRobert Mustacchi 	i40e_rx_desc_t *cur_desc;
14439d26e4fcSRobert Mustacchi 	i40e_rx_control_block_t *rcb;
14449d26e4fcSRobert Mustacchi 	uint64_t rx_bytes, rx_frames;
14459d26e4fcSRobert Mustacchi 	uint64_t stword;
14469d26e4fcSRobert Mustacchi 	mblk_t *mp, *mp_head, **mp_tail;
14479d26e4fcSRobert Mustacchi 
14489d26e4fcSRobert Mustacchi 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
14499d26e4fcSRobert Mustacchi 	rxd = itrq->itrq_rxdata;
14509d26e4fcSRobert Mustacchi 	i40e = itrq->itrq_i40e;
14519d26e4fcSRobert Mustacchi 	hw = &i40e->i40e_hw_space;
14529d26e4fcSRobert Mustacchi 
14539d26e4fcSRobert Mustacchi 	if (!(i40e->i40e_state & I40E_STARTED) ||
14549d26e4fcSRobert Mustacchi 	    (i40e->i40e_state & I40E_OVERTEMP) ||
14559d26e4fcSRobert Mustacchi 	    (i40e->i40e_state & I40E_SUSPENDED) ||
14569d26e4fcSRobert Mustacchi 	    (i40e->i40e_state & I40E_ERROR))
14579d26e4fcSRobert Mustacchi 		return (NULL);
14589d26e4fcSRobert Mustacchi 
14599d26e4fcSRobert Mustacchi 	/*
14609d26e4fcSRobert Mustacchi 	 * Before we do anything else, we have to make sure that all of the DMA
14619d26e4fcSRobert Mustacchi 	 * buffers are synced up and then check to make sure that they're
14629d26e4fcSRobert Mustacchi 	 * actually good from an FM perspective.
14639d26e4fcSRobert Mustacchi 	 */
14649d26e4fcSRobert Mustacchi 	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
14659d26e4fcSRobert Mustacchi 	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
14669d26e4fcSRobert Mustacchi 	    DDI_FM_OK) {
14679d26e4fcSRobert Mustacchi 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
14689d26e4fcSRobert Mustacchi 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
14699d26e4fcSRobert Mustacchi 		return (NULL);
14709d26e4fcSRobert Mustacchi 	}
14719d26e4fcSRobert Mustacchi 
14729d26e4fcSRobert Mustacchi 	/*
14739d26e4fcSRobert Mustacchi 	 * Prepare our stats. We do a limited amount of processing in both
14749d26e4fcSRobert Mustacchi 	 * polling and interrupt context. The limit in interrupt context is
14759d26e4fcSRobert Mustacchi 	 * based on frames, in polling context based on bytes.
14769d26e4fcSRobert Mustacchi 	 */
14779d26e4fcSRobert Mustacchi 	rx_bytes = rx_frames = 0;
14789d26e4fcSRobert Mustacchi 	mp_head = NULL;
14799d26e4fcSRobert Mustacchi 	mp_tail = &mp_head;
14809d26e4fcSRobert Mustacchi 
14819d26e4fcSRobert Mustacchi 	/*
14829d26e4fcSRobert Mustacchi 	 * At this point, the descriptor ring is available to check. We'll try
14839d26e4fcSRobert Mustacchi 	 * and process until we either run out of poll_bytes or descriptors.
14849d26e4fcSRobert Mustacchi 	 */
14859d26e4fcSRobert Mustacchi 	cur_head = rxd->rxd_desc_next;
14869d26e4fcSRobert Mustacchi 	cur_desc = &rxd->rxd_desc_ring[cur_head];
14879d26e4fcSRobert Mustacchi 	stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
14889d26e4fcSRobert Mustacchi 
14899d26e4fcSRobert Mustacchi 	/*
14909d26e4fcSRobert Mustacchi 	 * Note, the primary invariant of this loop should be that cur_head,
14919d26e4fcSRobert Mustacchi 	 * cur_desc, and stword always point to the currently processed
14929d26e4fcSRobert Mustacchi 	 * descriptor. When we leave the loop, it should point to a descriptor
14939d26e4fcSRobert Mustacchi 	 * that HAS NOT been processed. Meaning that if we haven't consumed the
14949d26e4fcSRobert Mustacchi 	 * frame, the descriptor should not be advanced.
14959d26e4fcSRobert Mustacchi 	 */
14969d26e4fcSRobert Mustacchi 	while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
14979d26e4fcSRobert Mustacchi 		uint32_t error, eop, plen, ptype;
14989d26e4fcSRobert Mustacchi 
14999d26e4fcSRobert Mustacchi 		/*
15009d26e4fcSRobert Mustacchi 		 * The DD, PLEN, and EOP bits are the only ones that are valid
15019d26e4fcSRobert Mustacchi 		 * in every frame. The error information is only valid when EOP
15029d26e4fcSRobert Mustacchi 		 * is set in the same frame.
15039d26e4fcSRobert Mustacchi 		 *
15049d26e4fcSRobert Mustacchi 		 * At this time, because we don't do any LRO or header
15059d26e4fcSRobert Mustacchi 		 * splitting, we expect that every frame should have EOP set in
15069d26e4fcSRobert Mustacchi 		 * it. When later functionality comes in, we'll want to
15079d26e4fcSRobert Mustacchi 		 * re-evaluate this.
15089d26e4fcSRobert Mustacchi 		 */
15099d26e4fcSRobert Mustacchi 		eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
15109d26e4fcSRobert Mustacchi 		VERIFY(eop != 0);
15119d26e4fcSRobert Mustacchi 
15129d26e4fcSRobert Mustacchi 		error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
15139d26e4fcSRobert Mustacchi 		    I40E_RXD_QW1_ERROR_SHIFT;
15149d26e4fcSRobert Mustacchi 		if (error & I40E_RX_ERR_BITS) {
15159d26e4fcSRobert Mustacchi 			itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
15169d26e4fcSRobert Mustacchi 			goto discard;
15179d26e4fcSRobert Mustacchi 		}
15189d26e4fcSRobert Mustacchi 
15199d26e4fcSRobert Mustacchi 		plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
15209d26e4fcSRobert Mustacchi 		    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
15219d26e4fcSRobert Mustacchi 
15229d26e4fcSRobert Mustacchi 		ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
15239d26e4fcSRobert Mustacchi 		    I40E_RXD_QW1_PTYPE_SHIFT;
15249d26e4fcSRobert Mustacchi 
15259d26e4fcSRobert Mustacchi 		/*
15269d26e4fcSRobert Mustacchi 		 * This packet contains valid data. We should check to see if
15279d26e4fcSRobert Mustacchi 		 * we're actually going to consume it based on its length (to
15289d26e4fcSRobert Mustacchi 		 * ensure that we don't overshoot our quota). We determine
15299d26e4fcSRobert Mustacchi 		 * whether to bcopy or bind the DMA resources based on the size
15309d26e4fcSRobert Mustacchi 		 * of the frame. However, on debug builds, we allow it to be
15319d26e4fcSRobert Mustacchi 		 * overridden for testing purposes.
15329d26e4fcSRobert Mustacchi 		 *
15339d26e4fcSRobert Mustacchi 		 * Frames of at least i40e_rx_dma_min bytes are bound; anything
15349d26e4fcSRobert Mustacchi 		 * smaller, or any frame for which binding fails, falls back to
15359d26e4fcSRobert Mustacchi 		 * the bcopy path.
15369d26e4fcSRobert Mustacchi 		 */
15379d26e4fcSRobert Mustacchi 
15389d26e4fcSRobert Mustacchi 		/*
15399d26e4fcSRobert Mustacchi 		 * Ensure we don't exceed our polling quota by reading this
15409d26e4fcSRobert Mustacchi 		 * frame. Note we only bump bytes now, we bump frames later.
15419d26e4fcSRobert Mustacchi 		 */
15429d26e4fcSRobert Mustacchi 		if ((poll_bytes != I40E_POLL_NULL) &&
15439d26e4fcSRobert Mustacchi 		    (rx_bytes + plen) > poll_bytes)
15449d26e4fcSRobert Mustacchi 			break;
15459d26e4fcSRobert Mustacchi 		rx_bytes += plen;
15469d26e4fcSRobert Mustacchi 
15479d26e4fcSRobert Mustacchi 		mp = NULL;
15489d26e4fcSRobert Mustacchi 		if (plen >= i40e->i40e_rx_dma_min)
15499d26e4fcSRobert Mustacchi 			mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
15509d26e4fcSRobert Mustacchi 		if (mp == NULL)
15519d26e4fcSRobert Mustacchi 			mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
15529d26e4fcSRobert Mustacchi 
15539d26e4fcSRobert Mustacchi 		if (mp != NULL) {
15549d26e4fcSRobert Mustacchi 			if (i40e->i40e_rx_hcksum_enable)
15559d26e4fcSRobert Mustacchi 				i40e_rx_hcksum(itrq, mp, stword, error, ptype);
15569d26e4fcSRobert Mustacchi 			*mp_tail = mp;
15579d26e4fcSRobert Mustacchi 			mp_tail = &mp->b_next;
15589d26e4fcSRobert Mustacchi 		}
15599d26e4fcSRobert Mustacchi 
15609d26e4fcSRobert Mustacchi 		/*
15619d26e4fcSRobert Mustacchi 		 * Now we need to prepare this frame for use again. See the
15629d26e4fcSRobert Mustacchi 		 * discussion in the big theory statements.
15639d26e4fcSRobert Mustacchi 		 *
15649d26e4fcSRobert Mustacchi 		 * Whether we bound or bcopied the frame above, the work list
15659d26e4fcSRobert Mustacchi 		 * entry at the current index always holds a buffer that is
15669d26e4fcSRobert Mustacchi 		 * ready for reuse: the bind path swapped in a replacement rcb
15679d26e4fcSRobert Mustacchi 		 * and the bcopy path never gave the buffer away. So we can just
15689d26e4fcSRobert Mustacchi 		 * use the current index to re-arm this descriptor.
15699d26e4fcSRobert Mustacchi 		 */
15709d26e4fcSRobert Mustacchi discard:
15719d26e4fcSRobert Mustacchi 		rcb = rxd->rxd_work_list[cur_head];
15729d26e4fcSRobert Mustacchi 		cur_desc->read.pkt_addr =
15739d26e4fcSRobert Mustacchi 		    CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
15749d26e4fcSRobert Mustacchi 		cur_desc->read.hdr_addr = 0;
15759d26e4fcSRobert Mustacchi 
15769d26e4fcSRobert Mustacchi 		/*
15779d26e4fcSRobert Mustacchi 		 * Finally, update our loop invariants.
15789d26e4fcSRobert Mustacchi 		 */
15799d26e4fcSRobert Mustacchi 		cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
15809d26e4fcSRobert Mustacchi 		cur_desc = &rxd->rxd_desc_ring[cur_head];
15819d26e4fcSRobert Mustacchi 		stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
15829d26e4fcSRobert Mustacchi 
15839d26e4fcSRobert Mustacchi 		/*
15849d26e4fcSRobert Mustacchi 		 * To help provide liveness, we limit the amount of data that
15859d26e4fcSRobert Mustacchi 		 * we'll end up counting. Note that in these cases, an interrupt
15869d26e4fcSRobert Mustacchi 		 * is not dissimilar from a polling request.
15879d26e4fcSRobert Mustacchi 		 */
15889d26e4fcSRobert Mustacchi 		rx_frames++;
15899d26e4fcSRobert Mustacchi 		if (rx_frames > i40e->i40e_rx_limit_per_intr) {
15909d26e4fcSRobert Mustacchi 			itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
15919d26e4fcSRobert Mustacchi 			break;
15929d26e4fcSRobert Mustacchi 		}
15939d26e4fcSRobert Mustacchi 	}
15949d26e4fcSRobert Mustacchi 
15959d26e4fcSRobert Mustacchi 	/*
15969d26e4fcSRobert Mustacchi 	 * As we've modified the ring, we need to make sure that we sync the
15979d26e4fcSRobert Mustacchi 	 * descriptor ring for the device. Next, we update the hardware and
15989d26e4fcSRobert Mustacchi 	 * update our notion of where the head for us to read from hardware is
15999d26e4fcSRobert Mustacchi 	 * next.
16009d26e4fcSRobert Mustacchi 	 */
16019d26e4fcSRobert Mustacchi 	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
16029d26e4fcSRobert Mustacchi 	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
16039d26e4fcSRobert Mustacchi 	    DDI_FM_OK) {
16049d26e4fcSRobert Mustacchi 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
16059d26e4fcSRobert Mustacchi 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
16069d26e4fcSRobert Mustacchi 	}
16079d26e4fcSRobert Mustacchi 
16089d26e4fcSRobert Mustacchi 	if (rx_frames != 0) {
16099d26e4fcSRobert Mustacchi 		uint32_t tail;
16109d26e4fcSRobert Mustacchi 		ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
16119d26e4fcSRobert Mustacchi 		rxd->rxd_desc_next = cur_head;
16129d26e4fcSRobert Mustacchi 		tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
16139d26e4fcSRobert Mustacchi 
16149d26e4fcSRobert Mustacchi 		I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
16159d26e4fcSRobert Mustacchi 		if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
16169d26e4fcSRobert Mustacchi 			ddi_fm_service_impact(i40e->i40e_dip,
16179d26e4fcSRobert Mustacchi 			    DDI_SERVICE_DEGRADED);
16189d26e4fcSRobert Mustacchi 			atomic_or_32(&i40e->i40e_state, I40E_ERROR);
16199d26e4fcSRobert Mustacchi 		}
16209d26e4fcSRobert Mustacchi 
16219d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
16229d26e4fcSRobert Mustacchi 		itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
16239d26e4fcSRobert Mustacchi 	}
16249d26e4fcSRobert Mustacchi 
16259d26e4fcSRobert Mustacchi #ifdef DEBUG
16269d26e4fcSRobert Mustacchi 	if (rx_frames == 0) {
16279d26e4fcSRobert Mustacchi 		ASSERT(rx_bytes == 0);
16289d26e4fcSRobert Mustacchi 	}
16299d26e4fcSRobert Mustacchi #endif
16309d26e4fcSRobert Mustacchi 
16319d26e4fcSRobert Mustacchi 	return (mp_head);
16329d26e4fcSRobert Mustacchi }
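
/*
 * A small worked example of the tail update above, with a ring size of 8
 * assumed purely for illustration: if descriptors 5, 6 and 7 were consumed,
 * cur_head wraps to 0, rxd_desc_next becomes 0, and
 *
 *	tail = i40e_prev_desc(0, 1, 8) = 7
 *
 * so QRX_TAIL is left pointing at the last descriptor we re-armed, one
 * behind the next descriptor we expect hardware to fill.
 */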
16339d26e4fcSRobert Mustacchi 
16349d26e4fcSRobert Mustacchi /*
16359d26e4fcSRobert Mustacchi  * This function is called by the GLDv3 when it wants to poll on a ring. The
16369d26e4fcSRobert Mustacchi  * only primary difference from when we call this during an interrupt is that we
16379d26e4fcSRobert Mustacchi  * primary difference from when we call this during an interrupt is that we
16389d26e4fcSRobert Mustacchi  */
16399d26e4fcSRobert Mustacchi mblk_t *
16409d26e4fcSRobert Mustacchi i40e_ring_rx_poll(void *arg, int poll_bytes)
16419d26e4fcSRobert Mustacchi {
16429d26e4fcSRobert Mustacchi 	i40e_trqpair_t *itrq = arg;
16439d26e4fcSRobert Mustacchi 	mblk_t *mp;
16449d26e4fcSRobert Mustacchi 
16459d26e4fcSRobert Mustacchi 	ASSERT(poll_bytes > 0);
16469d26e4fcSRobert Mustacchi 	if (poll_bytes == 0)
16479d26e4fcSRobert Mustacchi 		return (NULL);
16489d26e4fcSRobert Mustacchi 
16499d26e4fcSRobert Mustacchi 	mutex_enter(&itrq->itrq_rx_lock);
16509d26e4fcSRobert Mustacchi 	mp = i40e_ring_rx(itrq, poll_bytes);
16519d26e4fcSRobert Mustacchi 	mutex_exit(&itrq->itrq_rx_lock);
16529d26e4fcSRobert Mustacchi 
16539d26e4fcSRobert Mustacchi 	return (mp);
16549d26e4fcSRobert Mustacchi }
16559d26e4fcSRobert Mustacchi 
16569d26e4fcSRobert Mustacchi /*
16579d26e4fcSRobert Mustacchi  * Attempt to put together the information we'll need to feed into a descriptor
16589d26e4fcSRobert Mustacchi  * to properly program the hardware for checksum offload as well as the
16599d26e4fcSRobert Mustacchi  * generally required flags.
16609d26e4fcSRobert Mustacchi  *
166109aee612SRyan Zezeski  * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
166209aee612SRyan Zezeski  * 'or' into the descriptor based on the checksum flags for this mblk_t and the
16639d26e4fcSRobert Mustacchi  * actual information we care about.
166409aee612SRyan Zezeski  *
166509aee612SRyan Zezeski  * If the mblk requires LSO then we'll also gather the information that will be
166609aee612SRyan Zezeski  * used to construct the Transmit Context Descriptor.
16679d26e4fcSRobert Mustacchi  */
16689d26e4fcSRobert Mustacchi static int
16699d26e4fcSRobert Mustacchi i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
167009aee612SRyan Zezeski     mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
16719d26e4fcSRobert Mustacchi {
167209aee612SRyan Zezeski 	uint32_t chkflags, start, mss, lsoflags;
16739d26e4fcSRobert Mustacchi 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
16749d26e4fcSRobert Mustacchi 
16759d26e4fcSRobert Mustacchi 	bzero(tctx, sizeof (i40e_tx_context_t));
16769d26e4fcSRobert Mustacchi 
16779d26e4fcSRobert Mustacchi 	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
16789d26e4fcSRobert Mustacchi 		return (0);
16799d26e4fcSRobert Mustacchi 
168009aee612SRyan Zezeski 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
168109aee612SRyan Zezeski 	mac_lso_get(mp, &mss, &lsoflags);
16829d26e4fcSRobert Mustacchi 
168309aee612SRyan Zezeski 	if (chkflags == 0 && lsoflags == 0)
168409aee612SRyan Zezeski 		return (0);
16859d26e4fcSRobert Mustacchi 
16869d26e4fcSRobert Mustacchi 	/*
16879d26e4fcSRobert Mustacchi 	 * Have we been asked to checksum an IPv4 header. If so, verify that we
16889d26e4fcSRobert Mustacchi 	 * have sufficient information and then set the proper fields in the
16899d26e4fcSRobert Mustacchi 	 * command structure.
16909d26e4fcSRobert Mustacchi 	 */
169109aee612SRyan Zezeski 	if (chkflags & HCK_IPV4_HDRCKSUM) {
169209aee612SRyan Zezeski 		if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
16939d26e4fcSRobert Mustacchi 			txs->itxs_hck_nol2info.value.ui64++;
16949d26e4fcSRobert Mustacchi 			return (-1);
16959d26e4fcSRobert Mustacchi 		}
169609aee612SRyan Zezeski 		if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
16979d26e4fcSRobert Mustacchi 			txs->itxs_hck_nol3info.value.ui64++;
16989d26e4fcSRobert Mustacchi 			return (-1);
16999d26e4fcSRobert Mustacchi 		}
170009aee612SRyan Zezeski 		if (meo->meoi_l3proto != ETHERTYPE_IP) {
17019d26e4fcSRobert Mustacchi 			txs->itxs_hck_badl3.value.ui64++;
17029d26e4fcSRobert Mustacchi 			return (-1);
17039d26e4fcSRobert Mustacchi 		}
170409aee612SRyan Zezeski 		tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
170509aee612SRyan Zezeski 		tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
17069d26e4fcSRobert Mustacchi 		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
170709aee612SRyan Zezeski 		tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
17089d26e4fcSRobert Mustacchi 		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
17099d26e4fcSRobert Mustacchi 	}
17109d26e4fcSRobert Mustacchi 
17119d26e4fcSRobert Mustacchi 	/*
17129d26e4fcSRobert Mustacchi 	 * We've been asked to provide an L4 checksum. First, set up the IP
17139d26e4fcSRobert Mustacchi 	 * information in the descriptor if we haven't already, before moving
17149d26e4fcSRobert Mustacchi 	 * on to seeing if we have enough information for the L4 checksum
17159d26e4fcSRobert Mustacchi 	 * offload.
17169d26e4fcSRobert Mustacchi 	 */
171709aee612SRyan Zezeski 	if (chkflags & HCK_PARTIALCKSUM) {
171809aee612SRyan Zezeski 		if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
17199d26e4fcSRobert Mustacchi 			txs->itxs_hck_nol4info.value.ui64++;
17209d26e4fcSRobert Mustacchi 			return (-1);
17219d26e4fcSRobert Mustacchi 		}
17229d26e4fcSRobert Mustacchi 
172309aee612SRyan Zezeski 		if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
172409aee612SRyan Zezeski 			if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
17259d26e4fcSRobert Mustacchi 				txs->itxs_hck_nol2info.value.ui64++;
17269d26e4fcSRobert Mustacchi 				return (-1);
17279d26e4fcSRobert Mustacchi 			}
172809aee612SRyan Zezeski 			if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
17299d26e4fcSRobert Mustacchi 				txs->itxs_hck_nol3info.value.ui64++;
17309d26e4fcSRobert Mustacchi 				return (-1);
17319d26e4fcSRobert Mustacchi 			}
17329d26e4fcSRobert Mustacchi 
173309aee612SRyan Zezeski 			if (meo->meoi_l3proto == ETHERTYPE_IP) {
173409aee612SRyan Zezeski 				tctx->itc_data_cmdflags |=
17359d26e4fcSRobert Mustacchi 				    I40E_TX_DESC_CMD_IIPT_IPV4;
173609aee612SRyan Zezeski 			} else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
173709aee612SRyan Zezeski 				tctx->itc_data_cmdflags |=
17389d26e4fcSRobert Mustacchi 				    I40E_TX_DESC_CMD_IIPT_IPV6;
17399d26e4fcSRobert Mustacchi 			} else {
17409d26e4fcSRobert Mustacchi 				txs->itxs_hck_badl3.value.ui64++;
17419d26e4fcSRobert Mustacchi 				return (-1);
17429d26e4fcSRobert Mustacchi 			}
174309aee612SRyan Zezeski 			tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
17449d26e4fcSRobert Mustacchi 			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
174509aee612SRyan Zezeski 			tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
17469d26e4fcSRobert Mustacchi 			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
17479d26e4fcSRobert Mustacchi 		}
17489d26e4fcSRobert Mustacchi 
174909aee612SRyan Zezeski 		switch (meo->meoi_l4proto) {
17509d26e4fcSRobert Mustacchi 		case IPPROTO_TCP:
175109aee612SRyan Zezeski 			tctx->itc_data_cmdflags |=
175209aee612SRyan Zezeski 			    I40E_TX_DESC_CMD_L4T_EOFT_TCP;
17539d26e4fcSRobert Mustacchi 			break;
17549d26e4fcSRobert Mustacchi 		case IPPROTO_UDP:
175509aee612SRyan Zezeski 			tctx->itc_data_cmdflags |=
175609aee612SRyan Zezeski 			    I40E_TX_DESC_CMD_L4T_EOFT_UDP;
17579d26e4fcSRobert Mustacchi 			break;
17589d26e4fcSRobert Mustacchi 		case IPPROTO_SCTP:
175909aee612SRyan Zezeski 			tctx->itc_data_cmdflags |=
176009aee612SRyan Zezeski 			    I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
17619d26e4fcSRobert Mustacchi 			break;
17629d26e4fcSRobert Mustacchi 		default:
17639d26e4fcSRobert Mustacchi 			txs->itxs_hck_badl4.value.ui64++;
17649d26e4fcSRobert Mustacchi 			return (-1);
17659d26e4fcSRobert Mustacchi 		}
17669d26e4fcSRobert Mustacchi 
176709aee612SRyan Zezeski 		tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
17689d26e4fcSRobert Mustacchi 		    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
17699d26e4fcSRobert Mustacchi 	}
17709d26e4fcSRobert Mustacchi 
177109aee612SRyan Zezeski 	if (lsoflags & HW_LSO) {
177209aee612SRyan Zezeski 		/*
177309aee612SRyan Zezeski 		 * LSO requires that checksum offloads are enabled.  If for
177409aee612SRyan Zezeski 		 * some reason they're not, we bail out with an error.
177509aee612SRyan Zezeski 		 */
177685f496faSRobert Mustacchi 		if ((meo->meoi_l3proto == ETHERTYPE_IP &&
177785f496faSRobert Mustacchi 		    (chkflags & HCK_IPV4_HDRCKSUM) == 0) ||
177809aee612SRyan Zezeski 		    (chkflags & HCK_PARTIALCKSUM) == 0) {
177909aee612SRyan Zezeski 			txs->itxs_lso_nohck.value.ui64++;
178009aee612SRyan Zezeski 			return (-1);
178109aee612SRyan Zezeski 		}
178209aee612SRyan Zezeski 
178309aee612SRyan Zezeski 		tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
178409aee612SRyan Zezeski 		tctx->itc_ctx_mss = mss;
178509aee612SRyan Zezeski 		tctx->itc_ctx_tsolen = msgsize(mp) -
178609aee612SRyan Zezeski 		    (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
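		/*
		 * For illustration (header sizes assumed): a 16KB LSO send
		 * with a 14-byte MAC header, 20-byte IPv4 header and 20-byte
		 * TCP header has msgsize(mp) == 16438, so itc_ctx_tsolen is
		 * 16438 - 54 = 16384; the TSO length covers payload only,
		 * never the headers.
		 */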
178709aee612SRyan Zezeski 	}
178809aee612SRyan Zezeski 
17899d26e4fcSRobert Mustacchi 	return (0);
17909d26e4fcSRobert Mustacchi }
17919d26e4fcSRobert Mustacchi 
17929d26e4fcSRobert Mustacchi static void
17939d26e4fcSRobert Mustacchi i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
17949d26e4fcSRobert Mustacchi {
17959d26e4fcSRobert Mustacchi 	ASSERT(tcb != NULL);
17969d26e4fcSRobert Mustacchi 
17979d26e4fcSRobert Mustacchi 	mutex_enter(&itrq->itrq_tcb_lock);
17989d26e4fcSRobert Mustacchi 	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
17999d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
18009d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_free++;
18019d26e4fcSRobert Mustacchi 	mutex_exit(&itrq->itrq_tcb_lock);
18029d26e4fcSRobert Mustacchi }
18039d26e4fcSRobert Mustacchi 
18049d26e4fcSRobert Mustacchi static i40e_tx_control_block_t *
18059d26e4fcSRobert Mustacchi i40e_tcb_alloc(i40e_trqpair_t *itrq)
18069d26e4fcSRobert Mustacchi {
18079d26e4fcSRobert Mustacchi 	i40e_tx_control_block_t *ret;
18089d26e4fcSRobert Mustacchi 
18099d26e4fcSRobert Mustacchi 	mutex_enter(&itrq->itrq_tcb_lock);
18109d26e4fcSRobert Mustacchi 	if (itrq->itrq_tcb_free == 0) {
18119d26e4fcSRobert Mustacchi 		mutex_exit(&itrq->itrq_tcb_lock);
18129d26e4fcSRobert Mustacchi 		return (NULL);
18139d26e4fcSRobert Mustacchi 	}
18149d26e4fcSRobert Mustacchi 
18159d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_free--;
18169d26e4fcSRobert Mustacchi 	ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
18179d26e4fcSRobert Mustacchi 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
18189d26e4fcSRobert Mustacchi 	mutex_exit(&itrq->itrq_tcb_lock);
18199d26e4fcSRobert Mustacchi 
18209d26e4fcSRobert Mustacchi 	ASSERT(ret != NULL);
18219d26e4fcSRobert Mustacchi 	return (ret);
18229d26e4fcSRobert Mustacchi }
18239d26e4fcSRobert Mustacchi 
18249d26e4fcSRobert Mustacchi /*
18259d26e4fcSRobert Mustacchi  * This should be used to free any DMA resources, associated mblk_t's, etc. It's
18269d26e4fcSRobert Mustacchi  * used as part of recycling the message blocks when we have either an interrupt
18279d26e4fcSRobert Mustacchi  * or other activity that indicates that we need to take a look.
18289d26e4fcSRobert Mustacchi  */
18299d26e4fcSRobert Mustacchi static void
18309d26e4fcSRobert Mustacchi i40e_tcb_reset(i40e_tx_control_block_t *tcb)
18319d26e4fcSRobert Mustacchi {
18329d26e4fcSRobert Mustacchi 	switch (tcb->tcb_type) {
18339d26e4fcSRobert Mustacchi 	case I40E_TX_COPY:
18349d26e4fcSRobert Mustacchi 		tcb->tcb_dma.dmab_len = 0;
18359d26e4fcSRobert Mustacchi 		break;
18369d26e4fcSRobert Mustacchi 	case I40E_TX_DMA:
183709aee612SRyan Zezeski 		if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
183809aee612SRyan Zezeski 			(void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
183909aee612SRyan Zezeski 		else if (tcb->tcb_bind_ncookies > 0)
184009aee612SRyan Zezeski 			(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
184109aee612SRyan Zezeski 		if (tcb->tcb_bind_info != NULL) {
184209aee612SRyan Zezeski 			kmem_free(tcb->tcb_bind_info,
184309aee612SRyan Zezeski 			    tcb->tcb_bind_ncookies *
184409aee612SRyan Zezeski 			    sizeof (struct i40e_dma_bind_info));
184509aee612SRyan Zezeski 		}
184609aee612SRyan Zezeski 		tcb->tcb_bind_info = NULL;
184709aee612SRyan Zezeski 		tcb->tcb_bind_ncookies = 0;
184809aee612SRyan Zezeski 		tcb->tcb_used_lso = B_FALSE;
184909aee612SRyan Zezeski 		break;
185009aee612SRyan Zezeski 	case I40E_TX_DESC:
18519d26e4fcSRobert Mustacchi 		break;
18529d26e4fcSRobert Mustacchi 	case I40E_TX_NONE:
18539d26e4fcSRobert Mustacchi 		/* Cast to pacify lint */
18549d26e4fcSRobert Mustacchi 		panic("trying to free tcb %p with bad type none", (void *)tcb);
18559d26e4fcSRobert Mustacchi 	default:
18569d26e4fcSRobert Mustacchi 		panic("unknown i40e tcb type: %d", tcb->tcb_type);
18579d26e4fcSRobert Mustacchi 	}
18589d26e4fcSRobert Mustacchi 
18599d26e4fcSRobert Mustacchi 	tcb->tcb_type = I40E_TX_NONE;
186009aee612SRyan Zezeski 	if (tcb->tcb_mp != NULL) {
186109aee612SRyan Zezeski 		freemsg(tcb->tcb_mp);
186209aee612SRyan Zezeski 		tcb->tcb_mp = NULL;
186309aee612SRyan Zezeski 	}
18649d26e4fcSRobert Mustacchi 	tcb->tcb_next = NULL;
18659d26e4fcSRobert Mustacchi }
18669d26e4fcSRobert Mustacchi 
18679d26e4fcSRobert Mustacchi /*
18689d26e4fcSRobert Mustacchi  * This is called as part of shutting down to clean up all outstanding
18699d26e4fcSRobert Mustacchi  * descriptors. Similar to recycle, except we don't re-arm anything and instead
18709d26e4fcSRobert Mustacchi  * just return control blocks to the free list.
18719d26e4fcSRobert Mustacchi  */
18729d26e4fcSRobert Mustacchi void
18739d26e4fcSRobert Mustacchi i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
18749d26e4fcSRobert Mustacchi {
18759d26e4fcSRobert Mustacchi 	uint32_t index;
18769d26e4fcSRobert Mustacchi 
18779d26e4fcSRobert Mustacchi 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
18789d26e4fcSRobert Mustacchi 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
18799d26e4fcSRobert Mustacchi 
18809d26e4fcSRobert Mustacchi 	/*
18819d26e4fcSRobert Mustacchi 	 * Because we should have shut down the chip at this point, it should be
18829d26e4fcSRobert Mustacchi 	 * safe to just clean up all the entries between our head and tail.
18839d26e4fcSRobert Mustacchi 	 */
18849d26e4fcSRobert Mustacchi #ifdef	DEBUG
18859d26e4fcSRobert Mustacchi 	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
18869d26e4fcSRobert Mustacchi 	    I40E_QTX_ENA(itrq->itrq_index));
18879d26e4fcSRobert Mustacchi 	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
18889d26e4fcSRobert Mustacchi 	    I40E_QTX_ENA_QENA_STAT_MASK));
18899d26e4fcSRobert Mustacchi #endif
18909d26e4fcSRobert Mustacchi 
18919d26e4fcSRobert Mustacchi 	index = itrq->itrq_desc_head;
18929d26e4fcSRobert Mustacchi 	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
18939d26e4fcSRobert Mustacchi 		i40e_tx_control_block_t *tcb;
18949d26e4fcSRobert Mustacchi 
18959d26e4fcSRobert Mustacchi 		tcb = itrq->itrq_tcb_work_list[index];
189609aee612SRyan Zezeski 		if (tcb != NULL) {
189709aee612SRyan Zezeski 			itrq->itrq_tcb_work_list[index] = NULL;
189809aee612SRyan Zezeski 			i40e_tcb_reset(tcb);
189909aee612SRyan Zezeski 			i40e_tcb_free(itrq, tcb);
190009aee612SRyan Zezeski 		}
19019d26e4fcSRobert Mustacchi 
19029d26e4fcSRobert Mustacchi 		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
19039d26e4fcSRobert Mustacchi 		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
19049d26e4fcSRobert Mustacchi 		itrq->itrq_desc_free++;
19059d26e4fcSRobert Mustacchi 	}
19069d26e4fcSRobert Mustacchi 
19079d26e4fcSRobert Mustacchi 	ASSERT(index == itrq->itrq_desc_tail);
19089d26e4fcSRobert Mustacchi 	itrq->itrq_desc_head = index;
19099d26e4fcSRobert Mustacchi }
19109d26e4fcSRobert Mustacchi 
19119d26e4fcSRobert Mustacchi /*
19129d26e4fcSRobert Mustacchi  * We're here either by hook or by crook. We need to see if there are transmit
19139d26e4fcSRobert Mustacchi  * descriptors available for us to go and clean up and return to the hardware.
19149d26e4fcSRobert Mustacchi  * We may also be blocked, and if so, we should make sure that we let it know
19159d26e4fcSRobert Mustacchi  * we're good to go.
19169d26e4fcSRobert Mustacchi  */
19179d26e4fcSRobert Mustacchi void
19189d26e4fcSRobert Mustacchi i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
19199d26e4fcSRobert Mustacchi {
19209d26e4fcSRobert Mustacchi 	uint32_t wbhead, toclean, count;
19219d26e4fcSRobert Mustacchi 	i40e_tx_control_block_t *tcbhead;
19229d26e4fcSRobert Mustacchi 	i40e_t *i40e = itrq->itrq_i40e;
192309aee612SRyan Zezeski 	uint_t desc_per_tcb, i;
19249d26e4fcSRobert Mustacchi 
19259d26e4fcSRobert Mustacchi 	mutex_enter(&itrq->itrq_tx_lock);
19269d26e4fcSRobert Mustacchi 
19279d26e4fcSRobert Mustacchi 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
19289d26e4fcSRobert Mustacchi 	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
19299d26e4fcSRobert Mustacchi 		if (itrq->itrq_tx_blocked == B_TRUE) {
19309d26e4fcSRobert Mustacchi 			itrq->itrq_tx_blocked = B_FALSE;
19319d26e4fcSRobert Mustacchi 			mac_tx_ring_update(i40e->i40e_mac_hdl,
19329d26e4fcSRobert Mustacchi 			    itrq->itrq_mactxring);
19339d26e4fcSRobert Mustacchi 			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
19349d26e4fcSRobert Mustacchi 		}
19359d26e4fcSRobert Mustacchi 		mutex_exit(&itrq->itrq_tx_lock);
19369d26e4fcSRobert Mustacchi 		return;
19379d26e4fcSRobert Mustacchi 	}
19389d26e4fcSRobert Mustacchi 
19399d26e4fcSRobert Mustacchi 	/*
19409d26e4fcSRobert Mustacchi 	 * Now we need to see if there's anything available. The hardware
19419d26e4fcSRobert Mustacchi 	 * writes the completed descriptor head back to this location and it
19429d26e4fcSRobert Mustacchi 	 * guarantees that it does not use relaxed ordering.
19439d26e4fcSRobert Mustacchi 	 */
19449d26e4fcSRobert Mustacchi 	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
19459d26e4fcSRobert Mustacchi 	    (uintptr_t)itrq->itrq_desc_wbhead,
19469d26e4fcSRobert Mustacchi 	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
19479d26e4fcSRobert Mustacchi 
19489d26e4fcSRobert Mustacchi 	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
19499d26e4fcSRobert Mustacchi 	    DDI_FM_OK) {
19509d26e4fcSRobert Mustacchi 		mutex_exit(&itrq->itrq_tx_lock);
19519d26e4fcSRobert Mustacchi 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
19529d26e4fcSRobert Mustacchi 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
19539d26e4fcSRobert Mustacchi 		return;
19549d26e4fcSRobert Mustacchi 	}
19559d26e4fcSRobert Mustacchi 
19569d26e4fcSRobert Mustacchi 	wbhead = *itrq->itrq_desc_wbhead;
19579d26e4fcSRobert Mustacchi 	toclean = itrq->itrq_desc_head;
19589d26e4fcSRobert Mustacchi 	count = 0;
19599d26e4fcSRobert Mustacchi 	tcbhead = NULL;
19609d26e4fcSRobert Mustacchi 
19619d26e4fcSRobert Mustacchi 	while (toclean != wbhead) {
19629d26e4fcSRobert Mustacchi 		i40e_tx_control_block_t *tcb;
19639d26e4fcSRobert Mustacchi 
19649d26e4fcSRobert Mustacchi 		tcb = itrq->itrq_tcb_work_list[toclean];
19659d26e4fcSRobert Mustacchi 		itrq->itrq_tcb_work_list[toclean] = NULL;
19669d26e4fcSRobert Mustacchi 		ASSERT(tcb != NULL);
19679d26e4fcSRobert Mustacchi 		tcb->tcb_next = tcbhead;
19689d26e4fcSRobert Mustacchi 		tcbhead = tcb;
19699d26e4fcSRobert Mustacchi 
19709d26e4fcSRobert Mustacchi 		/*
197109aee612SRyan Zezeski 		 * In the DMA bind case, there may not necessarily be a 1:1
197209aee612SRyan Zezeski 		 * mapping between tcb's and descriptors.  If the tcb type
197309aee612SRyan Zezeski 		 * indicates a DMA binding then check the number of DMA
197409aee612SRyan Zezeski 		 * cookies to determine how many entries to clean in the
197509aee612SRyan Zezeski 		 * descriptor ring.
19769d26e4fcSRobert Mustacchi 		 */
197709aee612SRyan Zezeski 		if (tcb->tcb_type == I40E_TX_DMA)
197809aee612SRyan Zezeski 			desc_per_tcb = tcb->tcb_bind_ncookies;
197909aee612SRyan Zezeski 		else
198009aee612SRyan Zezeski 			desc_per_tcb = 1;
198109aee612SRyan Zezeski 
198209aee612SRyan Zezeski 		for (i = 0; i < desc_per_tcb; i++) {
198309aee612SRyan Zezeski 			/*
198409aee612SRyan Zezeski 			 * We zero this out for sanity purposes.
198509aee612SRyan Zezeski 			 */
198609aee612SRyan Zezeski 			bzero(&itrq->itrq_desc_ring[toclean],
198709aee612SRyan Zezeski 			    sizeof (i40e_tx_desc_t));
198809aee612SRyan Zezeski 			toclean = i40e_next_desc(toclean, 1,
198909aee612SRyan Zezeski 			    itrq->itrq_tx_ring_size);
199009aee612SRyan Zezeski 			count++;
199109aee612SRyan Zezeski 		}
19929d26e4fcSRobert Mustacchi 	}
19939d26e4fcSRobert Mustacchi 
19949d26e4fcSRobert Mustacchi 	itrq->itrq_desc_head = wbhead;
19959d26e4fcSRobert Mustacchi 	itrq->itrq_desc_free += count;
19969d26e4fcSRobert Mustacchi 	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
19979d26e4fcSRobert Mustacchi 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
19989d26e4fcSRobert Mustacchi 
19999d26e4fcSRobert Mustacchi 	if (itrq->itrq_tx_blocked == B_TRUE &&
20009d26e4fcSRobert Mustacchi 	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
20019d26e4fcSRobert Mustacchi 		itrq->itrq_tx_blocked = B_FALSE;
20029d26e4fcSRobert Mustacchi 
20039d26e4fcSRobert Mustacchi 		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
20049d26e4fcSRobert Mustacchi 		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
20059d26e4fcSRobert Mustacchi 	}
20069d26e4fcSRobert Mustacchi 
20079d26e4fcSRobert Mustacchi 	mutex_exit(&itrq->itrq_tx_lock);
20089d26e4fcSRobert Mustacchi 
20099d26e4fcSRobert Mustacchi 	/*
20109d26e4fcSRobert Mustacchi 	 * Now clean up the tcb.
20119d26e4fcSRobert Mustacchi 	 */
20129d26e4fcSRobert Mustacchi 	while (tcbhead != NULL) {
20139d26e4fcSRobert Mustacchi 		i40e_tx_control_block_t *tcb = tcbhead;
20149d26e4fcSRobert Mustacchi 
20159d26e4fcSRobert Mustacchi 		tcbhead = tcb->tcb_next;
20169d26e4fcSRobert Mustacchi 		i40e_tcb_reset(tcb);
20179d26e4fcSRobert Mustacchi 		i40e_tcb_free(itrq, tcb);
20189d26e4fcSRobert Mustacchi 	}
20199d26e4fcSRobert Mustacchi 
20209d26e4fcSRobert Mustacchi 	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
20219d26e4fcSRobert Mustacchi }
20229d26e4fcSRobert Mustacchi 
202309aee612SRyan Zezeski static void
202409aee612SRyan Zezeski i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
202509aee612SRyan Zezeski     const size_t off, const size_t len)
202609aee612SRyan Zezeski {
202709aee612SRyan Zezeski 	const void *soff = mp->b_rptr + off;
202809aee612SRyan Zezeski 	void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
202909aee612SRyan Zezeski 
203009aee612SRyan Zezeski 	ASSERT3U(len, >, 0);
203109aee612SRyan Zezeski 	ASSERT3P(soff, >=, mp->b_rptr);
203209aee612SRyan Zezeski 	ASSERT3P(soff, <=, mp->b_wptr);
203309aee612SRyan Zezeski 	ASSERT3U(len, <=, MBLKL(mp));
203409aee612SRyan Zezeski 	ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
203509aee612SRyan Zezeski 	ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
203609aee612SRyan Zezeski 	bcopy(soff, doff, len);
203709aee612SRyan Zezeski 	tcb->tcb_type = I40E_TX_COPY;
203809aee612SRyan Zezeski 	tcb->tcb_dma.dmab_len += len;
203909aee612SRyan Zezeski 	I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
204009aee612SRyan Zezeski }
204109aee612SRyan Zezeski 
204209aee612SRyan Zezeski static i40e_tx_control_block_t *
204309aee612SRyan Zezeski i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
204409aee612SRyan Zezeski     size_t off, boolean_t use_lso)
204509aee612SRyan Zezeski {
204609aee612SRyan Zezeski 	ddi_dma_handle_t dma_handle;
204709aee612SRyan Zezeski 	ddi_dma_cookie_t dma_cookie;
204809aee612SRyan Zezeski 	uint_t i = 0, ncookies = 0, dmaflags;
204909aee612SRyan Zezeski 	i40e_tx_control_block_t *tcb;
205009aee612SRyan Zezeski 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
205109aee612SRyan Zezeski 
205209aee612SRyan Zezeski 	if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
205309aee612SRyan Zezeski 		txs->itxs_err_notcb.value.ui64++;
205409aee612SRyan Zezeski 		return (NULL);
205509aee612SRyan Zezeski 	}
205609aee612SRyan Zezeski 	tcb->tcb_type = I40E_TX_DMA;
205709aee612SRyan Zezeski 
205809aee612SRyan Zezeski 	if (use_lso == B_TRUE)
205909aee612SRyan Zezeski 		dma_handle = tcb->tcb_lso_dma_handle;
206009aee612SRyan Zezeski 	else
206109aee612SRyan Zezeski 		dma_handle = tcb->tcb_dma_handle;
206209aee612SRyan Zezeski 
206309aee612SRyan Zezeski 	dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
206409aee612SRyan Zezeski 	if (ddi_dma_addr_bind_handle(dma_handle, NULL,
206509aee612SRyan Zezeski 	    (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
206609aee612SRyan Zezeski 	    DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
206709aee612SRyan Zezeski 		txs->itxs_bind_fails.value.ui64++;
206809aee612SRyan Zezeski 		goto bffail;
206909aee612SRyan Zezeski 	}
207009aee612SRyan Zezeski 
207109aee612SRyan Zezeski 	tcb->tcb_bind_ncookies = ncookies;
207209aee612SRyan Zezeski 	tcb->tcb_used_lso = use_lso;
207309aee612SRyan Zezeski 
207409aee612SRyan Zezeski 	tcb->tcb_bind_info =
207509aee612SRyan Zezeski 	    kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
207609aee612SRyan Zezeski 	    KM_NOSLEEP);
207709aee612SRyan Zezeski 	if (tcb->tcb_bind_info == NULL)
207809aee612SRyan Zezeski 		goto bffail;
207909aee612SRyan Zezeski 
208009aee612SRyan Zezeski 	while (i < ncookies) {
208109aee612SRyan Zezeski 		if (i > 0)
208209aee612SRyan Zezeski 			ddi_dma_nextcookie(dma_handle, &dma_cookie);
208309aee612SRyan Zezeski 
208409aee612SRyan Zezeski 		tcb->tcb_bind_info[i].dbi_paddr =
208509aee612SRyan Zezeski 		    (caddr_t)dma_cookie.dmac_laddress;
208609aee612SRyan Zezeski 		tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
208709aee612SRyan Zezeski 	}
208809aee612SRyan Zezeski 
208909aee612SRyan Zezeski 	return (tcb);
209009aee612SRyan Zezeski 
209109aee612SRyan Zezeski bffail:
209209aee612SRyan Zezeski 	i40e_tcb_reset(tcb);
209309aee612SRyan Zezeski 	i40e_tcb_free(itrq, tcb);
209409aee612SRyan Zezeski 	return (NULL);
209509aee612SRyan Zezeski }
209609aee612SRyan Zezeski 
209709aee612SRyan Zezeski static void
209809aee612SRyan Zezeski i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
209909aee612SRyan Zezeski     caddr_t buff, size_t len, boolean_t last_desc)
210009aee612SRyan Zezeski {
210109aee612SRyan Zezeski 	i40e_tx_desc_t *txdesc;
210209aee612SRyan Zezeski 	int cmd;
210309aee612SRyan Zezeski 
210409aee612SRyan Zezeski 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
210509aee612SRyan Zezeski 	itrq->itrq_desc_free--;
210609aee612SRyan Zezeski 	txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
210709aee612SRyan Zezeski 	itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
210809aee612SRyan Zezeski 	    itrq->itrq_tx_ring_size);
210909aee612SRyan Zezeski 
211009aee612SRyan Zezeski 	cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
211109aee612SRyan Zezeski 
211209aee612SRyan Zezeski 	/*
211309aee612SRyan Zezeski 	 * The last data descriptor needs the EOP bit set, so that the HW knows
211409aee612SRyan Zezeski 	 * that we're ready to send.  Additionally, we set the RS (Report
211509aee612SRyan Zezeski 	 * Status) bit, so that we are notified when the transmit engine has
211609aee612SRyan Zezeski 	 * completed DMA'ing all of the data descriptors and data buffers
211709aee612SRyan Zezeski 	 * associated with this frame.
211809aee612SRyan Zezeski 	 */
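	/*
	 * Minimal illustration: for a frame that required three data
	 * descriptors, the first two are written with neither bit set and
	 * only the third (last_desc == B_TRUE) carries EOP | RS.
	 */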
211909aee612SRyan Zezeski 	if (last_desc == B_TRUE) {
212009aee612SRyan Zezeski 		cmd |= I40E_TX_DESC_CMD_EOP;
212109aee612SRyan Zezeski 		cmd |= I40E_TX_DESC_CMD_RS;
212209aee612SRyan Zezeski 	}
212309aee612SRyan Zezeski 
212409aee612SRyan Zezeski 	/*
212509aee612SRyan Zezeski 	 * Per the X710 manual, section 8.4.2.1.1, the buffer size
212609aee612SRyan Zezeski 	 * must be a value from 1 to 16K minus 1, inclusive.
212709aee612SRyan Zezeski 	 */
212809aee612SRyan Zezeski 	ASSERT3U(len, >=, 1);
212909aee612SRyan Zezeski 	ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ);
213009aee612SRyan Zezeski 
213109aee612SRyan Zezeski 	txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
213209aee612SRyan Zezeski 	txdesc->cmd_type_offset_bsz =
213309aee612SRyan Zezeski 	    LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
213409aee612SRyan Zezeski 	    ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
213509aee612SRyan Zezeski 	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
213609aee612SRyan Zezeski 	    ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
213709aee612SRyan Zezeski }
213809aee612SRyan Zezeski 
213909aee612SRyan Zezeski /*
214009aee612SRyan Zezeski  * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
214109aee612SRyan Zezeski  */
214209aee612SRyan Zezeski static inline void
214309aee612SRyan Zezeski tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
214409aee612SRyan Zezeski     i40e_tx_control_block_t *tcb)
214509aee612SRyan Zezeski {
214609aee612SRyan Zezeski 	if (*head == NULL) {
214709aee612SRyan Zezeski 		*head = tcb;
214809aee612SRyan Zezeski 		*tail = *head;
214909aee612SRyan Zezeski 	} else {
215009aee612SRyan Zezeski 		ASSERT3P(*tail, !=, NULL);
215109aee612SRyan Zezeski 		ASSERT3P((*tail)->tcb_next, ==, NULL);
215209aee612SRyan Zezeski 		(*tail)->tcb_next = tcb;
215309aee612SRyan Zezeski 		*tail = tcb;
215409aee612SRyan Zezeski 	}
215509aee612SRyan Zezeski }
215609aee612SRyan Zezeski 
215709aee612SRyan Zezeski /*
215809aee612SRyan Zezeski  * This function takes a single packet, possibly consisting of
215909aee612SRyan Zezeski  * multiple mblks, and creates a TCB chain to send to the controller.
216009aee612SRyan Zezeski  * This TCB chain may span up to a maximum of 8 descriptors. A copy
216109aee612SRyan Zezeski  * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
216209aee612SRyan Zezeski  * more, depending on several factors. For each fragment (individual
216309aee612SRyan Zezeski  * mblk making up the packet), we determine if its size dictates a
216409aee612SRyan Zezeski  * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
216509aee612SRyan Zezeski  * count of descriptors used; when that count reaches the max we force
216609aee612SRyan Zezeski  * all remaining fragments into a single TCB buffer. We have a
216709aee612SRyan Zezeski  * guarantee that the TCB buffer is always larger than the MTU -- so
216809aee612SRyan Zezeski  * there is always enough room. Consecutive fragments below the DMA
216909aee612SRyan Zezeski  * threshold are copied into a single TCB. In the event of an error
217009aee612SRyan Zezeski  * this function returns NULL but leaves 'mp' alone.
217109aee612SRyan Zezeski  */
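/*
 * An illustrative walk-through (fragment sizes and threshold assumed for
 * this example only): suppose a packet arrives as three mblks of 60, 100
 * and 1400 bytes and i40e_tx_dma_min is 256. The 60- and 100-byte
 * fragments fall below the threshold and are copied into one shared copy
 * TCB (one descriptor); the 1400-byte fragment is DMA bound and consumes
 * one descriptor per cookie. Had the running descriptor count ever reached
 * the limit with data still remaining, the rest of the packet would
 * instead have been copied into a single TCB buffer.
 */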
217209aee612SRyan Zezeski static i40e_tx_control_block_t *
217309aee612SRyan Zezeski i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
217409aee612SRyan Zezeski {
217509aee612SRyan Zezeski 	const mblk_t *nmp = mp;
217609aee612SRyan Zezeski 	uint_t needed_desc = 0;
217709aee612SRyan Zezeski 	boolean_t force_copy = B_FALSE;
217809aee612SRyan Zezeski 	i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
217909aee612SRyan Zezeski 	i40e_t *i40e = itrq->itrq_i40e;
218009aee612SRyan Zezeski 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
218109aee612SRyan Zezeski 
218209aee612SRyan Zezeski 	/* TCB buffer is always larger than MTU. */
218309aee612SRyan Zezeski 	ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
218409aee612SRyan Zezeski 
218509aee612SRyan Zezeski 	while (nmp != NULL) {
218609aee612SRyan Zezeski 		const size_t nmp_len = MBLKL(nmp);
218709aee612SRyan Zezeski 
218809aee612SRyan Zezeski 		/* Ignore zero-length mblks. */
218909aee612SRyan Zezeski 		if (nmp_len == 0) {
219009aee612SRyan Zezeski 			nmp = nmp->b_cont;
219109aee612SRyan Zezeski 			continue;
219209aee612SRyan Zezeski 		}
219309aee612SRyan Zezeski 
219409aee612SRyan Zezeski 		if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
219509aee612SRyan Zezeski 			/* Compress consecutive copies into one TCB. */
219609aee612SRyan Zezeski 			if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
219709aee612SRyan Zezeski 				i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
219809aee612SRyan Zezeski 				nmp = nmp->b_cont;
219909aee612SRyan Zezeski 				continue;
220009aee612SRyan Zezeski 			}
220109aee612SRyan Zezeski 
220209aee612SRyan Zezeski 			if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
220309aee612SRyan Zezeski 				txs->itxs_err_notcb.value.ui64++;
220409aee612SRyan Zezeski 				goto fail;
220509aee612SRyan Zezeski 			}
220609aee612SRyan Zezeski 
220709aee612SRyan Zezeski 			/*
220809aee612SRyan Zezeski 			 * TCB DMA buffer is guaranteed to be one
220909aee612SRyan Zezeski 			 * cookie by i40e_alloc_dma_buffer().
221009aee612SRyan Zezeski 			 */
221109aee612SRyan Zezeski 			i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
221209aee612SRyan Zezeski 			needed_desc++;
221309aee612SRyan Zezeski 			tcb_list_append(&tcbhead, &tcbtail, tcb);
221409aee612SRyan Zezeski 		} else {
221509aee612SRyan Zezeski 			uint_t total_desc;
221609aee612SRyan Zezeski 
221709aee612SRyan Zezeski 			tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
221809aee612SRyan Zezeski 			if (tcb == NULL) {
221909aee612SRyan Zezeski 				i40e_error(i40e, "dma bind failed!");
222009aee612SRyan Zezeski 				goto fail;
222109aee612SRyan Zezeski 			}
222209aee612SRyan Zezeski 
222309aee612SRyan Zezeski 			/*
222409aee612SRyan Zezeski 			 * If the new total exceeds the max or we've
222509aee612SRyan Zezeski 			 * reached the limit and there's data left,
222609aee612SRyan Zezeski 			 * then give up binding and copy the rest into
222709aee612SRyan Zezeski 			 * the pre-allocated TCB buffer.
222809aee612SRyan Zezeski 			 */
222909aee612SRyan Zezeski 			total_desc = needed_desc + tcb->tcb_bind_ncookies;
223009aee612SRyan Zezeski 			if ((total_desc > I40E_TX_MAX_COOKIE) ||
223109aee612SRyan Zezeski 			    (total_desc == I40E_TX_MAX_COOKIE &&
223209aee612SRyan Zezeski 			    nmp->b_cont != NULL)) {
223309aee612SRyan Zezeski 				i40e_tcb_reset(tcb);
223409aee612SRyan Zezeski 				i40e_tcb_free(itrq, tcb);
223509aee612SRyan Zezeski 
223609aee612SRyan Zezeski 				if (tcbtail != NULL &&
223709aee612SRyan Zezeski 				    tcbtail->tcb_type == I40E_TX_COPY) {
223809aee612SRyan Zezeski 					tcb = tcbtail;
223909aee612SRyan Zezeski 				} else {
224009aee612SRyan Zezeski 					tcb = NULL;
224109aee612SRyan Zezeski 				}
224209aee612SRyan Zezeski 
224309aee612SRyan Zezeski 				force_copy = B_TRUE;
224409aee612SRyan Zezeski 				txs->itxs_force_copy.value.ui64++;
224509aee612SRyan Zezeski 				continue;
224609aee612SRyan Zezeski 			}
224709aee612SRyan Zezeski 
224809aee612SRyan Zezeski 			needed_desc += tcb->tcb_bind_ncookies;
224909aee612SRyan Zezeski 			tcb_list_append(&tcbhead, &tcbtail, tcb);
225009aee612SRyan Zezeski 		}
225109aee612SRyan Zezeski 
225209aee612SRyan Zezeski 		nmp = nmp->b_cont;
225309aee612SRyan Zezeski 	}
225409aee612SRyan Zezeski 
225509aee612SRyan Zezeski 	ASSERT3P(nmp, ==, NULL);
225609aee612SRyan Zezeski 	ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
225709aee612SRyan Zezeski 	ASSERT3P(tcbhead, !=, NULL);
225809aee612SRyan Zezeski 	*ndesc += needed_desc;
225909aee612SRyan Zezeski 	return (tcbhead);
226009aee612SRyan Zezeski 
226109aee612SRyan Zezeski fail:
226209aee612SRyan Zezeski 	tcb = tcbhead;
226309aee612SRyan Zezeski 	while (tcb != NULL) {
226409aee612SRyan Zezeski 		i40e_tx_control_block_t *next = tcb->tcb_next;
226509aee612SRyan Zezeski 
226609aee612SRyan Zezeski 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
226709aee612SRyan Zezeski 		    tcb->tcb_type == I40E_TX_COPY);
226809aee612SRyan Zezeski 
226909aee612SRyan Zezeski 		tcb->tcb_mp = NULL;
227009aee612SRyan Zezeski 		i40e_tcb_reset(tcb);
227109aee612SRyan Zezeski 		i40e_tcb_free(itrq, tcb);
227209aee612SRyan Zezeski 		tcb = next;
227309aee612SRyan Zezeski 	}
227409aee612SRyan Zezeski 
227509aee612SRyan Zezeski 	return (NULL);
227609aee612SRyan Zezeski }
227709aee612SRyan Zezeski 
227809aee612SRyan Zezeski /*
227909aee612SRyan Zezeski  * Section 8.4.1 of the 700-series programming guide states that a
228009aee612SRyan Zezeski  * segment may span up to 8 data descriptors; including both header
228109aee612SRyan Zezeski  * and payload data. However, empirical evidence shows that the
228209aee612SRyan Zezeski  * controller freezes the Tx queue when presented with a segment of 8
228309aee612SRyan Zezeski  * descriptors. Or, at least, when the first segment contains 8
228409aee612SRyan Zezeski  * descriptors. One explanation is that the controller counts the
228509aee612SRyan Zezeski  * context descriptor against the first segment, even though the
228609aee612SRyan Zezeski  * programming guide makes no mention of such a constraint. In any
228709aee612SRyan Zezeski  * case, we limit TSO segments to 7 descriptors to prevent Tx queue
228809aee612SRyan Zezeski  * freezes. We still allow non-TSO segments to utilize all 8
228909aee612SRyan Zezeski  * descriptors as they have not demonstrated the faulty behavior.
229009aee612SRyan Zezeski  */
229109aee612SRyan Zezeski uint_t i40e_lso_num_descs = 7;
229209aee612SRyan Zezeski 
229309aee612SRyan Zezeski #define	I40E_TCB_LEFT(tcb)				\
229409aee612SRyan Zezeski 	((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
229509aee612SRyan Zezeski 
229609aee612SRyan Zezeski /*
229709aee612SRyan Zezeski  * This function is similar in spirit to i40e_non_lso_chain(), but
229809aee612SRyan Zezeski  * much more complicated in reality. Like the previous function, it
229909aee612SRyan Zezeski  * takes a packet (an LSO packet) as input and returns a chain of
230009aee612SRyan Zezeski  * TCBs. The complication comes with the fact that we are no longer
230109aee612SRyan Zezeski  * trying to fit the entire packet into 8 descriptors, but rather we
230209aee612SRyan Zezeski  * must fit each MSS-size segment of the LSO packet into 8 descriptors.
230309aee612SRyan Zezeski  * Except it's really 7 descriptors, see i40e_lso_num_descs.
230409aee612SRyan Zezeski  *
230509aee612SRyan Zezeski  * Your first inclination might be to verify that a given segment
230609aee612SRyan Zezeski  * spans no more than 7 mblks; but it's actually much more subtle than
230709aee612SRyan Zezeski  * that. First, let's describe what the hardware expects, and then we
230809aee612SRyan Zezeski  * can expound on the software side of things.
230909aee612SRyan Zezeski  *
231009aee612SRyan Zezeski  * For an LSO packet the hardware expects the following:
231109aee612SRyan Zezeski  *
231209aee612SRyan Zezeski  *	o Each MSS-sized segment must span no more than 7 descriptors.
231309aee612SRyan Zezeski  *
231409aee612SRyan Zezeski  *	o The header size does not count towards the segment size.
231509aee612SRyan Zezeski  *
231609aee612SRyan Zezeski  *	o If header and payload share the first descriptor, then the
231709aee612SRyan Zezeski  *	  controller will count the descriptor twice.
231809aee612SRyan Zezeski  *
231909aee612SRyan Zezeski  * The most important thing to keep in mind is that the hardware does
232009aee612SRyan Zezeski  * not view the segments in terms of mblks, like we do. The hardware
232109aee612SRyan Zezeski  * only sees descriptors. It will iterate each descriptor in turn,
232209aee612SRyan Zezeski  * keeping a tally of bytes seen and descriptors visited. If the byte
232309aee612SRyan Zezeski  * count hasn't reached MSS by the time the descriptor count reaches
232409aee612SRyan Zezeski  * 7, then the controller freezes the queue and we are stuck.
232509aee612SRyan Zezeski  * Furthermore, the hardware picks up its tally where it left off. So
232609aee612SRyan Zezeski  * if it reached MSS in the middle of a descriptor, it will start
232709aee612SRyan Zezeski  * tallying the next segment in the middle of that descriptor. The
232809aee612SRyan Zezeski  * hardware's view is entirely removed from the mblk chain or even the
232909aee612SRyan Zezeski  * descriptor layout. Consider these facts:
233009aee612SRyan Zezeski  *
233109aee612SRyan Zezeski  *	o The MSS will vary depending on MTU and other factors.
233209aee612SRyan Zezeski  *
233309aee612SRyan Zezeski  *	o The dblk allocation will sit at various offsets within a
233409aee612SRyan Zezeski  *	  memory page.
233509aee612SRyan Zezeski  *
233609aee612SRyan Zezeski  *	o The page size itself could vary in the future (i.e. not
233709aee612SRyan Zezeski  *	  always 4K).
233809aee612SRyan Zezeski  *
233909aee612SRyan Zezeski  *	o Just because a dblk is virtually contiguous doesn't mean
234009aee612SRyan Zezeski  *	  it's physically contiguous. The number of cookies
234109aee612SRyan Zezeski  *	  (descriptors) required by a DMA bind of a single dblk is at
234209aee612SRyan Zezeski  *	  the mercy of the page size and physical layout.
234309aee612SRyan Zezeski  *
234409aee612SRyan Zezeski  *	o The descriptors will most often NOT start/end on an MSS
234509aee612SRyan Zezeski  *	  boundary. Thus the hardware will often start counting the
234609aee612SRyan Zezeski  *	  MSS mid descriptor and finish mid descriptor.
234709aee612SRyan Zezeski  *
234809aee612SRyan Zezeski  * The upshot of all this is that the driver must learn to think like
234909aee612SRyan Zezeski  * the controller; and verify that none of the constraints are broken.
235009aee612SRyan Zezeski  * It does this by tallying up the segment just like the hardware
235109aee612SRyan Zezeski  * would. This is handled by the two variables 'segsz' and 'segdesc'.
235209aee612SRyan Zezeski  * After each attempt to bind a dblk, we check the constraints. If
235309aee612SRyan Zezeski  * violated, we undo the DMA and force a copy until MSS is met. We
235409aee612SRyan Zezeski  * have a guarantee that the TCB buffer is larger than MTU; thus
235509aee612SRyan Zezeski  * ensuring we can always meet the MSS with a single copy buffer. We
235609aee612SRyan Zezeski  * also copy consecutive non-DMA fragments into the same TCB buffer.
235709aee612SRyan Zezeski  */
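/*
 * A worked example of the tally (numbers assumed for illustration only):
 * with an MSS of 1460, suppose a DMA bind produces cookies of 900 and 800
 * bytes. After the first cookie segsz is 900 and segdesc has been
 * incremented by one; after the second, segsz is 1700 >= 1460, so the
 * controller finished a segment mid-cookie and the counters restart as
 * segsz = 1700 % 1460 = 240 with segdesc = 1 (the remainder lives in the
 * descriptor we are still walking). Only if segdesc reaches
 * i40e_lso_num_descs before MSS is met do we undo the bind and fall back
 * to copying until the MSS boundary is crossed.
 */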
235809aee612SRyan Zezeski static i40e_tx_control_block_t *
235909aee612SRyan Zezeski i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
236009aee612SRyan Zezeski     const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
236109aee612SRyan Zezeski     uint_t *ndesc)
236209aee612SRyan Zezeski {
236309aee612SRyan Zezeski 	size_t mp_len = MBLKL(mp);
236409aee612SRyan Zezeski 	/*
236509aee612SRyan Zezeski 	 * The cpoff (copy offset) variable tracks the offset inside
236609aee612SRyan Zezeski 	 * the current mp. There are cases where the entire mp is not
236709aee612SRyan Zezeski 	 * fully copied in one go: such as the header copy followed by
236809aee612SRyan Zezeski 	 * a non-DMA mblk, or a TCB buffer that only has enough space
236909aee612SRyan Zezeski 	 * to copy part of the current mp.
237009aee612SRyan Zezeski 	 */
237109aee612SRyan Zezeski 	size_t cpoff = 0;
237209aee612SRyan Zezeski 	/*
237309aee612SRyan Zezeski 	 * The segsz and segdesc variables track the controller's view
237409aee612SRyan Zezeski 	 * of the segment. The needed_desc variable tracks the total
237509aee612SRyan Zezeski 	 * number of data descriptors used by the driver.
237609aee612SRyan Zezeski 	 */
237709aee612SRyan Zezeski 	size_t segsz = 0;
237809aee612SRyan Zezeski 	uint_t segdesc = 0;
237909aee612SRyan Zezeski 	uint_t needed_desc = 0;
238009aee612SRyan Zezeski 	size_t hdrcopied = 0;
238109aee612SRyan Zezeski 	const size_t hdrlen =
238209aee612SRyan Zezeski 	    meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
238309aee612SRyan Zezeski 	const size_t mss = tctx->itc_ctx_mss;
238409aee612SRyan Zezeski 	boolean_t force_copy = B_FALSE;
238509aee612SRyan Zezeski 	i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
238609aee612SRyan Zezeski 	i40e_t *i40e = itrq->itrq_i40e;
238709aee612SRyan Zezeski 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
238809aee612SRyan Zezeski 
238909aee612SRyan Zezeski 	/*
239009aee612SRyan Zezeski 	 * We always copy the header in order to avoid more
239109aee612SRyan Zezeski 	 * complicated code dealing with various edge cases.
239209aee612SRyan Zezeski 	 */
239309aee612SRyan Zezeski 	if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
239409aee612SRyan Zezeski 		txs->itxs_err_notcb.value.ui64++;
239509aee612SRyan Zezeski 		goto fail;
239609aee612SRyan Zezeski 	}
239709aee612SRyan Zezeski 
239809aee612SRyan Zezeski 	needed_desc++;
239909aee612SRyan Zezeski 	tcb_list_append(&tcbhead, &tcbtail, tcb);
240009aee612SRyan Zezeski 
240109aee612SRyan Zezeski 	while (hdrcopied < hdrlen) {
240209aee612SRyan Zezeski 		const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
240309aee612SRyan Zezeski 		i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
240409aee612SRyan Zezeski 		hdrcopied += tocopy;
240509aee612SRyan Zezeski 		cpoff += tocopy;
240609aee612SRyan Zezeski 		if (tocopy == mp_len) {
240709aee612SRyan Zezeski 			/*
240809aee612SRyan Zezeski 			 * This is a bit of defensive programming. We
240909aee612SRyan Zezeski 			 * should never have a chain too short to
241009aee612SRyan Zezeski 			 * satisfy the headers -- but just in case.
241109aee612SRyan Zezeski 			 */
241209aee612SRyan Zezeski 			if ((mp = mp->b_cont) == NULL) {
241309aee612SRyan Zezeski 				txs->itxs_tx_short.value.ui64++;
241409aee612SRyan Zezeski 				goto fail;
241509aee612SRyan Zezeski 			}
241609aee612SRyan Zezeski 
241709aee612SRyan Zezeski 			while ((mp_len = MBLKL(mp)) == 0) {
241809aee612SRyan Zezeski 				if ((mp = mp->b_cont) == NULL) {
241909aee612SRyan Zezeski 					txs->itxs_tx_short.value.ui64++;
242009aee612SRyan Zezeski 					goto fail;
242109aee612SRyan Zezeski 				}
242209aee612SRyan Zezeski 			}
242309aee612SRyan Zezeski 			cpoff = 0;
242409aee612SRyan Zezeski 		}
242509aee612SRyan Zezeski 	}
242609aee612SRyan Zezeski 	ASSERT3U(hdrcopied, ==, hdrlen);
242709aee612SRyan Zezeski 
242809aee612SRyan Zezeski 	/*
242909aee612SRyan Zezeski 	 * A single descriptor containing both header and data is
243009aee612SRyan Zezeski 	 * counted twice by the controller.
243109aee612SRyan Zezeski 	 */
243209aee612SRyan Zezeski 	if (mp_len < i40e->i40e_tx_dma_min) {
243309aee612SRyan Zezeski 		segdesc = 2;
243409aee612SRyan Zezeski 	} else {
243509aee612SRyan Zezeski 		segdesc = 1;
243609aee612SRyan Zezeski 	}
243709aee612SRyan Zezeski 
243809aee612SRyan Zezeski 	while (mp != NULL) {
243909aee612SRyan Zezeski 		mp_len = MBLKL(mp);
244009aee612SRyan Zezeski force_copy:
244109aee612SRyan Zezeski 		/* Ignore zero-length mblks. */
244209aee612SRyan Zezeski 		if (mp_len == 0) {
244309aee612SRyan Zezeski 			mp = mp->b_cont;
244409aee612SRyan Zezeski 			cpoff = 0;
244509aee612SRyan Zezeski 			continue;
244609aee612SRyan Zezeski 		}
244709aee612SRyan Zezeski 
244809aee612SRyan Zezeski 		/*
244909aee612SRyan Zezeski 		 * We copy into the preallocated TCB buffer when the
245009aee612SRyan Zezeski 		 * current fragment is less than the DMA threshold OR
245109aee612SRyan Zezeski 		 * when the DMA bind can't meet the controller's
245209aee612SRyan Zezeski 		 * segment descriptor limit.
245309aee612SRyan Zezeski 		 */
245409aee612SRyan Zezeski 		if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
245509aee612SRyan Zezeski 			size_t tocopy;
245609aee612SRyan Zezeski 
245709aee612SRyan Zezeski 			/*
245809aee612SRyan Zezeski 			 * Our objective here is to compress
245909aee612SRyan Zezeski 			 * consecutive copies into one TCB (until it
246009aee612SRyan Zezeski 			 * is full). If there is no current TCB, or if
246109aee612SRyan Zezeski 			 * it is a DMA TCB, then allocate a new one.
246209aee612SRyan Zezeski 			 */
246309aee612SRyan Zezeski 			if (tcb == NULL || tcb->tcb_type != I40E_TX_COPY) {
246509aee612SRyan Zezeski 				if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
246609aee612SRyan Zezeski 					txs->itxs_err_notcb.value.ui64++;
246709aee612SRyan Zezeski 					goto fail;
246809aee612SRyan Zezeski 				}
246909aee612SRyan Zezeski 
247009aee612SRyan Zezeski 				/*
247109aee612SRyan Zezeski 				 * The TCB DMA buffer is guaranteed to
247209aee612SRyan Zezeski 				 * be one cookie by i40e_alloc_dma_buffer().
247309aee612SRyan Zezeski 				 */
247409aee612SRyan Zezeski 				needed_desc++;
247509aee612SRyan Zezeski 				segdesc++;
247609aee612SRyan Zezeski 				ASSERT3U(segdesc, <=, i40e_lso_num_descs);
247709aee612SRyan Zezeski 				tcb_list_append(&tcbhead, &tcbtail, tcb);
247809aee612SRyan Zezeski 			} else if (segdesc == 0) {
247909aee612SRyan Zezeski 				/*
248009aee612SRyan Zezeski 				 * We are copying into an existing TCB
248109aee612SRyan Zezeski 				 * but we just crossed the MSS
248209aee612SRyan Zezeski 				 * boundary. Make sure to increment
248309aee612SRyan Zezeski 				 * segdesc to track the descriptor
248409aee612SRyan Zezeski 				 * count as the hardware would.
248509aee612SRyan Zezeski 				 */
248609aee612SRyan Zezeski 				segdesc++;
248709aee612SRyan Zezeski 			}
248809aee612SRyan Zezeski 
248909aee612SRyan Zezeski 			tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
249009aee612SRyan Zezeski 			i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
249109aee612SRyan Zezeski 			cpoff += tocopy;
249209aee612SRyan Zezeski 			segsz += tocopy;
249309aee612SRyan Zezeski 
249409aee612SRyan Zezeski 			/* We have consumed the current mp. */
249509aee612SRyan Zezeski 			if (cpoff == mp_len) {
249609aee612SRyan Zezeski 				mp = mp->b_cont;
249709aee612SRyan Zezeski 				cpoff = 0;
249809aee612SRyan Zezeski 			}
249909aee612SRyan Zezeski 
250009aee612SRyan Zezeski 			/* We have consumed the current TCB buffer. */
250109aee612SRyan Zezeski 			if (I40E_TCB_LEFT(tcb) == 0) {
250209aee612SRyan Zezeski 				tcb = NULL;
250309aee612SRyan Zezeski 			}
250409aee612SRyan Zezeski 
250509aee612SRyan Zezeski 			/*
250609aee612SRyan Zezeski 			 * We have met MSS with this copy; restart the
250709aee612SRyan Zezeski 			 * counters.
250809aee612SRyan Zezeski 			 */
250909aee612SRyan Zezeski 			if (segsz >= mss) {
251009aee612SRyan Zezeski 				segsz = segsz % mss;
251109aee612SRyan Zezeski 				segdesc = segsz == 0 ? 0 : 1;
251209aee612SRyan Zezeski 				force_copy = B_FALSE;
251309aee612SRyan Zezeski 			}
251409aee612SRyan Zezeski 
251509aee612SRyan Zezeski 			/*
251609aee612SRyan Zezeski 			 * We are at the controller's descriptor
251709aee612SRyan Zezeski 			 * limit; we must copy into the current TCB
251809aee612SRyan Zezeski 			 * until MSS is reached. The TCB buffer is
251909aee612SRyan Zezeski 			 * always bigger than the MTU so we know it is
252009aee612SRyan Zezeski 			 * big enough to meet the MSS.
252109aee612SRyan Zezeski 			 */
252209aee612SRyan Zezeski 			if (segdesc == i40e_lso_num_descs) {
252309aee612SRyan Zezeski 				force_copy = B_TRUE;
252409aee612SRyan Zezeski 			}
252509aee612SRyan Zezeski 		} else {
252609aee612SRyan Zezeski 			uint_t tsegdesc = segdesc;
252709aee612SRyan Zezeski 			size_t tsegsz = segsz;
252809aee612SRyan Zezeski 
252909aee612SRyan Zezeski 			ASSERT(force_copy == B_FALSE);
253009aee612SRyan Zezeski 			ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
253109aee612SRyan Zezeski 
253209aee612SRyan Zezeski 			tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
253309aee612SRyan Zezeski 			if (tcb == NULL) {
253409aee612SRyan Zezeski 				i40e_error(i40e, "dma bind failed!");
253509aee612SRyan Zezeski 				goto fail;
253609aee612SRyan Zezeski 			}
253709aee612SRyan Zezeski 
253809aee612SRyan Zezeski 			for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
253909aee612SRyan Zezeski 				struct i40e_dma_bind_info dbi =
254009aee612SRyan Zezeski 				    tcb->tcb_bind_info[i];
254109aee612SRyan Zezeski 
254209aee612SRyan Zezeski 				tsegsz += dbi.dbi_len;
254309aee612SRyan Zezeski 				tsegdesc++;
254409aee612SRyan Zezeski 				ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
254509aee612SRyan Zezeski 
254609aee612SRyan Zezeski 				/*
254709aee612SRyan Zezeski 				 * We've met the MSS with this portion
254809aee612SRyan Zezeski 				 * of the DMA.
254909aee612SRyan Zezeski 				 */
255009aee612SRyan Zezeski 				if (tsegsz >= mss) {
255109aee612SRyan Zezeski 					tsegsz = tsegsz % mss;
255209aee612SRyan Zezeski 					tsegdesc = tsegsz == 0 ? 0 : 1;
255309aee612SRyan Zezeski 				}
255409aee612SRyan Zezeski 
255509aee612SRyan Zezeski 				/*
255609aee612SRyan Zezeski 				 * We've reached max descriptors but
255709aee612SRyan Zezeski 				 * have not met the MSS. Undo the bind
255809aee612SRyan Zezeski 				 * and instead copy.
255909aee612SRyan Zezeski 				 */
256009aee612SRyan Zezeski 				if (tsegdesc == i40e_lso_num_descs) {
256109aee612SRyan Zezeski 					i40e_tcb_reset(tcb);
256209aee612SRyan Zezeski 					i40e_tcb_free(itrq, tcb);
256309aee612SRyan Zezeski 
256409aee612SRyan Zezeski 					if (tcbtail != NULL &&
256509aee612SRyan Zezeski 					    I40E_TCB_LEFT(tcbtail) > 0 &&
256609aee612SRyan Zezeski 					    tcbtail->tcb_type == I40E_TX_COPY) {
256709aee612SRyan Zezeski 						tcb = tcbtail;
256809aee612SRyan Zezeski 					} else {
256909aee612SRyan Zezeski 						tcb = NULL;
257009aee612SRyan Zezeski 					}
257109aee612SRyan Zezeski 
257209aee612SRyan Zezeski 					/*
257309aee612SRyan Zezeski 					 * Remember, we are still on
257409aee612SRyan Zezeski 					 * the same mp.
257509aee612SRyan Zezeski 					 */
257609aee612SRyan Zezeski 					force_copy = B_TRUE;
257709aee612SRyan Zezeski 					txs->itxs_tso_force_copy.value.ui64++;
257809aee612SRyan Zezeski 					goto force_copy;
257909aee612SRyan Zezeski 				}
258009aee612SRyan Zezeski 			}
258109aee612SRyan Zezeski 
258209aee612SRyan Zezeski 			ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
258309aee612SRyan Zezeski 			ASSERT3U(tsegsz, <, mss);
258409aee612SRyan Zezeski 
258509aee612SRyan Zezeski 			/*
258609aee612SRyan Zezeski 			 * We've made it through the loop without
258709aee612SRyan Zezeski 			 * breaking the segment descriptor contract
258809aee612SRyan Zezeski 			 * with the controller -- replace the segment
258909aee612SRyan Zezeski 			 * tracking values with the temporary ones.
259009aee612SRyan Zezeski 			 */
259109aee612SRyan Zezeski 			segdesc = tsegdesc;
259209aee612SRyan Zezeski 			segsz = tsegsz;
259309aee612SRyan Zezeski 			needed_desc += tcb->tcb_bind_ncookies;
259409aee612SRyan Zezeski 			cpoff = 0;
259509aee612SRyan Zezeski 			tcb_list_append(&tcbhead, &tcbtail, tcb);
259609aee612SRyan Zezeski 			mp = mp->b_cont;
259709aee612SRyan Zezeski 		}
259809aee612SRyan Zezeski 	}
259909aee612SRyan Zezeski 
260009aee612SRyan Zezeski 	ASSERT3P(mp, ==, NULL);
260109aee612SRyan Zezeski 	ASSERT3P(tcbhead, !=, NULL);
260209aee612SRyan Zezeski 	*ndesc += needed_desc;
260309aee612SRyan Zezeski 	return (tcbhead);
260409aee612SRyan Zezeski 
260509aee612SRyan Zezeski fail:
260609aee612SRyan Zezeski 	tcb = tcbhead;
260709aee612SRyan Zezeski 	while (tcb != NULL) {
260809aee612SRyan Zezeski 		i40e_tx_control_block_t *next = tcb->tcb_next;
260909aee612SRyan Zezeski 
261009aee612SRyan Zezeski 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
261109aee612SRyan Zezeski 		    tcb->tcb_type == I40E_TX_COPY);
261209aee612SRyan Zezeski 
261309aee612SRyan Zezeski 		tcb->tcb_mp = NULL;
261409aee612SRyan Zezeski 		i40e_tcb_reset(tcb);
261509aee612SRyan Zezeski 		i40e_tcb_free(itrq, tcb);
261609aee612SRyan Zezeski 		tcb = next;
261709aee612SRyan Zezeski 	}
261809aee612SRyan Zezeski 
261909aee612SRyan Zezeski 	return (NULL);
262009aee612SRyan Zezeski }
262109aee612SRyan Zezeski 
2622*aa2a44afSPaul Winder /*
2623*aa2a44afSPaul Winder  * Keep track of activity through the transmit data path.
2624*aa2a44afSPaul Winder  *
2625*aa2a44afSPaul Winder  * We need to ensure we don't try and transmit when a trqpair has been
2626*aa2a44afSPaul Winder  * stopped, nor do we want to stop a trqpair whilst transmitting.
2627*aa2a44afSPaul Winder  */
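/*
 * A sketch of the expected call pattern (mirroring i40e_ring_tx() below):
 * a transmitting thread proceeds only if i40e_ring_tx_enter() returns
 * B_TRUE, and every exit path -- early drop or normal completion -- must
 * balance it with i40e_ring_tx_exit() (or i40e_ring_tx_exit_nolock() while
 * already holding itrq_tx_lock), so that a pending i40e_ring_tx_quiesce()
 * is eventually signalled.
 */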
2628*aa2a44afSPaul Winder static boolean_t
2629*aa2a44afSPaul Winder i40e_ring_tx_enter(i40e_trqpair_t *itrq)
2630*aa2a44afSPaul Winder {
2631*aa2a44afSPaul Winder 	boolean_t allow;
2632*aa2a44afSPaul Winder 
2633*aa2a44afSPaul Winder 	mutex_enter(&itrq->itrq_tx_lock);
2634*aa2a44afSPaul Winder 	allow = !itrq->itrq_tx_quiesce;
2635*aa2a44afSPaul Winder 	if (allow)
2636*aa2a44afSPaul Winder 		itrq->itrq_tx_active++;
2637*aa2a44afSPaul Winder 	mutex_exit(&itrq->itrq_tx_lock);
2638*aa2a44afSPaul Winder 
2639*aa2a44afSPaul Winder 	return (allow);
2640*aa2a44afSPaul Winder }
2641*aa2a44afSPaul Winder 
2642*aa2a44afSPaul Winder static void
2643*aa2a44afSPaul Winder i40e_ring_tx_exit_nolock(i40e_trqpair_t *itrq)
2644*aa2a44afSPaul Winder {
2645*aa2a44afSPaul Winder 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2646*aa2a44afSPaul Winder 
2647*aa2a44afSPaul Winder 	itrq->itrq_tx_active--;
2648*aa2a44afSPaul Winder 	if (itrq->itrq_tx_quiesce)
2649*aa2a44afSPaul Winder 		cv_signal(&itrq->itrq_tx_cv);
2650*aa2a44afSPaul Winder }
2651*aa2a44afSPaul Winder 
2652*aa2a44afSPaul Winder static void
2653*aa2a44afSPaul Winder i40e_ring_tx_exit(i40e_trqpair_t *itrq)
2654*aa2a44afSPaul Winder {
2655*aa2a44afSPaul Winder 	mutex_enter(&itrq->itrq_tx_lock);
2656*aa2a44afSPaul Winder 	i40e_ring_tx_exit_nolock(itrq);
2657*aa2a44afSPaul Winder 	mutex_exit(&itrq->itrq_tx_lock);
2658*aa2a44afSPaul Winder }
2659*aa2a44afSPaul Winder 
2660*aa2a44afSPaul Winder 
2661*aa2a44afSPaul Winder /*
2662*aa2a44afSPaul Winder  * Tell the transmit path to quiesce and wait until there is no
2663*aa2a44afSPaul Winder  * more activity.
2664*aa2a44afSPaul Winder  * Will return B_TRUE if the transmit path is already quiesced, B_FALSE
2665*aa2a44afSPaul Winder  * otherwise.
2666*aa2a44afSPaul Winder  */
2667*aa2a44afSPaul Winder boolean_t
2668*aa2a44afSPaul Winder i40e_ring_tx_quiesce(i40e_trqpair_t *itrq)
2669*aa2a44afSPaul Winder {
2670*aa2a44afSPaul Winder 	mutex_enter(&itrq->itrq_tx_lock);
2671*aa2a44afSPaul Winder 	if (itrq->itrq_tx_quiesce) {
2672*aa2a44afSPaul Winder 		/*
2673*aa2a44afSPaul Winder 		 * When itrq_tx_quiesce is set, then the ring has already
2674*aa2a44afSPaul Winder 		 * been shutdown.
2675*aa2a44afSPaul Winder 		 */
2676*aa2a44afSPaul Winder 		mutex_exit(&itrq->itrq_tx_lock);
2677*aa2a44afSPaul Winder 		return (B_TRUE);
2678*aa2a44afSPaul Winder 	}
2679*aa2a44afSPaul Winder 
2680*aa2a44afSPaul Winder 	/*
2681*aa2a44afSPaul Winder 	 * Tell any threads in transmit path this trqpair is quiesced and
2682*aa2a44afSPaul Winder 	 * wait until they've all exited the critical code path.
2683*aa2a44afSPaul Winder 	 */
2684*aa2a44afSPaul Winder 	itrq->itrq_tx_quiesce = B_TRUE;
2685*aa2a44afSPaul Winder 	while (itrq->itrq_tx_active > 0)
2686*aa2a44afSPaul Winder 		cv_wait(&itrq->itrq_tx_cv, &itrq->itrq_tx_lock);
2687*aa2a44afSPaul Winder 
2688*aa2a44afSPaul Winder 	mutex_exit(&itrq->itrq_tx_lock);
2689*aa2a44afSPaul Winder 
2690*aa2a44afSPaul Winder 	return (B_FALSE);
2691*aa2a44afSPaul Winder }
2692*aa2a44afSPaul Winder 
26939d26e4fcSRobert Mustacchi /*
26949d26e4fcSRobert Mustacchi  * We've been asked to send a message block on the wire. We'll only have a
26959d26e4fcSRobert Mustacchi  * single chain. There will not be any b_next pointers; however, there may be
269609aee612SRyan Zezeski  * multiple b_cont blocks. The number of b_cont blocks may exceed the
269709aee612SRyan Zezeski  * controller's Tx descriptor limit.
26989d26e4fcSRobert Mustacchi  *
26999d26e4fcSRobert Mustacchi  * We may do one of three things with any given mblk_t chain:
27009d26e4fcSRobert Mustacchi  *
27019d26e4fcSRobert Mustacchi  *   1) Drop it
27029d26e4fcSRobert Mustacchi  *   2) Transmit it
27039d26e4fcSRobert Mustacchi  *   3) Return it
27049d26e4fcSRobert Mustacchi  *
27059d26e4fcSRobert Mustacchi  * If we return it to MAC, then MAC will flow control on our behalf. In other
27069d26e4fcSRobert Mustacchi  * words, it won't send us anything until we tell it that it's okay to send us
27079d26e4fcSRobert Mustacchi  * something.
27089d26e4fcSRobert Mustacchi  */
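/*
 * As a concrete sketch of case 3 (assumed from the blocked/unblocked
 * bookkeeping above rather than spelled out here): when descriptors run
 * low we hand mp back to MAC, and once i40e_tx_recycle_ring() frees enough
 * descriptors it calls mac_tx_ring_update() to tell MAC that transmission
 * may resume on this ring.
 */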
27099d26e4fcSRobert Mustacchi mblk_t *
27109d26e4fcSRobert Mustacchi i40e_ring_tx(void *arg, mblk_t *mp)
27119d26e4fcSRobert Mustacchi {
271209aee612SRyan Zezeski 	size_t msglen;
271309aee612SRyan Zezeski 	i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
271409aee612SRyan Zezeski 	i40e_tx_context_desc_t *ctxdesc;
271509aee612SRyan Zezeski 	mac_ether_offload_info_t meo;
27169d26e4fcSRobert Mustacchi 	i40e_tx_context_t tctx;
271709aee612SRyan Zezeski 	int type;
271809aee612SRyan Zezeski 	uint_t needed_desc = 0;
271909aee612SRyan Zezeski 	boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
27209d26e4fcSRobert Mustacchi 
27219d26e4fcSRobert Mustacchi 	i40e_trqpair_t *itrq = arg;
27229d26e4fcSRobert Mustacchi 	i40e_t *i40e = itrq->itrq_i40e;
27239d26e4fcSRobert Mustacchi 	i40e_hw_t *hw = &i40e->i40e_hw_space;
27249d26e4fcSRobert Mustacchi 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
27259d26e4fcSRobert Mustacchi 
27269d26e4fcSRobert Mustacchi 	ASSERT(mp->b_next == NULL);
27279d26e4fcSRobert Mustacchi 
27289d26e4fcSRobert Mustacchi 	if (!(i40e->i40e_state & I40E_STARTED) ||
27299d26e4fcSRobert Mustacchi 	    (i40e->i40e_state & I40E_OVERTEMP) ||
27309d26e4fcSRobert Mustacchi 	    (i40e->i40e_state & I40E_SUSPENDED) ||
27319d26e4fcSRobert Mustacchi 	    (i40e->i40e_state & I40E_ERROR) ||
2732*aa2a44afSPaul Winder 	    (i40e->i40e_link_state != LINK_STATE_UP) ||
2733*aa2a44afSPaul Winder 	    !i40e_ring_tx_enter(itrq)) {
27349d26e4fcSRobert Mustacchi 		freemsg(mp);
27359d26e4fcSRobert Mustacchi 		return (NULL);
27369d26e4fcSRobert Mustacchi 	}
27379d26e4fcSRobert Mustacchi 
273809aee612SRyan Zezeski 	if (mac_ether_offload_info(mp, &meo) != 0) {
273909aee612SRyan Zezeski 		freemsg(mp);
274009aee612SRyan Zezeski 		itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
2741*aa2a44afSPaul Winder 		i40e_ring_tx_exit(itrq);
274209aee612SRyan Zezeski 		return (NULL);
274309aee612SRyan Zezeski 	}
274409aee612SRyan Zezeski 
27459d26e4fcSRobert Mustacchi 	/*
27469d26e4fcSRobert Mustacchi 	 * Figure out the relevant context about this frame that we might need
274709aee612SRyan Zezeski 	 * for enabling checksum, LSO, etc. This also fills in information that
27489d26e4fcSRobert Mustacchi 	 * we might set around the packet type, etc.
27499d26e4fcSRobert Mustacchi 	 */
275009aee612SRyan Zezeski 	if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
27519d26e4fcSRobert Mustacchi 		freemsg(mp);
27529d26e4fcSRobert Mustacchi 		itrq->itrq_txstat.itxs_err_context.value.ui64++;
2753*aa2a44afSPaul Winder 		i40e_ring_tx_exit(itrq);
27549d26e4fcSRobert Mustacchi 		return (NULL);
27559d26e4fcSRobert Mustacchi 	}
275609aee612SRyan Zezeski 	if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
275709aee612SRyan Zezeski 		use_lso = B_TRUE;
275809aee612SRyan Zezeski 		do_ctx_desc = B_TRUE;
275909aee612SRyan Zezeski 	}
27609d26e4fcSRobert Mustacchi 
27619d26e4fcSRobert Mustacchi 	/*
27629d26e4fcSRobert Mustacchi 	 * For the primordial driver we can punt on doing any recycling right
27639d26e4fcSRobert Mustacchi 	 * now; however, longer term we probably need to do more proactive
276409aee612SRyan Zezeski 	 * recycling to cut back on stalls in the TX path.
27659d26e4fcSRobert Mustacchi 	 */
27669d26e4fcSRobert Mustacchi 
	msglen = msgsize(mp);

	if (do_ctx_desc) {
		/*
		 * If we're doing tunneling or LSO, then we'll need a TX
		 * context descriptor in addition to one or more TX data
		 * descriptors.  Since there's no data DMA block or handle
		 * associated with the context descriptor, we create a special
		 * control block that behaves effectively like a NOP.
		 */
		if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
			txs->itxs_err_notcb.value.ui64++;
			goto txfail;
		}
		tcb_ctx->tcb_type = I40E_TX_DESC;
		needed_desc++;
	}

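	/*
	 * Build the chain of TX control blocks that describes the frame data.
	 * Each control block either copies part of the frame into a
	 * pre-allocated DMA buffer (I40E_TX_COPY) or DMA binds the mblk data
	 * directly (I40E_TX_DMA); the LSO path may use a mix of the two.
	 */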
	if (!use_lso) {
		tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
	} else {
		tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
	}

	if (tcbhead == NULL)
		goto txfail;

	tcbhead->tcb_mp = mp;

	/*
	 * The second condition ensures that 'itrq_desc_tail' never
	 * equals 'itrq_desc_head'. This enforces the rule found in
	 * the second bullet point of section 8.4.3.1.5 of the XL710
	 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
	 * never overlap with the head. This means that we only ever
	 * have 'itrq_tx_ring_size - 1' total available descriptors.
	 */
	mutex_enter(&itrq->itrq_tx_lock);
	if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
	    (itrq->itrq_desc_free - 1) < needed_desc) {
		txs->itxs_err_nodescs.value.ui64++;
		mutex_exit(&itrq->itrq_tx_lock);
		goto txfail;
	}

	if (do_ctx_desc) {
		/*
		 * If we're enabling any offloads for this frame, then we'll
		 * need to build up a transmit context descriptor first.  The
		 * context descriptor needs to be placed in the TX ring before
		 * the data descriptor(s).  See section 8.4.2, table 8-16 of
		 * the XL710 PG.
		 */
		uint_t tail = itrq->itrq_desc_tail;
		itrq->itrq_desc_free--;
		ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
		itrq->itrq_tcb_work_list[tail] = tcb_ctx;
		itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
		    itrq->itrq_tx_ring_size);

		/* QW0 */
		type = I40E_TX_DESC_DTYPE_CONTEXT;
		ctxdesc->tunneling_params = 0;
		ctxdesc->l2tag2 = 0;

		/* QW1 */
		ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
		if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
			ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
			    ((uint64_t)tctx.itc_ctx_cmdflags <<
			    I40E_TXD_CTX_QW1_CMD_SHIFT) |
			    ((uint64_t)tctx.itc_ctx_tsolen <<
			    I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
			    ((uint64_t)tctx.itc_ctx_mss <<
			    I40E_TXD_CTX_QW1_MSS_SHIFT));
		}
	}

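	/*
	 * Walk the control block chain and fill in a data descriptor for each
	 * buffer: one per copy buffer, and one per DMA cookie for bound
	 * buffers. Only the final descriptor of the frame is marked as the
	 * last one.
	 */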
	tcb = tcbhead;
	while (tcb != NULL) {

		itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
		if (tcb->tcb_type == I40E_TX_COPY) {
			boolean_t last_desc = (tcb->tcb_next == NULL);

			i40e_tx_set_data_desc(itrq, &tctx,
			    (caddr_t)tcb->tcb_dma.dmab_dma_address,
			    tcb->tcb_dma.dmab_len, last_desc);
		} else {
			boolean_t last_desc = B_FALSE;
			ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);

			for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
				last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
				    (tcb->tcb_next == NULL);

				i40e_tx_set_data_desc(itrq, &tctx,
				    tcb->tcb_bind_info[c].dbi_paddr,
				    tcb->tcb_bind_info[c].dbi_len,
				    last_desc);
			}
		}

		tcb = tcb->tcb_next;
	}

	/*
	 * Now, finally, sync the DMA data and alert hardware.
	 */
	I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);

	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
	    itrq->itrq_desc_tail);

	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
	    DDI_FM_OK) {
		/*
		 * Note that we can't really clean this up very well here,
		 * because the memory has already been given to the device, so
		 * we just indicate that the frame has been transmitted.
		 */
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
	}

	txs->itxs_bytes.value.ui64 += msglen;
	txs->itxs_packets.value.ui64++;
	txs->itxs_descriptors.value.ui64 += needed_desc;

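	/*
	 * Drop the reference taken by i40e_ring_tx_enter(); the _nolock
	 * variant is used because we already hold the TX lock here.
	 */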
	i40e_ring_tx_exit_nolock(itrq);

	mutex_exit(&itrq->itrq_tx_lock);

	return (NULL);

txfail:
	/*
	 * We ran out of resources. Return the mblk to MAC and indicate that
	 * we'll need to signal MAC once resources free up. If any TCBs were
	 * allocated, return them to the free list now, making sure to reset
	 * their message block pointers, since the mblk goes back to MAC.
	 */
	if (tcb_ctx != NULL) {
		tcb_ctx->tcb_mp = NULL;
		i40e_tcb_reset(tcb_ctx);
		i40e_tcb_free(itrq, tcb_ctx);
	}

	tcb = tcbhead;
	while (tcb != NULL) {
		i40e_tx_control_block_t *next = tcb->tcb_next;

		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
		    tcb->tcb_type == I40E_TX_COPY);

		tcb->tcb_mp = NULL;
		i40e_tcb_reset(tcb);
		i40e_tcb_free(itrq, tcb);
		tcb = next;
	}

	mutex_enter(&itrq->itrq_tx_lock);
	i40e_ring_tx_exit_nolock(itrq);
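	/*
	 * Mark the ring as blocked so that the recycle path knows to notify
	 * MAC once descriptors have been freed and transmission can resume.
	 */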
	itrq->itrq_tx_blocked = B_TRUE;
	mutex_exit(&itrq->itrq_tx_lock);

	return (mp);
}