io/igc/igc.c

/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Intel I225/226 Ethernet Driver
 * ------------------------------
 *
 * This driver implements support for the Intel I225 and I226 Ethernet
 * controllers which support up to 2.5 GbE and generally only support BASE-T
 * copper phys. This device is yet another variant on the venerable Intel 1 GbE
 * devices that are found in e1000g(4D) and igb(4D). This is its own driver in
 * part because that's how Intel did things and refactored their common code
 * which we import and is found in the 'core' directory.
 *
 * There is not a good datasheet for the MAC that we've been able to find for
 * this part. It's not clear that Intel even has a doc for this in their
 * Resource and Design Center. The primary datasheet documents the NVM and other
 * parts of it, but not the software interface. Based on observations from the
 * common code we describe this as somewhat of an evolution of the I217 and
 * I210, with fewer features than the I210, which comes from the server world
 * (which was itself a more stripped down I350).
 *
 * The result of all this is us trying to focus on what we know about this part
 * and making some assumptions along the way. This includes things like:
 *
 * 1) We believe that the device only supports up to 4 RX and TX queues.
 * 2) There is only one TX context for each TX queue and it is mapped to the
 * queue.
 * 3) There is no support for the head writeback modes that we've found.
 * 4) This does otherwise support both the MSI-X and MSI/INTx interrupt
 * management which are shaped very differently in the device.
 * 5) The 2500BASE-T PHY support is unique, but the other PHY settings are
 * roughly the same as far as we can tell.
 *
 * There are certainly more differences than the points up above, but the above
 * are ones that generally influence our design.
 *
 * ------------
 * Organization
 * ------------
 *
 * This driver is first broken into two different pieces. There is the 'core'
 * code which we import from Intel via FreeBSD. All of these sources are in the
 * 'uts/common/io/igc/core' directory and we try our hardest to avoid modifying
 * them (hence the smatch gags). The core code can be thought of as abstracting
 * the MAC, NVM, and PHY across different chipsets (right now it's all the I225)
 * and providing us with a series of library calls that we can do to manage the
 * chip.
 *
 * The remaining files that sit alongside this one implement different portions
 * of functionality related to the device. In particular:
 *
 *  igc.[ch]:		This is the main entry point for the driver and the
 *			source of this block comment. It implements all of the
 *			basic DDI entry points: attach and detach, interrupts,
 *			PCI config space and register set up and tear down.
 *
 *			The header file contains all of the structure
 *			definitions that we use throughout this and the basic
 *			constants we use for sizing.
 *
 *  igc_gld.c		This file implements all of the major GLDv3 required
 *			entry points that are found in mac(9E). The guts of the
 *			actual I/O are in igc_ring.c, but getting and setting
 *			all of the various MAC properties and other bits is
 *			here.
 *
 *  igc_osdep.[ch]	The osdep (OS dependent) files, are used to define and
 *			implement the functionality required by the common code.
 *			igc_osdep.h is included in the build of each file.
 *
 *			We have a second use for igc_osdep.h which is where we
 *			put missing hardware definitions that apply. This is for
 *			cases where the core code doesn't have it and it should
 *			really live in igc_defines.h or igc_regs.h, but we keep
 *			it here to avoid modifying those.
 *
 *  igc_ring.c		This implements the core I/O routines of the device
 *			driver, starting with the descriptor ring setup and tear
 *			down as well as DMA, descriptor ring, and per-frame
 *			memory. It also implements all of the primary logic for
 *			transmitting and receiving frames.
 *
 *  igc_stat.c		This file deals with kstat creation and destruction as
 *			well as reading and fetching all of the registers that
 *			exist in hardware.
 *
 * There are a few primary data structures to be aware of. Their relationships
 * are shown in the following image and then described. Note, each structure has
 * many more fields than those pictured:
 *
 * +---------------+
 * | dev_info_t *  |
 * |              -|-+
 * | private data  | |
 * +---------------+ v
 *   +------------------------------+        +---------------------+
 *   | igc_t                        |        | igc_addr_t          |
 *   | per-instance primary         |  +---->|                     |
 *   | structure                    |  |+--->| Notes a MAC address | ...
 *   |                              |  ||    | stored in hardware  |
 *   | igc_addr_t    *igc_ucast    -|--+|    +---------------------+
 *   | igc_addr_t    *igc_mcast    -|---+      +---------------------------+
 *   | struct igc_hw *igc_hw       -|--------->| struct igc_hw (core code) |
 *   | igc_tx_ring_t *igc_tx_rings -|--+       |                           |
 *   | igc_rx_ring_t *igc_rx_rings -|--|---+   | igc_mac_info mac          |
 *   +------------------------------+  |   |   | igc_fc_info  fc           |
 *                                     |   |   | igc_phy_info phy          |
 *  +----------------------------------+   |   | igc_nvm_info nvm          |
 *  |                                      v   +---------------------------+
 *  |  +--------------------------------------+
 *  |  | igc_rx_ring_t                        |
 *  |  |                                      |
 *  |  | igc_adv_rx_desc *irr_ring         ---|--> rx hw descriptor ring
 *  |  | uint32_t        irr_next          ---|--> next entry to look for data
 *  |  | igc_rx_buffer_t **irr_work_list   ---|--> corresponds to ring entries
 *  |  | uint32_t        irr_nfree         ---|--> number of free list entries
 *  |  | igc_rx_buffer_t **irr_free_list   ---|--> set of buffers free for bind
 *  |  | igc_rx_buffer_t *irr_arena        ---|-+> array of all rx buffers
 *  |  +--------------------------------------+ |
 *  |                                           |
 *  |          +----------------------------+   |
 *  |          | igc_rx_buffer_t            |<--+
 *  |          |                            |
 *  |          | mblk_t            *irb_mp -|---> mblk_t for rx buffer
 *  |          | igc_dma_buffer_t  irb_dma -|---> DMA memory for rx buffer
 *  |          +----------------------------+
 *  |
 *  |   +------------------------------------+
 *  +-->| igc_tx_ring_t                      |
 *      |                                    |
 *      | igc_adv_tx_desc   *itr_ring      --|--> tx hw descriptor ring
 *      | uint32_t          itr_ring_head  --|--> next descriptor to recycle
 *      | uint32_t          itr_ring_tail  --|--> next descriptor to place
 *      | uint32_t          itr_ring_free  --|--> free descriptors in ring
 *      | igc_tx_buffer_t   **itr_work_list  |--> corresponds to ring entries
 *      | list_t            itr_free_list  --|--> available tx buffers
 *      | igc_tx_buffer_t   *itr_arena     --|-+> array of all tx buffers
 *      +------------------------------------+ |
 *                                             |
 *        +---------------------------------+  |
 *        | igc_tx_buffer_t                 |<-+
 *        |                                 |
 *        | mblk_t           *itb_mp      --|--> mblk to tx (only in first)
 *        | igc_dma_buffer_t itb_dma      --|--> tx DMA buffer for copy
 *        | ddi_dma_handle_t itb_bind_hdl --|--> DMA handle for bind
 *        +---------------------------------+
 *
 * igc_t		This is the primary data structure that exists for each
 *			instance of the driver. There is generally a 1:1
 *			relationship between a physical port, an instance of the
 *			driver, and a PCI function. This structure provides
 *			access to the device's registers and it embeds the
 *			common code's struct igc_hw.
 *
 * struct igc_hw	This structure is used by the core code and it contains
 *			information related to the MAC, PHY, NVM, and related
 *			information that the device uses. In general, this
 *			structure is used when performing API calls to the
 *			common code. The common code calls back into us in the
 *			igc_osdep.c interfaces.
 *
 * igc_tx_ring_t	This structure represents a single transmit ring in
 *			hardware, its associated software state, and
 *			miscellaneous data like statistics, MAC handles, etc.
 *			See the 'TX Data Path Design' section for more
 *			information.
 *
 * igc_rx_ring_t	This is the receive variant of a ring. It represents and
 *			tracks the hardware state along with all our metadata.
 *			One of these exists for each receive ring that we've
 *			enabled (currently one). See the 'RX Data Path Design'
 *			section for more information.
 *
 * igc_tx_buffer_t	This represents a single tx buffer in the driver. A tx
 *			buffer contains DMA based storage that it can use to
 *			transmit a packet and contains a second DMA handle that
 *			can be used to bind a specific mblk_t to it. tx buffers
 *			are capped at the current page size and can be smaller
 *			if the maximum packet size is smaller. A 1500 byte MTU
 *			will end up with a 2 KiB buffer due to the device's
 *			internal alignment requirements.
 *
 * igc_rx_buffer_t	This represents a single rx buffer in the driver. These
 *			buffers may be loaned up to MAC and then returned to us
 *			later. They contain a single DMA buffer which right now
 *			is a single contiguous buffer that fits the maximum
 *			packet size. Each buffer has a corresponding mblk_t that
 *			it is mapped to.
 *
 * igc_dma_buffer_t	This represents a DMA buffer in the system. DMA buffers
 *			are used for transmit buffers, receive buffers, or
 *			various ring descriptor entries. The DMA buffer
 *			structure is not inherently limited to a specific number
 *			of cookies. It is always mapped in our virtual address
 *			space and encapsulates the various DDI functions. In
 *			general, one expects to interface with the idb_va member
 *			when needing to access the memory, the idb_size member
 *			when wanting to understand how much memory is in the
 *			buffer, and the idb_hdl member when needing to access
 *			the DMA cookies.
 *
 * igc_addr_t		This represents a 48-bit Ethernet MAC address slot in
 *			the hardware that may or may not be used at any given
 *			point in time.
 *
 * --------------------
 * Rings and Interrupts
 * --------------------
 *
 * The I225/226 controller like the I210 supports up to 4 rx and tx rings. Due
 * to the long history of this controller and its tradition from the e1000g/igb
 * days and much older parts like the 8254x series, it has two entirely
 * different sets of interrupt modes. One where MSI-X is used and a mode where
 * a single MSI or INTx interrupt is used. Currently the driver only supports
 * the MSI-X mode as that gives us more flexibility and due to the fact that the
 * interrupt modes and register handling are different, reduces the complexity
 * in the driver.
 *
 * The hardware uses its IVAR registers to map specific queues to interrupts.
 * Each rx queue and tx queue is mapped to a specific bit position in the IVAR
 * and there is an additional IVAR register for miscellaneous causes like link
 * state changes. While the IVAR register allows for several bits for MSI-X
 * entries, for the most part, it appears that there is only support for values
 * in the range [0, 4] based on the I210 which we believe extends to the I225/6.
 *
 * MSI-X mode causes the device's various interrupt registers to be split into
 * two groups the 'legacy' and 'extended' (sometimes called advanced) ones. The
 * extended ones all start with 'E'. When in MSI-X mode, the EICR (cause), EICS
 * (cause set), EIAC (auto-clear), EIMS (mask set) registers all operate with
 * indexes that refer to the MSI-X. The primary way to temporarily disable
 * interrupts for polling is to remove the given MSI-X from the auto-clear set
 * and to clear it from the enabled mask set.
 *
 * The implication of all of this is that we can only really disable interrupts
 * for polling on a per-MSI-X basis. This generally means that the design for
 * interrupts and rings is that all the tx rings and the link state change
 * events share interrupt 0, while rx rings use interrupts 1-4. Because the x86
 * 'apix' modules end up defaulting to two interrupts to a driver, we end up
 * only supporting a single rx and tx ring for the time being, though the driver
 * is phrased in terms of a variable number of such rings.
 *
 * -------------------
 * RX Data Path Design
 * -------------------
 *
 * The rx data path is based around allocating a fixed number of receive buffers
 * for each ring. We have two goals in the allocation buffer and ring design:
 *
 * 1) We want to make sure that the ring is always full of valid descriptors for
 *    rx to prevent stalls. One implication of this is that we will always
 *    refill a received buffer with a new one and notify the hardware that the
 *    buffer is usable again.
 *
 * 2) We would prefer to not have to copy received memory and instead bind the
 *    DMA memory directly into an mblk_t.
 *
 * To satisfy (1) we need to allocate at least as many rx buffers as there are
 * ring entries. The ring is sized by default to 512 entries, which is a
 * somewhat arbitrary, but common, size. We then say that we want to be able to
 * loan half of our entries up the stack at any given time. This leads to us
 * allocating 1.5x the ring size rx buffers.
 *
 * All of the rx buffers are stored in the irr_arena array. They are then split
 * between the free list and the ring's work list. The work list is an array
 * that is a 1:1 mapping to a location in the descriptor ring. That is index 4
 * of the work list (irr_work_list[4]) corresponds to index 4 of the descriptor
 * ring (irr_ring[4]). However, this may refer to any of the rx descriptors that
 * is in the irr_arena. When we start up the ring, the first ring size entries
 * are all inserted into the work list and then the remaining entries are
 * inserted into the free list.
 *
 * Entries that are in the work list are always given to hardware. We track the
 * next place for us to scan for received packets through the 'irr_next' index
 * into the descriptor ring. When an interrupt fires, we start at irr_next and
 * iterate through the descriptor ring continuing while we find valid, received
 * packets. When we process a packet, we look at two things to consider whether
 * we bind it or copy it to a new mblk_t. The first piece is the received
 * packet's length. If the packet is small, there is not much value in binding
 * it and instead we just allocate and copy a new buffer for the packet.
 *
 * The second is if there are free rx descriptors. To keep goal (1) valid, we
 * only will loan a packet up if there is an entry on the free list that can
 * replace the rx buffer, as otherwise we'd want to make sure we don't stall the
 * ring. If an rx buffer is loaned, the entry on the free list takes its place
 * in the descriptor ring and when the networking stack is finally done with the
 * mblk_t, it'll be returned to us as part of the freemsg()/freeb() destructor.
 * This lifetime is illustrated in the following diagram:
 *
 *
 *    +-------------+                        +-----------+
 *    | Work List   |<---*-------------------| Free List |
 *    | Owned by HW |    . . Used to replace |   Idle    |
 *    +-------------+        loaned buffers  +-----------+
 *      |     | ^                                  ^
 *      |     | . . . Reused if a                  |
 *      |     +-+     copy is done                 . . . Returned to driver via
 *      |                                          |     freemsg() which calls
 *      |                                          |     igc_rx_recycle().
 *      v                                          |
 *    +-------------------+                        |
 *    | Loaned            |------------------------+
 *    | Owned by netstack |
 *    +-------------------+
 *
 * Currently the rx data path uses rx buffers that are equal to the maximum size
 * of a packet (rounded up based on hardware's 1 KiB alignment requirement).
 * This was mostly done for initial simplicity, though it comes at a memory
 * cost. It is possible to design this to be more like the tx subsystem where we
 * use fixed page size buffers and just cons up an mblk_t chain with b_cont
 * pointers.
 *
 * -------------------
 * TX Data Path Design
 * -------------------
 *
 * The tx data path is a bit different in design from the rx data path. When the
 * system wants to tx data there are two fundamental building blocks that we
 * use, both of which leverage the igc_tx_buffer_t:
 *
 * 1) We use the DMA memory that is allocated with the buffer and copy the
 *    mblk_t data into it. This is used when we have small mblk_t's.
 *
 * 2) We utilize the DMA handle that is in the tx buffer (but not the buffer's
 *    DMA memory) to perform DMA binding. This can result in multiple cookies
 *    and therefore descriptors mapping to the single buffer.
 *
 * Because a given tx buffer may end up using more than one descriptor and we
 * have to account for transmit context descriptors, which are used for
 * indicating checksum and segmentation offloads, we end up only allocating a
 * number of transmit buffers equal to the ring size. In addition, the tx data
 * buffer's maximum size is capped at the size of a single page. This is done
 * because we often aren't going to be copying and if we are, we don't need that
 * much more memory. The actual size may be smaller depending on the MTU.
 *
 * The tx descriptor ring is used in a bit of a different way. While part of the
 * reason for this is that we are filling it based on the stack's demands and
 * therefore only need to fill in descriptors when there's a need, the second
 * reason is because of how the hardware reports back events. There are two
 * major kinds of descriptors that can be entered into the ring. There are the
 * aforementioned context descriptors and then data descriptors. While data
 * descriptors support an interrupt on completion, context descriptors do not.
 *
 * When an mblk_t comes in to be transmitted, we walk all of the mblk_t's
 * associated with it via the b_cont pointer. For each one, we look at the size
 * of the data and determine whether or not to perform DMA binding or to copy it
 * into the current tx buffer. A given tx buffer can be used to copy multiple
 * different mblk_t's. Imagine a pathological case where we had a 500 byte
 * packet split into 125 byte chunks, this would end up using a single tx data
 * buffer.  However, if you imagine a large chunk of TCP data, this may be
 * spread across several mblk_t's so we may end up leveraging multiple tx data
 * buffers.
 *
 * The transmit buffers that are available are stored on a free list. This is
 * managed as a list_t as we end up needing to often track groups of descriptors
 * to allocate and free across packet transmit and recycling. We don't count the
 * number of transmit buffers that are free per se, but it generally tracks the
 * number of free descriptors, which we do track. In the worst case there is a
 * 1:1 relationship between buffers and descriptors and more generally it's
 * 1:n, that is, there are multiple descriptors used for a single buffer.
 *
 * The transmit ring is managed through a combination of three integers, the
 * itr_ring_head, the itr_ring_tail, and the itr_ring_free. The ring's tail
 * represents the place where the driver will place new data to transmit. The
 * ring's head represents the first place that we should check for a packet's
 * completion when we're performing recycling (the act of acknowledging what
 * hardware has processed internal to the driver) due to a tx interrupt or
 * manual recycling in the transmit path.
 *
 * When placing a packet as a series of descriptors in the ring we'll end up
 * doing the following:
 *
 * 1) First we determine how to map each mblk_t as mentioned above.
 * 2) This will then be turned into descriptors in the ring. Each tx data buffer
 *    that is used is placed in the itr_work_list at the corresponding index
 *    that they are used in the ring. There is one special case here, if a
 *    context descriptor is used, the first transmit buffer will refer to the
 *    context descriptor's entry (which always comes before data).
 * 3) We'll check whether there are too few descriptors for this packet to fit
 *    into the ring or whether it would exceed our mandatory gap threshold. If
 *    so, then we'll undo all the work we just did and return the mblk_t to MAC
 *    and indicate that the ring is blocked. MAC will be notified later when we
 *    free up transmit descriptors.
 * 4) In the first transmit data buffer we'll store both the mblk_t and then
 *    we'll store what the index of the last descriptor that's used is. This is
 *    important for recycling. We also indicate that the last descriptor should
 *    be the one that reports its status on interrupt completion.
 * 5) We'll notify hardware that there is data for it to transmit by writing to
 *    the ring's tail pointer.
 *
 * This all works reasonably okay, except for the small problem of the bill,
 * which we pay off in the form of recycling. Recycling is going through the
 * ring and seeing which descriptors are free. While the transmit path described
 * above is the only path that is allowed to move the tail, the recycling path
 * is the only one that's allowed to adjust the head.
 *
 * When we perform recycling we look at the current head and its corresponding
 * tx buffer. There will always be a tx buffer in the same index in the
 * itr_work_list[] unless a serious programmer error has occurred. This buffer
 * will tell us what the index to check for completion is via its itb_last_desc
 * member (only valid when itb_first is set to true). If this index indicates
 * that it has been processed by hardware, then we process all entries between
 * here and there.
 *
 * When we process descriptors, we bunch up the transmit descriptors and
 * mblk_t's. We'll reset the transmit descriptor (freeing any DMA binding if
 * used) and append the mblk_t if it exists to be freed in one large
 * freemsgchain() at the end. The fact that we won't free any tx buffers
 * associated with a packet until they're all done is important. This makes
 * sure that any memory that we have bound from the mblk_t remains valid the
 * entire time.
 *
 * If we have freed enough descriptors as part of this to allow mac to send data
 * again, then once we have finished all processing and dropped the lock, we
 * will notify MAC.
 *
 * When we are processing descriptors here we try to avoid holding the itr_lock
 * except for the start and end of the process. This is an important way to
 * ensure that we don't block transmits. Because of this, there can only be one
 * thread performing a recycle at any given time between the interrupt path and
 * the transmit path trying to clean up. This is maintained using the
 * 'itr_recycle' boolean. If a recycle is already in progress then there's
 * generally not much reason to perform one simultaneously and so the caller
 * will just return. This is why the head (and thus returning descriptors) is
 * only used by the recycle path.
 *
 * -------
 * Locking
 * -------
 *
 * Mutexes exist on three different structures in the driver:
 *
 * 1) igc_t (igc_lock)
 * 2) igc_rx_ring_t (irr_lock, irr_free_lock)
 * 3) igc_tx_ring_t (itr_lock)
 *
 * The following rules hold for locking in the driver:
 *
 * 1) One should not hold locks for both the rx rings and tx rings at the same
 *    time. If this is required, please determine if it is absolutely necessary.
 * 2) You should always take the controller's lock ahead of any ring's locks.
 * 3) The general rx ring lock (irr_lock) should be taken ahead of the free list
 *    lock (irr_free_lock) if both are required.
 *
 * -------------------
 * Future Improvements
 * -------------------
 *
 * This driver was initially written with an eye towards getting something that
 * had broad use for folks with this hardware and not towards enabling every
 * feature immediately. Here are some areas that can be improved upon in the
 * driver.
 *
 *  - Multiple ring, RSS support: As the OS changes towards offering more
 *    interrupts or opting to participate in IRM, then you can more easily
 *    offer RSS and related features. This should likely show up as a single
 *    rx group with multiple rings and leverage the tx pseudo-group support.
 *
 *  - TCP segmentation offload support: Right now the driver does not support
 *    TSO. It'd potentially be a useful addition and help out folks. Fetching
 *    information for TSO is in the tx data path right now.
 *
 *  - FMA Support: Currently the driver does not rig up support for FMA.
 *    Participating in that and more generally being able to reset the device
 *    while it is operating in the face of fatal errors would be good.
 *
 *  - TX stall detection: Related to the above, carefully designing a tx stall
 *    detection and resetting the device when that happens would probably be
 *    useful.
 *
 *  - UFM support: Exposing the NVM and PBA (printed board assembly) through the
 *    UFM subsystem would be a good thing to do.
 *
 *  - Dynamic MTU changing: Right now the driver takes advantage of the
 *    simplification of not allowing the MTU to change once the device has been
 *    started. This isn't great, but it is far from the first (igb, e1000g,
 *    ixgbe, etc.) to do this. It would be nice if this was lifted.
 */

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/pci.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/bitext.h>

#include "igc.h"

/*
 * The core code expects the igc_mcast_raw to be a uint8_t packed array. We use
 * the ether_addr_t to make this a little more explicit and easy to reason
 * about, but that means we are relying on an ether_addr_t being exactly six
 * bytes in size. Enforce that assumption at compile time so a change in the
 * type's size breaks the build rather than the multicast table.
 */
CTASSERT(sizeof (ether_addr_t) == 6);

/*
 * Read a 32-bit device register. 'reg' is an offset from igc_regs_base and is
 * asserted to be smaller than igc_regs_size. The access goes through the DDI
 * access handle in igc_regs_hdl and the value read is returned to the caller.
 */
uint32_t
igc_read32(igc_t *igc, uint32_t reg)
{
	ASSERT3U(reg, <, igc->igc_regs_size);

	return (ddi_get32(igc->igc_regs_hdl,
	    (uint32_t *)(igc->igc_regs_base + reg)));
}

/*
 * Write 'val' to a 32-bit device register. 'reg' is an offset from
 * igc_regs_base and is asserted to be smaller than igc_regs_size. The access
 * goes through the DDI access handle in igc_regs_hdl.
 */
void
igc_write32(igc_t *igc, uint32_t reg, uint32_t val)
{
	ASSERT3U(reg, <, igc->igc_regs_size);

	ddi_put32(igc->igc_regs_hdl, (uint32_t *)(igc->igc_regs_base + reg),
	    val);
}

/*
 * Determine whether hardware reports the link as up and ready. The caller must
 * hold igc_lock. Note, this assumes that we're on a copper phy and short
 * circuits a few things. See igb_is_link_up() for what this looks like for
 * non-copper PHYs if that ever becomes relevant.
 *
 * Returns true when the link is considered up and false otherwise.
 */
static bool
igc_link_up(igc_t *igc)
{
	bool still_pending;

	ASSERT(MUTEX_HELD(&igc->igc_lock));

	/*
	 * The core code clears get_link_status once it has seen the link come
	 * up. If it is still set after the check, then we likely need to assume
	 * that the link is down. The return value of the check itself is not
	 * consulted.
	 */
	(void) igc_check_for_link(&igc->igc_hw);
	still_pending = igc->igc_hw.mac.get_link_status;

	return (!still_pending);
}

/*
 * Handle a link state change (LSC) event.
 *
 * This acquires igc_lock itself. While holding it we force the core code to
 * re-check the link, record the resulting link state, speed, and duplex in the
 * igc_t, and then snapshot a set of PHY registers into the igc_t. If the link
 * state that we end up with differs from the one recorded on entry, MAC is
 * informed through mac_link_update() only after igc_lock has been dropped.
 */
static void
igc_intr_lsc(igc_t *igc)
{
	link_state_t orig_state, new_state;
	uint32_t mmd_base;

	mutex_enter(&igc->igc_lock);
	orig_state = igc->igc_link_state;

	/*
	 * Always force a check of the link.
	 */
	igc->igc_hw.mac.get_link_status = true;
	if (igc_link_up(igc)) {
		/*
		 * The return value of the query is deliberately ignored. The
		 * duplex is zeroed up front so that the switch below always
		 * operates on a defined value.
		 */
		uint16_t duplex = 0;

		(void) igc_get_speed_and_duplex(&igc->igc_hw,
		    &igc->igc_link_speed, &duplex);

		switch (duplex) {
		case HALF_DUPLEX:
			igc->igc_link_duplex = LINK_DUPLEX_HALF;
			break;
		case FULL_DUPLEX:
			igc->igc_link_duplex = LINK_DUPLEX_FULL;
			break;
		default:
			igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
			break;
		}
		igc->igc_link_state = LINK_STATE_UP;
	} else {
		igc->igc_link_state = LINK_STATE_DOWN;
		igc->igc_link_speed = 0;
		igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
	}
	new_state = igc->igc_link_state;

	/*
	 * Next, grab a bunch of information from the PHY for future us. These
	 * reads are best effort: a failed read is ignored and each register is
	 * read exactly once.
	 */
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_CONTROL, &igc->igc_phy_ctrl);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_STATUS, &igc->igc_phy_status);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_ADV,
	    &igc->igc_phy_an_adv);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_LP_ABILITY,
	    &igc->igc_phy_lp);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_EXP,
	    &igc->igc_phy_an_exp);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_CTRL,
	    &igc->igc_phy_1000t_ctrl);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_STATUS,
	    &igc->igc_phy_1000t_status);
	(void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS,
	    &igc->igc_phy_ext_status);

	/*
	 * The multi-gigabit auto-negotiation registers are addressed by
	 * combining the auto-negotiation device address with the register
	 * offset.
	 */
	mmd_base = STANDARD_AN_REG_MASK << MMD_DEVADDR_SHIFT;
	(void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_CTRL,
	    &igc->igc_phy_mmd_ctrl);
	(void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_STS1,
	    &igc->igc_phy_mmd_sts);
	mutex_exit(&igc->igc_lock);

	/*
	 * Only bother MAC when something actually changed, and do so without
	 * holding our lock.
	 */
	if (orig_state != new_state) {
		mac_link_update(igc->igc_mac_hdl, new_state);
	}
}

/*
 * Interrupt handler for a receive queue. arg1 is the igc_t and arg2 carries the
 * index of the rx ring that this interrupt services. Received frames are
 * gathered while holding the ring's irr_lock and are handed to MAC after the
 * lock has been dropped. Nothing is gathered here while IGC_RXR_F_POLL is set
 * in the ring's flags. The interrupt is always claimed.
 */
static uint_t
igc_intr_rx_queue(caddr_t arg1, caddr_t arg2)
{
	igc_t *igc = (igc_t *)arg1;
	uintptr_t qidx = (uintptr_t)arg2;
	igc_rx_ring_t *rxr;
	mblk_t *chain;

	ASSERT3U(qidx, <, igc->igc_nrx_rings);
	rxr = &igc->igc_rx_rings[qidx];

	mutex_enter(&rxr->irr_lock);
	if ((rxr->irr_flags & IGC_RXR_F_POLL) != 0) {
		chain = NULL;
	} else {
		chain = igc_ring_rx(rxr, IGC_RX_POLL_INTR);
	}
	mutex_exit(&rxr->irr_lock);

	if (chain == NULL) {
		return (DDI_INTR_CLAIMED);
	}

	mac_rx_ring(igc->igc_mac_hdl, rxr->irr_rh, chain, rxr->irr_gen);
	return (DDI_INTR_CLAIMED);
}

/*
 * Interrupt handler for transmit completions and miscellaneous causes. arg1 is
 * the igc_t; arg2 is not used. The interrupt cause register is read first, tx
 * ring 0 is then recycled unconditionally, and finally a link state change is
 * processed if the cause register indicated one. The interrupt is always
 * claimed.
 */
static uint_t
igc_intr_tx_other(caddr_t arg1, caddr_t arg2)
{
	igc_t *igc = (igc_t *)arg1;
	uint32_t cause;

	cause = igc_read32(igc, IGC_ICR);

	igc_tx_recycle(igc, &igc->igc_tx_rings[0]);

	if ((cause & IGC_ICR_LSC) == 0) {
		return (DDI_INTR_CLAIMED);
	}

	igc_intr_lsc(igc);
	return (DDI_INTR_CLAIMED);
}

/*
 * Set up access to the device: map PCI configuration space, look up the size
 * of the register BAR, and map that BAR in its entirety for little-endian,
 * strictly ordered access. The results are stored in the igc_t (igc_cfgspace,
 * igc_regs_size, igc_regs_base, and igc_regs_hdl).
 *
 * Returns true on success. On failure a warning is logged and false is
 * returned; anything that was set up before the failing step is left in place.
 * NOTE(review): presumably the caller is responsible for tearing that down --
 * confirm against the attach/cleanup path.
 */
static bool
igc_setup_regs(igc_t *igc)
{
	ddi_device_acc_attr_t attr;
	int rc;

	if (pci_config_setup(igc->igc_dip, &igc->igc_cfgspace) != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to map config space");
		return (false);
	}

	rc = ddi_dev_regsize(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_size);
	if (rc != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to get BAR %u size",
		    IGC_PCI_BAR - 1);
		return (false);
	}

	bzero(&attr, sizeof (attr));
	attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
	attr.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC;
	attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	attr.devacc_attr_access = DDI_DEFAULT_ACC;

	rc = ddi_regs_map_setup(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_base,
	    0, igc->igc_regs_size, &attr, &igc->igc_regs_hdl);
	if (rc != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to map registers: %d",
		    rc);
		return (false);
	}

	return (true);
}

/*
 * Go through the process of initializing the igc core code. First we fill in
 * the identifying information that the common code requires (PCI IDs, the PCI
 * function, the command word, and the register base address) and have it
 * determine the mac type. After that we can set up all of the function
 * pointers and ask it for bus information. Returns false after logging a
 * warning if any step fails.
 */
static bool
igc_core_code_init(igc_t *igc)
{
	struct igc_hw *hw = &igc->igc_hw;
	int ret;
	int *regs;
	uint_t nregs;

	hw->back = igc;
	hw->vendor_id = pci_config_get16(igc->igc_cfgspace, PCI_CONF_VENID);
	hw->device_id = pci_config_get16(igc->igc_cfgspace, PCI_CONF_DEVID);
	hw->revision_id = pci_config_get8(igc->igc_cfgspace, PCI_CONF_REVID);
	hw->subsystem_vendor_id = pci_config_get16(igc->igc_cfgspace,
	    PCI_CONF_SUBVENID);
	hw->subsystem_device_id = pci_config_get16(igc->igc_cfgspace,
	    PCI_CONF_SUBSYSID);

	ret = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, igc->igc_dip,
	    DDI_PROP_DONTPASS, "reg", &regs, &nregs);
	if (ret != DDI_PROP_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to look up 'reg' "
		    "property: %d", ret);
		return (false);
	}

	/*
	 * Only the function and command word are recorded. The bus type,
	 * speed, and width are left at unknown because the common code does
	 * not use them today; they can be gathered if that ever changes. The
	 * function comes from the first 'reg' entry, after which the property
	 * data is no longer needed.
	 */
	hw->bus.func = PCI_REG_FUNC_G(regs[0]);
	hw->bus.pci_cmd_word = pci_config_get16(igc->igc_cfgspace,
	    PCI_CONF_COMM);
	ddi_prop_free(regs);

	/*
	 * The common code asks for the memory mapped address to be set in its
	 * structure. Though in theory it promises not to use it.
	 */
	hw->hw_addr = (uint8_t *)igc->igc_regs_base;

	ret = igc_set_mac_type(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to set mac type: %d",
		    ret);
		return (false);
	}

	ret = igc_setup_init_funcs(hw, true);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to setup core code "
		    "function pointers: %d", ret);
		return (false);
	}

	/*
	 * Go ahead and attempt to get the bus information even though this
	 * doesn't actually do anything right now.
	 */
	ret = igc_get_bus_info(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "core code failed to get bus "
		    "info: %d", ret);
		return (false);
	}

	return (true);
}

/*
 * Record the per-MAC limits (maximum rx/tx ring counts and maximum MTU) for
 * the MAC type that the common code identified. Only igc_i225 is known; any
 * other type is logged and rejected.
 */
static bool
igc_limits_init(igc_t *igc)
{
	if (igc->igc_hw.mac.type != igc_i225) {
		dev_err(igc->igc_dip, CE_WARN, "unknown MAC type: %u",
		    igc->igc_hw.mac.type);
		return (false);
	}

	igc->igc_limits.il_max_mtu = IGC_MAX_MTU_I225;
	igc->igc_limits.il_max_rx_rings = IGC_MAX_RX_RINGS_I225;
	/*
	 * NOTE(review): the tx limit is taken from the RX constant. Confirm
	 * whether a tx-specific constant exists and should be used instead.
	 */
	igc->igc_limits.il_max_tx_rings = IGC_MAX_RX_RINGS_I225;

	return (true);
}

/*
 * Determine the hardware buffer sizes that are required for the given MTU.
 * There are a few different constraints that we try to enforce here that come
 * from the hardware and others that come from us:
 *
 * 1) The hardware requires that the rx and tx sizes all be 1 KiB (0x400) byte
 * aligned.
 * 2) Our tx engine can handle copying across multiple descriptors, so we cap
 * the maximum tx buffer size at one page.
 * 3) Right now our rx engine does not handle scanning multiple buffers for rx
 * (see the theory statement), so the rx buffer has to fit the maximum frame
 * size.
 * 4) rx buffers need to also account for IP alignment, so we make sure to
 * allocate extra bytes for that.
 *
 * The maximum frame size itself is the MTU plus a VLAN-tagged Ethernet header
 * and the FCS.
 */
void
igc_hw_buf_update(igc_t *igc)
{
	const unsigned long page = ddi_ptob(igc->igc_dip, 1);
	uint32_t tx_len;

	igc->igc_max_frame = igc->igc_mtu + sizeof (struct ether_vlan_header) +
	    ETHERFCSL;

	tx_len = P2ROUNDUP_TYPED(igc->igc_max_frame, IGC_BUF_ALIGN, uint32_t);
	igc->igc_tx_buf_size = MIN(tx_len, page);

	igc->igc_rx_buf_size = P2ROUNDUP_TYPED(igc->igc_max_frame +
	    IGC_RX_BUF_IP_ALIGN, IGC_BUF_ALIGN, uint32_t);
}

/*
 * Allocate the device's interrupts. Only MSI-X is supported and exactly two
 * vectors are requested. On success the handle array, the number of vectors
 * actually granted, the interrupt priority, and the interrupt capabilities are
 * all recorded in the igc_t. Returns false after logging a warning on failure;
 * the handle array and IGC_ATTACH_INTR_ALLOC are left in place for
 * igc_cleanup() to deal with.
 */
static bool
igc_intr_init(igc_t *igc)
{
	const int min_nintrs = 2;
	dev_info_t *dip = igc->igc_dip;
	int ret, supported, nintrs, navail, req;

	ret = ddi_intr_get_supported_types(dip, &supported);
	if (ret != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to get supported "
		    "interrupts: %d", ret);
		return (false);
	}

	/*
	 * For now, we simplify our lives and device support by only supporting
	 * MSI-X interrupts. When we find versions of this without MSI-X
	 * support, we can go and add what we need.
	 */
	if ((supported & DDI_INTR_TYPE_MSIX) == 0) {
		dev_err(dip, CE_WARN, "device does not support MSI-X, "
		    "found %d", supported);
		return (false);
	}

	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
	if (ret != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to get number of "
		    "supported MSI-X interrupts: %d", ret);
		return (false);
	}

	if (nintrs < min_nintrs) {
		dev_err(dip, CE_WARN, "igc driver currently requires "
		    "%d MSI-X interrupts be supported, found %d", min_nintrs,
		    nintrs);
		return (false);
	}

	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
	if (ret != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to get number of "
		    "available MSI-X interrupts: %d", ret);
		return (false);
	}

	if (navail < min_nintrs) {
		dev_err(dip, CE_WARN, "igc driver currently requires "
		    "%d MSI-X interrupts be available, found %d", min_nintrs,
		    navail);
		return (false);
	}

	/*
	 * In the future this could be based upon the multiple queues that the
	 * device supports, but for now it's limited to two. See 'Rings and
	 * Interrupts' in the theory statement for more background.
	 */
	req = MIN(min_nintrs, navail);
	igc->igc_intr_size = req * sizeof (ddi_intr_handle_t);
	igc->igc_intr_handles = kmem_alloc(igc->igc_intr_size, KM_SLEEP);

	ret = ddi_intr_alloc(dip, igc->igc_intr_handles, DDI_INTR_TYPE_MSIX, 0,
	    req, &igc->igc_nintrs, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to allocate interrupts: "
		    "%d", ret);
		return (false);
	}

	/*
	 * From this point on there are allocated vectors that must be freed,
	 * even if we were granted fewer than we need.
	 */
	igc->igc_intr_type = DDI_INTR_TYPE_MSIX;
	igc->igc_attach |= IGC_ATTACH_INTR_ALLOC;
	if (igc->igc_nintrs < min_nintrs) {
		dev_err(dip, CE_WARN, "received %d interrupts, but "
		    "needed at least %d", igc->igc_nintrs, min_nintrs);
		return (false);
	}

	ret = ddi_intr_get_pri(igc->igc_intr_handles[0], &igc->igc_intr_pri);
	if (ret != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to get interrupt "
		    "priority: %d", ret);
		return (false);
	}

	ret = ddi_intr_get_cap(igc->igc_intr_handles[0], &igc->igc_intr_cap);
	if (ret != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to get interrupt "
		    "capabilities: %d", ret);
		return (false);
	}

	return (true);
}

/*
 * Allocate and initialize the software state for every tx and rx ring. As part
 * of this we make the following assumptions about interrupt assignments: all
 * tx rings share interrupt 0 and all rx rings have separate interrupts
 * starting from interrupt 1. This design may likely change in the face of
 * actual multi-ring support.
 *
 * Returns false if a ring's statistics cannot be initialized. Nothing is
 * undone here on failure.
 */
static bool
igc_rings_alloc(igc_t *igc)
{
	void *ipri = DDI_INTR_PRI(igc->igc_intr_pri);
	uint32_t next_intr;

	igc->igc_tx_rings = kmem_zalloc(sizeof (igc_tx_ring_t) *
	    igc->igc_ntx_rings, KM_SLEEP);

	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
		igc_tx_ring_t *txr = &igc->igc_tx_rings[i];

		txr->itr_igc = igc;
		txr->itr_idx = i;
		txr->itr_intr_idx = 0;
		mutex_init(&txr->itr_lock, NULL, MUTEX_DRIVER, ipri);
		if (!igc_tx_ring_stats_init(igc, txr)) {
			return (false);
		}
	}

	igc->igc_rx_rings = kmem_zalloc(sizeof (igc_rx_ring_t) *
	    igc->igc_nrx_rings, KM_SLEEP);
	next_intr = 1;

	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
		igc_rx_ring_t *rxr = &igc->igc_rx_rings[i];

		rxr->irr_igc = igc;
		rxr->irr_idx = i;
		rxr->irr_intr_idx = next_intr;
		mutex_init(&rxr->irr_lock, NULL, MUTEX_DRIVER, ipri);
		mutex_init(&rxr->irr_free_lock, NULL, MUTEX_DRIVER, ipri);
		cv_init(&rxr->irr_free_cv, NULL, CV_DRIVER, NULL);
		if (!igc_rx_ring_stats_init(igc, rxr)) {
			return (false);
		}
		next_intr++;
	}

	/* Every allocated interrupt should now have been handed out. */
	ASSERT3U(next_intr, ==, igc->igc_nintrs);

	return (true);
}

/*
 * Register our interrupt handlers. Note, we have more or less constrained the
 * device right now to only request two interrupts which we use in a fixed way:
 * vector 0 is the tx/other handler and vector 1 services rx queue 0. If we end
 * up with more varied queue support then this should be changed around.
 *
 * If the second registration fails, the first handler is removed again so the
 * caller sees either both handlers installed or neither.
 */
static bool
igc_intr_hdlr_init(igc_t *igc)
{
	int ret;

	ret = ddi_intr_add_handler(igc->igc_intr_handles[0], igc_intr_tx_other,
	    igc, NULL);
	if (ret != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to add tx/other "
		    "interrupt handler: %d", ret);
		return (false);
	}

	ret = ddi_intr_add_handler(igc->igc_intr_handles[1], igc_intr_rx_queue,
	    igc, (uintptr_t)0);
	if (ret == DDI_SUCCESS) {
		return (true);
	}

	dev_err(igc->igc_dip, CE_WARN, "failed to add rx interrupt "
	    "handler: %d", ret);
	if (ddi_intr_remove_handler(igc->igc_intr_handles[0]) != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to remove "
		    "tx/other interrupt handler");
	}

	return (false);
}

/*
 * Set (take == true) or clear (take == false) the DRV_LOAD bit in CTRL_EXT
 * with a read-modify-write. The callers in this file use this to claim the
 * device from firmware and to hand it back.
 */
static void
igc_hw_control(igc_t *igc, bool take)
{
	const uint32_t cur = igc_read32(igc, IGC_CTRL_EXT);
	const uint32_t next = take ? (cur | IGC_CTRL_EXT_DRV_LOAD) :
	    (cur & ~IGC_CTRL_EXT_DRV_LOAD);

	igc_write32(igc, IGC_CTRL_EXT, next);
}

/*
 * Basic device initialization and sanity check. This covers that we can
 * properly reset the device, validate its NVM checksum, get a valid MAC
 * address, and read the PHY id. Returns false after logging a warning if any
 * of those steps fails.
 */
static bool
igc_hw_init(igc_t *igc)
{
	struct igc_hw *hw = &igc->igc_hw;
	int ret;

	ret = igc_reset_hw(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d",
		    ret);
		return (false);
	}

	/*
	 * Goodbye firmware.
	 */
	igc_hw_control(igc, true);

	/*
	 * Check the NVM validity if a device is present.
	 */
	if ((igc_read32(igc, IGC_EECD) & IGC_EECD_EE_DET) != 0) {
		ret = igc_validate_nvm_checksum(hw);
		if (ret != IGC_SUCCESS) {
			dev_err(igc->igc_dip, CE_WARN, "failed to validate "
			    "igc NVM checksum: %d", ret);
			return (false);
		}
	}

	ret = igc_read_mac_addr(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to read MAC address: %d",
		    ret);
		return (false);
	}

	ret = igc_get_phy_id(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to get PHY id: %d", ret);
		return (false);
	}

	return (true);
}

/*
 * In case the user has modified the LED state through MAC_CAPAB_LED, put the
 * LEDCTL register back to the value that igc_led_init() snapshotted when we
 * started up the device.
 */
static void
igc_led_fini(igc_t *igc)
{
	const uint32_t saved = igc->igc_ledctl;

	igc_write32(igc, IGC_LEDCTL, saved);
}

/*
 * Decide whether an LED in the given mode should be left alone. Traditionally
 * the Intel NIC drivers avoid touching activity pins as part of their behavior
 * for what we use. We also don't touch a pin if it's in SDP mode and not being
 * used to drive an LED as it means it's likely not for us. Returns true for
 * the modes that must not be modified.
 */
static bool
igc_led_ignore(i225_led_mode_t mode)
{
	return (mode == I225_LED_M_FILTER_ACT ||
	    mode == I225_LED_M_LINK_ACT ||
	    mode == I225_LED_M_SDP ||
	    mode == I225_LED_M_PAUSE ||
	    mode == I225_LED_M_ACT);
}

/*
 * Return the bit offset at which the fields for the given LED start in an
 * LEDCTL value: eight bits per LED. The LED index must be less than 3.
 */
static inline uint32_t
igc_led_bitoff(uint32_t led)
{
	VERIFY3U(led, <, 3);
	return (led << 3);
}

/*
 * Extract the four-bit mode field for the given LED from an LEDCTL value.
 */
static inline uint32_t
igc_led_get_mode(uint32_t led, uint32_t reg)
{
	const uint32_t lo = igc_led_bitoff(led);
	const uint32_t hi = lo + 3;

	return (bitx32(reg, hi, lo));
}

/*
 * Return a copy of the LEDCTL value 'reg' with the four-bit mode field for the
 * given LED replaced by 'mode'.
 */
static inline uint32_t
igc_led_set_mode(uint32_t led, uint32_t reg, i225_led_mode_t mode)
{
	const uint32_t lo = igc_led_bitoff(led);
	const uint32_t hi = lo + 3;

	return (bitset32(reg, hi, lo, mode));
}

/*
 * Extract the single inversion bit (bit 6 of the LED's field group) for the
 * given LED from an LEDCTL value.
 */
static inline uint32_t
igc_led_get_ivrt(uint32_t led, uint32_t reg)
{
	const uint32_t bit = 6 + igc_led_bitoff(led);

	return (bitx32(reg, bit, bit));
}

/*
 * Return a copy of the LEDCTL value 'reg' with the blink bit (bit 7 of the
 * LED's field group) for the given LED set to 'en'.
 */
static inline uint32_t
igc_led_set_blink(uint32_t led, uint32_t reg, bool en)
{
	const uint32_t bit = 7 + igc_led_bitoff(led);

	return (bitset32(reg, bit, bit, en));
}

/*
 * There are three LEDs on the chip. The reference defines LED0 for 1 GbE link
 * up, LED1 for a 2.5GbE link up, and LED 2 for activity. However, this is all
 * controllable in the NVM so we shouldn't assume that these have any of their
 * default values. We instead read the LEDCTL register to see how it was set up
 * by default (though the NVM would likely be better). We then create pre-canned
 * LEDCTL register values for on, off, blink, and default. See igc_osdep.h for
 * some of the caveats in definitions here. Note, we only change the mode of
 * the non-activity LEDs and if an LED has been indicated that it's being used
 * for SDP, we don't change its mode either. The blink bit, however, is set in
 * the blink value for every LED.
 */
static void
igc_led_init(igc_t *igc)
{
	const uint32_t led = igc_read32(igc, IGC_LEDCTL);
	uint32_t on = led, off = led, blink = led;

	for (uint32_t i = 0; i < IGC_I225_NLEDS; i++) {
		i225_led_mode_t mode = igc_led_get_mode(i, led);

		if (!igc_led_ignore(mode)) {
			/*
			 * If the inversion logic is on, that changes what the
			 * on and off modes mean, so the mode that lights the
			 * LED and the one that darkens it trade places.
			 */
			const bool inverted = igc_led_get_ivrt(i, led) != 0;
			const i225_led_mode_t lit = inverted ?
			    I225_LED_M_OFF : I225_LED_M_ON;
			const i225_led_mode_t dark = inverted ?
			    I225_LED_M_ON : I225_LED_M_OFF;

			on = igc_led_set_mode(i, on, lit);
			off = igc_led_set_mode(i, off, dark);
			blink = igc_led_set_mode(i, blink, lit);
		}

		blink = igc_led_set_blink(i, blink, true);
	}

	igc->igc_ledctl = led;
	igc->igc_ledctl_on = on;
	igc->igc_ledctl_off = off;
	igc->igc_ledctl_blink = blink;
	igc->igc_led_mode = MAC_LED_DEFAULT;
}

/*
 * Map a single rx or tx queue to the MSI-X vector 'msix'. Each IVAR register
 * holds the entries for two queues, so the register is chosen by queue / 2 and
 * the entry within it by whether the queue is even or odd and whether this is
 * for rx or tx. The entry is updated with a read-modify-write and the vector's
 * bit is added to our cached igc_eims value.
 */
static void
igc_write_ivar(igc_t *igc, uint32_t queue, bool rx, uint32_t msix)
{
	const bool odd = (queue % 2) != 0;
	const uint32_t regoff = IGC_IVAR0 + (queue >> 1) * 4;
	const uint32_t entry = msix | IGC_IVAR_VALID;
	uint32_t lo, hi, cur;

	if (rx) {
		lo = odd ? IGC_IVAR_RX1_START : IGC_IVAR_RX0_START;
	} else {
		lo = odd ? IGC_IVAR_TX1_START : IGC_IVAR_TX0_START;
	}
	hi = lo + IGC_IVAR_ENT_LEN - 1;

	cur = igc_read32(igc, regoff);
	igc_write32(igc, regoff, bitset32(cur, hi, lo, entry));
	igc->igc_eims |= 1 << msix;
}

/*
 * Here we need to go through and initialize the hardware's notion of how
 * interrupts are mapped to causes. The device must be specifically enabled for
 * MSI-X and then this is also where we go ensure that all of our interrupt
 * coalescing is properly enabled. Note, we must first touch the GPIE register
 * to enable MSI-X settings otherwise later settings won't do anything.
 */
static void
igc_hw_intr_init(igc_t *igc)
{
	uint32_t gpie_val, misc_val;

	gpie_val = IGC_GPIE_NSICR | IGC_GPIE_MSIX_MODE | IGC_GPIE_EIAME |
	    IGC_GPIE_PBA;
	igc_write32(igc, IGC_GPIE, gpie_val);

	/*
	 * Other causes are always explicitly mapped to cause 0. Each ring then
	 * has its own mapping. In the MISC IVAR, these start at bit 8. We leave
	 * the '0 |' out below just to avoid a compiler complaining. We also
	 * must unmask this interrupt cause, which is in bit 0; that is why the
	 * cached mask starts out as 1.
	 */
	misc_val = IGC_IVAR_VALID << 8;
	igc_write32(igc, IGC_IVAR_MISC, misc_val);
	igc->igc_eims = 1;

	/*
	 * There are a few IVAR registers available in hardware. Each IVAR
	 * register handles mapping a given queue to an MSI-X. Each IVAR handles
	 * two queues. All tx queues are mapped first, followed by all rx
	 * queues.
	 */
	for (uint32_t txq = 0; txq < igc->igc_ntx_rings; txq++) {
		const igc_tx_ring_t *txr = &igc->igc_tx_rings[txq];

		igc_write_ivar(igc, txq, false, txr->itr_intr_idx);
	}

	for (uint32_t rxq = 0; rxq < igc->igc_nrx_rings; rxq++) {
		const igc_rx_ring_t *rxr = &igc->igc_rx_rings[rxq];

		igc_write_ivar(igc, rxq, true, rxr->irr_intr_idx);
	}

	/* Program the same throttling value into every vector's EITR. */
	for (uint32_t vec = 0; vec < igc->igc_nintrs; vec++) {
		igc_write32(igc, IGC_EITR(vec), igc->igc_eitr);
	}
}

/*
 * Synchronize our sense of the unicast table over to the device. If this is the
 * first time that we're here due to attach, we need to go through and allocate
 * the tracking table, which is sized by the MAC's rar_entry_count. Every entry
 * in the table is then written to its matching receive address slot. The
 * caller must hold igc_lock.
 */
static void
igc_unicast_sync(igc_t *igc)
{
	struct igc_hw *hw = &igc->igc_hw;

	ASSERT(MUTEX_HELD(&igc->igc_lock));

	if (igc->igc_ucast == NULL) {
		igc->igc_nucast = hw->mac.rar_entry_count;
		igc->igc_ucast = kmem_zalloc(sizeof (igc_addr_t) *
		    igc->igc_nucast, KM_SLEEP);
	}

	for (uint16_t slot = 0; slot < igc->igc_nucast; slot++) {
		int ret;

		ret = igc_rar_set(hw, igc->igc_ucast[slot].ia_mac, slot);
		/*
		 * Common code today guarantees this can't fail. Verify it
		 * anyway to guard against future updates.
		 */
		VERIFY3S(ret, ==, IGC_SUCCESS);
	}
}

/*
 * The core code interface to the multicast table requires us to give them a
 * packed uint8_t array that they manually walk through in ETHERADDRL (6 byte)
 * chunks. This must be packed. To deal with this we opt to preserve a normal
 * list of multicast addresses and then a secondary version that's serialized as
 * the core code wants it. We allocate the memory for both versions the first
 * time we're called, sized by the MAC's mta_reg_count.
 *
 * On each call the packed copy is rebuilt from the entries that are marked
 * valid and handed to the core code. The caller must hold igc_lock.
 */
void
igc_multicast_sync(igc_t *igc)
{
	uint16_t npacked = 0;

	ASSERT(MUTEX_HELD(&igc->igc_lock));

	if (igc->igc_mcast == NULL) {
		igc->igc_nmcast = igc->igc_hw.mac.mta_reg_count;
		igc->igc_mcast = kmem_zalloc(sizeof (igc_addr_t) *
		    igc->igc_nmcast, KM_SLEEP);
		igc->igc_mcast_raw = kmem_alloc(sizeof (ether_addr_t) *
		    igc->igc_nmcast, KM_SLEEP);
	}

	bzero(igc->igc_mcast_raw, sizeof (ether_addr_t) * igc->igc_nmcast);
	for (uint16_t i = 0; i < igc->igc_nmcast; i++) {
		if (igc->igc_mcast[i].ia_valid) {
			bcopy(igc->igc_mcast[i].ia_mac,
			    &igc->igc_mcast_raw[npacked],
			    sizeof (ether_addr_t));
			npacked++;
		}
	}

	igc_update_mc_addr_list(&igc->igc_hw, (uint8_t *)igc->igc_mcast_raw,
	    npacked);
}

/*
 * This function is used to reinitialize the PBA, our various flow control
 * settings, reset hardware, ensure that the EEE, LPLU, and related power modes
 * are in the correct state. It then reprograms the interrupt mappings, pushes
 * the unicast and multicast tables back to the device, and kicks off a link
 * check. Returns false after logging a warning if a core code step fails.
 */
bool
igc_hw_common_init(igc_t *igc)
{
	struct igc_hw *hw = &igc->igc_hw;
	int ret;
	uint32_t pba_bytes, hwm_pct, hwm_frames;

	/*
	 * The PBA register determines which portion is used for the receive
	 * buffers and which is used for the transmit buffers. This follows from
	 * the I210 and reference drivers which use 34K as the default. We
	 * currently leave the RXPBS and TXPBS at their power-on-reset defaults.
	 *
	 * We set the watermark based settings similar to igb, ensuring that we
	 * have 16-byte granularity. The general guidelines from there was that
	 * when it comes to automatic Ethernet PAUSE frame generation we should:
	 *
	 * - After an XOFF, you want to receive at least two frames. We use
	 *   whichever is smaller of 9/10ths and two frames.
	 * - The low water mark apparently wants to be closer to the high water
	 *   mark.
	 *
	 * See igb_init_adapter() for more information. We basically use the
	 * same calculation it did, given that the MAC is basically the same.
	 */
	pba_bytes = IGC_PBA_34K;
	pba_bytes <<= 10;
	hwm_pct = pba_bytes * 9 / 10;
	hwm_frames = pba_bytes - 2 * igc->igc_max_frame;

	hw->fc.high_water = MIN(hwm_pct, hwm_frames) & 0xfffffff0;
	hw->fc.low_water = hw->fc.high_water - 16;

	/*
	 * Use the suggested default pause time.
	 */
	hw->fc.pause_time = IGC_FC_PAUSE_TIME;
	hw->fc.send_xon = true;

	ret = igc_reset_hw(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d",
		    ret);
		return (false);
	}

	ret = igc_init_hw(hw);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to init hardware: %d",
		    ret);
		return (false);
	}

	/*
	 * Clear wake on LAN and set other power states. In addition, disable
	 * EEE for now.
	 */
	igc_write32(igc, IGC_WUC, 0);

	ret = igc_set_d0_lplu_state(hw, false);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to set D0 LPLU mode: %d",
		    ret);
		return (false);
	}

	/*
	 * There have been reports that enabling EEE for some 2.5G devices has
	 * led to issues with the I225/226. It's not entirely clear, but we
	 * default to disabling this like in igb/e1000g for now.
	 */
	ret = igc_set_eee_i225(hw, false, false, false);
	if (ret != IGC_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "failed to set EEE mode: %d",
		    ret);
		return (false);
	}

	igc_hw_intr_init(igc);

	/*
	 * The address tables and link state are updated under igc_lock. The
	 * PHY info and link check results are deliberately ignored here.
	 */
	mutex_enter(&igc->igc_lock);
	igc_unicast_sync(igc);
	igc_multicast_sync(igc);

	hw->mac.get_link_status = true;
	(void) igc_get_phy_info(hw);
	(void) igc_check_for_link(hw);
	mutex_exit(&igc->igc_lock);

	return (true);
}

/*
 * Enable all of our allocated interrupts at the DDI level. If the interrupts
 * support block operations they are enabled as a group; otherwise they are
 * enabled one at a time and, should one fail, every interrupt that was already
 * enabled is disabled again before returning false. On success any pending
 * interrupt cause is cleared by reading ICR.
 */
static bool
igc_intr_en(igc_t *igc)
{
	int ret;

	if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
		ret = ddi_intr_block_enable(igc->igc_intr_handles,
		    igc->igc_nintrs);
		if (ret != DDI_SUCCESS) {
			dev_err(igc->igc_dip, CE_WARN, "failed to block "
			    "enable interrupts: %d", ret);
			return (false);
		}
	} else {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			ret = ddi_intr_enable(igc->igc_intr_handles[i]);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "enable interrupt %d: %d", i, ret);
				/*
				 * Unwind the interrupts enabled so far. The
				 * warning names the interrupt that failed to
				 * disable (clean), not the one that failed to
				 * enable (i).
				 */
				for (int clean = 0; clean < i; clean++) {
					ret = ddi_intr_disable(
					    igc->igc_intr_handles[clean]);
					if (ret != DDI_SUCCESS) {
						dev_err(igc->igc_dip, CE_WARN,
						    "failed to disable "
						    "interrupt %d while "
						    "unwinding: %d", clean,
						    ret);
					}
				}
				return (false);
			}
		}
	}

	/*
	 * Now that we've enabled interrupts here, clear any pending interrupts
	 * by reading ICR.
	 */
	(void) igc_read32(igc, IGC_ICR);

	return (true);
}

/*
 * Undo interrupt enablement on the device itself, the counterpart of
 * igc_hw_intr_enable(). This writes all ones to EIMC and IMC and zeroes EIAC
 * (presumably masking every extended and non-extended cause and turning off
 * auto-clear -- confirm against the datasheet). Only device registers are
 * touched; the DDI interrupt handles are left alone.
 */
void
igc_hw_intr_disable(igc_t *igc)
{
	igc_write32(igc, IGC_EIMC, UINT32_MAX);
	igc_write32(igc, IGC_EIAC, 0);
	igc_write32(igc, IGC_IMC, UINT32_MAX);
}

/*
 * This is used during the GLDv3 mc_start(9E) entry point to enable interrupts
 * on the device itself. The extended mask and auto-clear registers are
 * programmed from igc_eims, which igc_hw_intr_init() and igc_write_ivar()
 * build up.
 */
void
igc_hw_intr_enable(igc_t *igc)
{
	uint32_t ims;

	/*
	 * First we clear pending interrupts.
	 */
	(void) igc_read32(igc, IGC_ICR);

	/*
	 * The hardware has extended and non-extended interrupt masks and
	 * auto-clear registers. We always disable auto-clear for the
	 * non-extended portions. See the I210 datasheet 'Setting Interrupt
	 * Registers' for a better sense of what's going on here.
	 *
	 * In the IMS register we always register link status change events and
	 * device reset assertions.
	 */
	ims = IGC_IMS_LSC | IGC_IMS_DRSTA;

	/*
	 * Every vector we use gets both auto-clear and unmasked in the
	 * extended registers; IAM is zeroed for the non-extended causes.
	 */
	igc_write32(igc, IGC_EIAC, igc->igc_eims);
	igc_write32(igc, IGC_EIMS, igc->igc_eims);
	igc_write32(igc, IGC_IMS, ims);
	igc_write32(igc, IGC_IAM, 0);
}

/*
 * Tear down everything that igc_attach() may have set up and free the igc_t.
 * This is used both for a failed attach and for detach, so every step is
 * conditional on either an IGC_ATTACH_* flag or a non-NULL pointer and must
 * tolerate a device that was only partially initialized. In particular,
 * device registers may only be touched while the register mapping exists.
 * Failures while tearing down are logged but do not stop the cleanup.
 */
static void
igc_cleanup(igc_t *igc)
{
	if (igc->igc_mcast != NULL) {
		ASSERT3U(igc->igc_nmcast, !=, 0);
		kmem_free(igc->igc_mcast_raw, sizeof (ether_addr_t) *
		    igc->igc_nmcast);
		kmem_free(igc->igc_mcast, sizeof (igc_addr_t) *
		    igc->igc_nmcast);
		igc->igc_nmcast = 0;
		igc->igc_mcast = NULL;
		igc->igc_mcast_raw = NULL;
	}

	if (igc->igc_ucast != NULL) {
		ASSERT3U(igc->igc_nucast, !=, 0);
		kmem_free(igc->igc_ucast, sizeof (igc_addr_t) *
		    igc->igc_nucast);
		igc->igc_nucast = 0;
		igc->igc_ucast = NULL;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_EN) != 0) {
		int ret;
		if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
			ret = ddi_intr_block_disable(igc->igc_intr_handles,
			    igc->igc_nintrs);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "block disable interrupts: %d", ret);
			}
		} else {
			for (int i = 0; i < igc->igc_nintrs; i++) {
				ret = ddi_intr_disable(
				    igc->igc_intr_handles[i]);
				if (ret != DDI_SUCCESS) {
					dev_err(igc->igc_dip, CE_WARN, "failed "
					    "to disable interrupt %d: %d", i,
					    ret);
				}
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_EN;
	}

	if ((igc->igc_attach & IGC_ATTACH_MAC) != 0) {
		int ret = mac_unregister(igc->igc_mac_hdl);
		if (ret != 0) {
			dev_err(igc->igc_dip, CE_WARN, "failed to unregister "
			    "MAC handle: %d", ret);
		}
		igc->igc_attach &= ~IGC_ATTACH_MAC;
	}

	if ((igc->igc_attach & IGC_ATTACH_STATS) != 0) {
		igc_stats_fini(igc);
		igc->igc_attach &= ~IGC_ATTACH_STATS;
	}

	if ((igc->igc_attach & IGC_ATTACH_LED) != 0) {
		igc_led_fini(igc);
		igc->igc_attach &= ~IGC_ATTACH_LED;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_HANDLER) != 0) {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			int ret =
			    ddi_intr_remove_handler(igc->igc_intr_handles[i]);
			if (ret != 0) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "remove interrupt %d handler: %d", i, ret);
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_HANDLER;
	}

	if (igc->igc_tx_rings != NULL) {
		for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
			igc_tx_ring_stats_fini(&igc->igc_tx_rings[i]);
			mutex_destroy(&igc->igc_tx_rings[i].itr_lock);
		}
		kmem_free(igc->igc_tx_rings, sizeof (igc_tx_ring_t) *
		    igc->igc_ntx_rings);
		igc->igc_tx_rings = NULL;
	}

	if (igc->igc_rx_rings != NULL) {
		for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
			igc_rx_ring_stats_fini(&igc->igc_rx_rings[i]);
			cv_destroy(&igc->igc_rx_rings[i].irr_free_cv);
			mutex_destroy(&igc->igc_rx_rings[i].irr_free_lock);
			mutex_destroy(&igc->igc_rx_rings[i].irr_lock);
		}
		kmem_free(igc->igc_rx_rings, sizeof (igc_rx_ring_t) *
		    igc->igc_nrx_rings);
		igc->igc_rx_rings = NULL;
	}

	if ((igc->igc_attach & IGC_ATTACH_MUTEX) != 0) {
		mutex_destroy(&igc->igc_lock);
		igc->igc_attach &= ~IGC_ATTACH_MUTEX;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_ALLOC) != 0) {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			int ret = ddi_intr_free(igc->igc_intr_handles[i]);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "unexpected "
				    "failure freeing interrupt %d: %d", i, ret);
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_ALLOC;
	}

	if (igc->igc_intr_handles != NULL) {
		ASSERT3U(igc->igc_intr_size, !=, 0);
		kmem_free(igc->igc_intr_handles, igc->igc_intr_size);
		igc->igc_intr_handles = NULL;
	}

	if (igc->igc_regs_hdl != NULL) {
		/*
		 * Now that we're almost done, begrudgingly let firmware know
		 * we're done. This touches CTRL_EXT, so it must only happen
		 * while the registers are mapped: if attach failed before or
		 * during igc_setup_regs() there is no mapping to use.
		 */
		igc_hw_control(igc, false);

		ddi_regs_map_free(&igc->igc_regs_hdl);
		igc->igc_regs_base = NULL;
	}

	if (igc->igc_cfgspace != NULL) {
		pci_config_teardown(&igc->igc_cfgspace);
	}
	igc->igc_attach &= ~IGC_ATTACH_REGS;

	ddi_set_driver_private(igc->igc_dip, NULL);
	igc->igc_dip = NULL;

	/* Every attach stage must have been accounted for above. */
	VERIFY0(igc->igc_attach);

	kmem_free(igc, sizeof (igc_t));
}

/*
 * attach(9E) entry point. Only DDI_ATTACH is supported. This allocates the
 * igc_t, then brings the device up in stages: register mappings, the common
 * code, limits, interrupt allocation, ring allocation, interrupt handlers, the
 * one-time hardware reset, LEDs, the common hardware init, statistics, MAC
 * registration, and finally enabling interrupts. Stages that need explicit
 * teardown record an IGC_ATTACH_* flag once they succeed. On any failure
 * igc_cleanup() undoes whatever was done and DDI_FAILURE is returned.
 */
static int
igc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	igc_t *igc;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	igc = kmem_zalloc(sizeof (igc_t), KM_SLEEP);
	ddi_set_driver_private(dip, igc);
	igc->igc_dip = dip;

	/*
	 * Initialize a few members that are not zero-based.
	 */
	igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
	igc->igc_link_state = LINK_STATE_UNKNOWN;

	/*
	 * Set up all the register spaces that hardware requires.
	 */
	if (!igc_setup_regs(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_REGS;

	/*
	 * Setup the common code.
	 */
	if (!igc_core_code_init(igc)) {
		goto err;
	}

	if (!igc_limits_init(igc)) {
		goto err;
	}

	/*
	 * Go allocate and set up all of our interrupts.
	 */
	if (!igc_intr_init(igc)) {
		goto err;
	}

	/*
	 * Initialize our main mutex for the device now that we have an
	 * interrupt priority.
	 */
	mutex_init(&igc->igc_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(igc->igc_intr_pri));
	igc->igc_attach |= IGC_ATTACH_MUTEX;

	/*
	 * We now want to determine the total number of rx and tx rings that we
	 * have based on our interrupt allocation so we can go through and
	 * perform the rest of the device setup that is required. The various
	 * queues that we have are mapped to a given MSI-X through the IVAR
	 * registers in the device. There is also an IVAR_MISC register that
	 * maps link state change events and other issues up to two vectors.
	 *
	 * There isn't strictly per-queue interrupt generation control. Instead,
	 * when in MSI-X mode, the device has an extended interrupt cause and
	 * mask register. The mask register allows us to mask the five bits
	 * described above.
	 *
	 * Because of all this we end up limiting the number of queues that we
	 * use to 2 for now: 1 for tx and 1 for rx. Interrupt 0 is for tx/other
	 * and 1 for rx.
	 */
	igc->igc_nrx_rings = 1;
	igc->igc_ntx_rings = 1;

	/*
	 * Default to a 1500 byte MTU.
	 */
	igc->igc_mtu = ETHERMTU;
	igc_hw_buf_update(igc);

	/*
	 * Initialize default descriptor limits and thresholds. We allocate 1.5
	 * times the number of rx descriptors so that way we can loan up to
	 * 1/3rd of them. We allocate an even number of tx descriptors.
	 */
	igc->igc_rx_ndesc = IGC_DEF_RX_RING_SIZE;
	igc->igc_tx_ndesc = IGC_DEF_TX_RING_SIZE;
	igc->igc_rx_nbuf = igc->igc_rx_ndesc + (igc->igc_rx_ndesc >> 1);
	igc->igc_tx_nbuf = igc->igc_tx_ndesc;
	igc->igc_rx_nfree = igc->igc_rx_nbuf - igc->igc_rx_ndesc;
	igc->igc_rx_intr_nframes = IGC_DEF_RX_RING_INTR_LIMIT;
	igc->igc_rx_bind_thresh = IGC_DEF_RX_BIND;
	igc->igc_tx_bind_thresh = IGC_DEF_TX_BIND;
	igc->igc_tx_notify_thresh = IGC_DEF_TX_NOTIFY_MIN;
	igc->igc_tx_recycle_thresh = IGC_DEF_TX_RECYCLE_MIN;
	igc->igc_tx_gap = IGC_DEF_TX_GAP;
	igc->igc_eitr = IGC_DEF_EITR;

	if (!igc_rings_alloc(igc)) {
		goto err;
	}

	if (!igc_intr_hdlr_init(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_INTR_HANDLER;

	/*
	 * Next reset the device before we begin initializing anything else. As
	 * part of this, validate the flash checksum if present. This is all
	 * initialization that we would only do once per device. Other
	 * initialization that we want to do after any reset is done is
	 * igc_hw_common_init().
	 */
	if (!igc_hw_init(igc)) {
		goto err;
	}

	igc_led_init(igc);
	igc->igc_attach |= IGC_ATTACH_LED;

	/*
	 * Snapshot our basic settings that users can eventually control in the
	 * device. We start with always enabling auto-negotiation and
	 * advertising the basic supported speeds. The I225v1 does have
	 * substantial problems with enabling 2.5G due to the fact that it
	 * doesn't maintain a proper inter-packet gap. Despite that, we default
	 * to enabling 2.5G for now as it's supposedly not broken with all link
	 * partners and the NVM. We also don't have a way of actually
	 * identifying and mapping that to something in the driver today,
	 * unfortunately.
	 */
	igc->igc_hw.mac.autoneg = true;
	igc->igc_hw.phy.autoneg_wait_to_complete = false;
	igc->igc_hw.phy.autoneg_advertised = IGC_DEFAULT_ADV;
	igc->igc_hw.fc.requested_mode = igc_fc_default;
	igc->igc_hw.fc.current_mode = igc_fc_default;

	if (!igc_hw_common_init(igc)) {
		goto err;
	}

	if (!igc_stats_init(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_STATS;

	/*
	 * Register with MAC
	 */
	if (!igc_mac_register(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_MAC;

	/*
	 * Enable interrupts and get going.
	 */
	if (!igc_intr_en(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_INTR_EN;

	return (DDI_SUCCESS);

err:
	/* igc_cleanup() also frees the igc_t itself. */
	igc_cleanup(igc);
	return (DDI_FAILURE);
}

/*
 * detach(9E) entry point. Only DDI_DETACH is supported. The igc_t is looked up
 * from the driver private data and handed to igc_cleanup(), which tears down
 * the device and frees it.
 *
 * NOTE(review): igc_cleanup() only logs a warning when mac_unregister() fails
 * and then continues, so this still reports DDI_SUCCESS in that case. Confirm
 * that is the intended behavior.
 */
static int
igc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	igc_t *igc;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	if ((igc = ddi_get_driver_private(dip)) == NULL) {
		dev_err(dip, CE_WARN, "asked to detach, but missing igc_t");
		return (DDI_FAILURE);
	}

	igc_cleanup(igc);

	return (DDI_SUCCESS);
}

/*
 * Character/block entry points. Open and close are no-ops (nulldev), property
 * operations use the generic ddi_prop_op, and every other entry point is
 * rejected with nodev/nochpoll. D_MP marks the driver as MT-safe.
 */
static struct cb_ops igc_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

/*
 * Device operations. Only attach and detach are implemented by this driver;
 * quiesce is explicitly not supported. This structure is also passed to
 * mac_init_ops()/mac_fini_ops() in _init() and _fini().
 */
static struct dev_ops igc_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = igc_attach,
	.devo_detach = igc_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &igc_cb_ops
};

/*
 * Loadable driver linkage. The link info string is the human-readable
 * description of the module; this driver covers the I225 and I226 parts.
 */
static struct modldrv igc_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Intel I225/226 Ethernet Controller",
	.drv_dev_ops = &igc_dev_ops
};

/*
 * Module linkage handed to mod_install(), mod_info(), and mod_remove(). It
 * contains the single driver linkage above, terminated by NULL.
 */
static struct modlinkage igc_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &igc_modldrv, NULL }
};

/*
 * Module load entry point. The dev_ops are first prepared for use with MAC
 * and then the module is installed. If installation fails the MAC preparation
 * is undone. Returns the result of mod_install().
 */
int
_init(void)
{
	int ret;

	mac_init_ops(&igc_dev_ops, IGC_MOD_NAME);

	ret = mod_install(&igc_modlinkage);
	if (ret == 0) {
		return (0);
	}

	mac_fini_ops(&igc_dev_ops);
	return (ret);
}

/*
 * Module information entry point; simply reports on our module linkage.
 */
int
_info(struct modinfo *modinfop)
{
	int ret = mod_info(&igc_modlinkage, modinfop);

	return (ret);
}

/*
 * Module unload entry point. The MAC dev_ops setup done in _init() is only
 * undone once the module has actually been removed. Returns the result of
 * mod_remove().
 */
int
_fini(void)
{
	int ret = mod_remove(&igc_modlinkage);

	if (ret != 0) {
		return (ret);
	}

	mac_fini_ops(&igc_dev_ops);
	return (0);
}