1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip6.h> /* for ip6_t */ 54 #include <inet/tcp.h> /* for tcph_t */ 55 #include <netinet/icmp6.h> /* for icmp6_t */ 56 #include <sys/callb.h> 57 #include <sys/modhash.h> 58 59 #include <sys/ib/clients/ibd/ibd.h> 60 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 61 #include <sys/note.h> 62 #include <sys/multidata.h> 63 64 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 65 66 /* 67 * Per-interface tunables 68 * 69 * ibd_tx_copy_thresh 70 * This sets the threshold at which ibd will attempt to do a bcopy of the 71 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 72 * is restricted by various parameters, so setting of this value must be 73 * made after careful considerations only. For instance, IB HCAs currently 74 * impose a relatively small limit (when compared to ethernet NICs) on the 75 * length of the SGL for transmit. On the other hand, the ip stack could 76 * send down mp chains that are quite long when LSO is enabled. 77 * 78 * ibd_num_swqe 79 * Number of "send WQE" elements that will be allocated and used by ibd. 80 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 81 * buffer in each of these send wqes must be taken into account. This 82 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 83 * currently set to the same value of ibd_tx_copy_thresh, but may be 84 * changed independently if needed). 85 * 86 * ibd_num_rwqe 87 * Number of "receive WQE" elements that will be allocated and used by 88 * ibd. This parameter is limited by the maximum channel size of the HCA. 89 * Each buffer in the receive wqe will be of MTU size. 
90 * 91 * ibd_num_lso_bufs 92 * Number of "larger-than-MTU" copy buffers to use for cases when the 93 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 94 * and too large to be used with regular MTU-sized copy buffers. It is 95 * not recommended to tune this variable without understanding the 96 * application environment and/or memory resources. The size of each of 97 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 98 * 99 * ibd_num_ah 100 * Number of AH cache entries to allocate 101 * 102 * ibd_hash_size 103 * Hash table size for the active AH list 104 * 105 * ibd_separate_cqs 106 * ibd_txcomp_poll 107 * These boolean variables (1 or 0) may be used to tune the behavior of 108 * ibd in managing the send and receive completion queues and in deciding 109 * whether or not transmit completions should be polled or interrupt 110 * driven (when the completion queues are separate). If both the completion 111 * queues are interrupt driven, it may not be possible for the handlers to 112 * be invoked concurrently, depending on how the interrupts are tied on 113 * the PCI intr line. Note that some combination of these two parameters 114 * may not be meaningful (and therefore not allowed). 115 * 116 * ibd_tx_softintr 117 * ibd_rx_softintr 118 * The softintr mechanism allows ibd to avoid event queue overflows if 119 * the receive/completion handlers are to be expensive. These are enabled 120 * by default. 121 * 122 * ibd_log_sz 123 * This specifies the size of the ibd log buffer in bytes. The buffer is 124 * allocated and logging is enabled only when IBD_LOGGING is defined. 125 * 126 */ 127 uint_t ibd_tx_copy_thresh = 0x1000; 128 uint_t ibd_num_swqe = 4000; 129 uint_t ibd_num_rwqe = 4000; 130 uint_t ibd_num_lso_bufs = 0x400; 131 uint_t ibd_num_ah = 64; 132 uint_t ibd_hash_size = 32; 133 uint_t ibd_separate_cqs = 1; 134 uint_t ibd_txcomp_poll = 0; 135 uint_t ibd_rx_softintr = 1; 136 uint_t ibd_tx_softintr = 1; 137 uint_t ibd_create_broadcast_group = 1; 138 uint_t ibd_force_lso_disable = 1; 139 #ifdef IBD_LOGGING 140 uint_t ibd_log_sz = 0x20000; 141 #endif 142 143 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 144 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 145 #define IBD_NUM_SWQE ibd_num_swqe 146 #define IBD_NUM_RWQE ibd_num_rwqe 147 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 148 #define IBD_NUM_AH ibd_num_ah 149 #define IBD_HASH_SIZE ibd_hash_size 150 #ifdef IBD_LOGGING 151 #define IBD_LOG_SZ ibd_log_sz 152 #endif 153 154 /* 155 * Receive CQ moderation parameters: NOT tunables 156 */ 157 static uint_t ibd_rxcomp_count = 4; 158 static uint_t ibd_rxcomp_usec = 10; 159 160 /* 161 * Send CQ moderation parameters: NOT tunables 162 */ 163 #define IBD_TXCOMP_COUNT 10 164 #define IBD_TXCOMP_USEC 300 165 166 /* 167 * Thresholds 168 * 169 * When waiting for resources (swqes or lso buffers) to become available, 170 * the first two thresholds below determine how long to wait before informing 171 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 172 * determines how low the available swqes should go before we start polling 173 * the completion queue. 174 */ 175 #define IBD_FREE_LSOS_THRESH 8 176 #define IBD_FREE_SWQES_THRESH 20 177 #define IBD_TX_POLL_THRESH 80 178 179 /* 180 * When doing multiple-send-wr or multiple-recv-wr posts, this value 181 * determines how many to do at a time (in a single ibt_post_send/recv). 
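 *
 * As a rough sketch only (not the driver's actual post loop; "ud_chan",
 * "wrs" and "npending" are hypothetical names for the UD channel handle,
 * the prepared ibt_send_wr_t array and the number of queued sends), a
 * batched post with the IBTF multi-WR interface looks like:
 *
 *	n = MIN(npending, IBD_MAX_POST_MULTIPLE);
 *	if (ibt_post_send(ud_chan, wrs, n, &posted) != IBT_SUCCESS)
 *		only "posted" WRs were accepted, so the rest are requeued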
182 */ 183 #define IBD_MAX_POST_MULTIPLE 4 184 185 /* 186 * Maximum length for returning chained mps back to crossbow 187 */ 188 #define IBD_MAX_RX_MP_LEN 16 189 190 /* 191 * LSO parameters 192 */ 193 #define IBD_LSO_MAXLEN 65536 194 #define IBD_LSO_BUFSZ 8192 195 #define IBD_PROP_LSO_POLICY "lso-policy" 196 197 /* 198 * Completion queue polling control 199 */ 200 #define IBD_RX_CQ_POLLING 0x1 201 #define IBD_TX_CQ_POLLING 0x2 202 #define IBD_REDO_RX_CQ_POLLING 0x4 203 #define IBD_REDO_TX_CQ_POLLING 0x8 204 205 /* 206 * Flag bits for resources to reap 207 */ 208 #define IBD_RSRC_SWQE 0x1 209 #define IBD_RSRC_LSOBUF 0x2 210 211 /* 212 * Async operation types 213 */ 214 #define IBD_ASYNC_GETAH 1 215 #define IBD_ASYNC_JOIN 2 216 #define IBD_ASYNC_LEAVE 3 217 #define IBD_ASYNC_PROMON 4 218 #define IBD_ASYNC_PROMOFF 5 219 #define IBD_ASYNC_REAP 6 220 #define IBD_ASYNC_TRAP 7 221 #define IBD_ASYNC_SCHED 8 222 #define IBD_ASYNC_LINK 9 223 #define IBD_ASYNC_EXIT 10 224 225 /* 226 * Async operation states 227 */ 228 #define IBD_OP_NOTSTARTED 0 229 #define IBD_OP_ONGOING 1 230 #define IBD_OP_COMPLETED 2 231 #define IBD_OP_ERRORED 3 232 #define IBD_OP_ROUTERED 4 233 234 /* 235 * State of IBD driver initialization during attach/m_start 236 */ 237 #define IBD_DRV_STATE_INITIALIZED 0x00001 238 #define IBD_DRV_RXINTR_ADDED 0x00002 239 #define IBD_DRV_TXINTR_ADDED 0x00004 240 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 241 #define IBD_DRV_HCA_OPENED 0x00010 242 #define IBD_DRV_PD_ALLOCD 0x00020 243 #define IBD_DRV_MAC_REGISTERED 0x00040 244 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 245 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 246 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 247 #define IBD_DRV_CQS_ALLOCD 0x00400 248 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 249 #define IBD_DRV_TXLIST_ALLOCD 0x01000 250 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 251 #define IBD_DRV_RXLIST_ALLOCD 0x04000 252 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 253 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 254 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 255 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 256 #define IBD_DRV_STARTED 0x80000 257 258 /* 259 * Start/stop in-progress flags; note that restart must always remain 260 * the OR of start and stop flag values. 
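 *
 * In other words, the values below keep IBD_DRV_RESTART_IN_PROGRESS equal
 * to (IBD_DRV_START_IN_PROGRESS | IBD_DRV_STOP_IN_PROGRESS), i.e.
 * 0x10000000 | 0x20000000 == 0x30000000, so code that tests for either the
 * start or the stop flag also sees a restart in progress.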
261 */ 262 #define IBD_DRV_START_IN_PROGRESS 0x10000000 263 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 264 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 265 266 /* 267 * Miscellaneous constants 268 */ 269 #define IBD_SEND 0 270 #define IBD_RECV 1 271 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 272 #define IBD_DEF_MAX_SDU 2044 273 #define IBD_DEFAULT_QKEY 0xB1B 274 #ifdef IBD_LOGGING 275 #define IBD_DMAX_LINE 100 276 #endif 277 278 /* 279 * Enumerations for link states 280 */ 281 typedef enum { 282 IBD_LINK_DOWN, 283 IBD_LINK_UP, 284 IBD_LINK_UP_ABSENT 285 } ibd_link_op_t; 286 287 /* 288 * Driver State Pointer 289 */ 290 void *ibd_list; 291 292 /* 293 * Logging 294 */ 295 #ifdef IBD_LOGGING 296 kmutex_t ibd_lbuf_lock; 297 uint8_t *ibd_lbuf; 298 uint32_t ibd_lbuf_ndx; 299 #endif 300 301 /* 302 * Required system entry points 303 */ 304 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 305 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 306 307 /* 308 * Required driver entry points for GLDv3 309 */ 310 static int ibd_m_stat(void *, uint_t, uint64_t *); 311 static int ibd_m_start(void *); 312 static void ibd_m_stop(void *); 313 static int ibd_m_promisc(void *, boolean_t); 314 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 315 static int ibd_m_unicst(void *, const uint8_t *); 316 static mblk_t *ibd_m_tx(void *, mblk_t *); 317 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 318 319 /* 320 * Private driver entry points for GLDv3 321 */ 322 323 /* 324 * Initialization 325 */ 326 static int ibd_state_init(ibd_state_t *, dev_info_t *); 327 static int ibd_init_txlist(ibd_state_t *); 328 static int ibd_init_rxlist(ibd_state_t *); 329 static int ibd_acache_init(ibd_state_t *); 330 #ifdef IBD_LOGGING 331 static void ibd_log_init(void); 332 #endif 333 334 /* 335 * Termination/cleanup 336 */ 337 static void ibd_state_fini(ibd_state_t *); 338 static void ibd_fini_txlist(ibd_state_t *); 339 static void ibd_fini_rxlist(ibd_state_t *); 340 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 341 static void ibd_acache_fini(ibd_state_t *); 342 #ifdef IBD_LOGGING 343 static void ibd_log_fini(void); 344 #endif 345 346 /* 347 * Allocation/acquire/map routines 348 */ 349 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 350 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 351 static int ibd_alloc_tx_copybufs(ibd_state_t *); 352 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 353 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 354 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 355 uint32_t *); 356 357 /* 358 * Free/release/unmap routines 359 */ 360 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 361 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 362 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 363 static void ibd_free_tx_copybufs(ibd_state_t *); 364 static void ibd_free_tx_lsobufs(ibd_state_t *); 365 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 366 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 367 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 368 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 369 370 /* 371 * Handlers/callback routines 372 */ 373 static uint_t ibd_intr(char *); 374 static uint_t ibd_tx_recycle(char *); 375 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 376 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 377 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 378 static uint_t 
ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 379 static void ibd_freemsg_cb(char *); 380 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 381 ibt_async_event_t *); 382 static void ibd_snet_notices_handler(void *, ib_gid_t, 383 ibt_subnet_event_code_t, ibt_subnet_event_t *); 384 385 /* 386 * Send/receive routines 387 */ 388 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 389 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 390 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); 391 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 392 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 393 394 /* 395 * Threads 396 */ 397 static void ibd_async_work(ibd_state_t *); 398 399 /* 400 * Async tasks 401 */ 402 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 403 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 404 static void ibd_async_setprom(ibd_state_t *); 405 static void ibd_async_unsetprom(ibd_state_t *); 406 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 407 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 408 static void ibd_async_txsched(ibd_state_t *); 409 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 410 411 /* 412 * Async task helpers 413 */ 414 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 415 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 416 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 417 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 418 ipoib_mac_t *, ipoib_mac_t *); 419 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 420 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 421 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 422 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 423 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 424 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 425 static uint64_t ibd_get_portspeed(ibd_state_t *); 426 static boolean_t ibd_async_safe(ibd_state_t *); 427 static void ibd_async_done(ibd_state_t *); 428 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 429 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 430 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 431 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 432 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 433 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 434 435 /* 436 * Helpers for attach/start routines 437 */ 438 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 439 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 440 static int ibd_unattach(ibd_state_t *, dev_info_t *); 441 static int ibd_get_port_details(ibd_state_t *); 442 static int ibd_alloc_cqs(ibd_state_t *); 443 static int ibd_setup_ud_channel(ibd_state_t *); 444 static int ibd_start(ibd_state_t *); 445 static int ibd_undo_start(ibd_state_t *, link_state_t); 446 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 447 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 448 449 450 /* 451 * Miscellaneous helpers 452 */ 453 static int ibd_sched_poll(ibd_state_t *, int, int); 454 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 455 static int ibd_resume_transmission(ibd_state_t *); 456 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, 
ibt_ud_dest_hdl_t); 457 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 458 static void *list_get_head(list_t *); 459 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 460 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 461 static void ibd_print_warn(ibd_state_t *, char *, ...); 462 #ifdef IBD_LOGGING 463 static void ibd_log(const char *, ...); 464 #endif 465 466 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 467 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 468 469 /* Module Driver Info */ 470 static struct modldrv ibd_modldrv = { 471 &mod_driverops, /* This one is a driver */ 472 "InfiniBand GLDv3 Driver", /* short description */ 473 &ibd_dev_ops /* driver specific ops */ 474 }; 475 476 /* Module Linkage */ 477 static struct modlinkage ibd_modlinkage = { 478 MODREV_1, (void *)&ibd_modldrv, NULL 479 }; 480 481 /* 482 * Module (static) info passed to IBTL during ibt_attach 483 */ 484 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 485 IBTI_V_CURR, 486 IBT_NETWORK, 487 ibd_async_handler, 488 NULL, 489 "IPIB" 490 }; 491 492 /* 493 * GLDv3 entry points 494 */ 495 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 496 static mac_callbacks_t ibd_m_callbacks = { 497 IBD_M_CALLBACK_FLAGS, 498 ibd_m_stat, 499 ibd_m_start, 500 ibd_m_stop, 501 ibd_m_promisc, 502 ibd_m_multicst, 503 ibd_m_unicst, 504 ibd_m_tx, 505 NULL, 506 ibd_m_getcapab 507 }; 508 509 /* 510 * Fill/clear <scope> and <p_key> in multicast/broadcast address 511 */ 512 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 513 { \ 514 *(uint32_t *)((char *)(maddr) + 4) |= \ 515 htonl((uint32_t)(scope) << 16); \ 516 *(uint32_t *)((char *)(maddr) + 8) |= \ 517 htonl((uint32_t)(pkey) << 16); \ 518 } 519 520 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 521 { \ 522 *(uint32_t *)((char *)(maddr) + 4) &= \ 523 htonl(~((uint32_t)0xF << 16)); \ 524 *(uint32_t *)((char *)(maddr) + 8) &= \ 525 htonl(~((uint32_t)0xFFFF << 16)); \ 526 } 527 528 /* 529 * Rudimentary debugging support 530 */ 531 #ifdef DEBUG 532 int ibd_debuglevel = 100; 533 static void 534 debug_print(int l, char *fmt, ...) 535 { 536 va_list ap; 537 538 if (l < ibd_debuglevel) 539 return; 540 va_start(ap, fmt); 541 vcmn_err(CE_CONT, fmt, ap); 542 va_end(ap); 543 } 544 #define DPRINT debug_print 545 #else 546 #define DPRINT 547 #endif 548 549 /* 550 * Common routine to print warning messages; adds in hca guid, port number 551 * and pkey to be able to identify the IBA interface. 552 */ 553 static void 554 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 555 { 556 ib_guid_t hca_guid; 557 char ibd_print_buf[256]; 558 int len; 559 va_list ap; 560 561 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 562 0, "hca-guid", 0); 563 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 564 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 565 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 566 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 567 va_start(ap, fmt); 568 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 569 fmt, ap); 570 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 571 va_end(ap); 572 } 573 574 /* 575 * Warlock directives 576 */ 577 578 /* 579 * id_lso_lock 580 * 581 * state->id_lso->bkt_nfree may be accessed without a lock to 582 * determine the threshold at which we have to ask the nw layer 583 * to resume transmission (see ibd_resume_transmission()). 
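 *
 * A sketch of the kind of lock-free check this permits (the real logic
 * lives in ibd_resume_transmission(); id_mh is assumed here to be the
 * mac handle registered with GLDv3):
 *
 *	if (state->id_lso != NULL &&
 *	    state->id_lso->bkt_nfree >= IBD_FREE_LSOS_THRESH)
 *		mac_tx_update(state->id_mh);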
584 */ 585 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 586 ibd_state_t::id_lso)) 587 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 588 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 589 590 /* 591 * id_cq_poll_lock 592 */ 593 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 594 ibd_state_t::id_cq_poll_busy)) 595 596 /* 597 * id_txpost_lock 598 */ 599 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 600 ibd_state_t::id_tx_head)) 601 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 602 ibd_state_t::id_tx_busy)) 603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 604 ibd_state_t::id_tx_tailp)) 605 606 /* 607 * id_rxpost_lock 608 */ 609 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 610 ibd_state_t::id_rx_head)) 611 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 612 ibd_state_t::id_rx_busy)) 613 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 614 ibd_state_t::id_rx_tailp)) 615 616 /* 617 * id_acache_req_lock 618 */ 619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 620 ibd_state_t::id_acache_req_cv)) 621 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 622 ibd_state_t::id_req_list)) 623 624 /* 625 * id_ac_mutex 626 * 627 * This mutex is actually supposed to protect id_ah_op as well, 628 * but this path of the code isn't clean (see update of id_ah_op 629 * in ibd_async_acache(), immediately after the call to 630 * ibd_async_mcache()). For now, we'll skip this check by 631 * declaring that id_ah_op is protected by some internal scheme 632 * that warlock isn't aware of. 633 */ 634 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 635 ibd_state_t::id_ah_active)) 636 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 637 ibd_state_t::id_ah_free)) 638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 639 ibd_state_t::id_ah_addr)) 640 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 641 ibd_state_t::id_ah_op)) 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 643 ibd_state_t::id_ah_error)) 644 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 645 646 /* 647 * id_mc_mutex 648 */ 649 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 650 ibd_state_t::id_mc_full)) 651 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 652 ibd_state_t::id_mc_non)) 653 654 /* 655 * id_trap_lock 656 */ 657 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 658 ibd_state_t::id_trap_cv)) 659 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 660 ibd_state_t::id_trap_stop)) 661 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 662 ibd_state_t::id_trap_inprog)) 663 664 /* 665 * id_prom_op 666 */ 667 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 668 ibd_state_t::id_prom_op)) 669 670 /* 671 * id_sched_lock 672 */ 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 674 ibd_state_t::id_sched_needed)) 675 676 /* 677 * id_link_mutex 678 */ 679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 680 ibd_state_t::id_link_state)) 681 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 682 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 683 ibd_state_t::id_link_speed)) 684 685 /* 686 * id_tx_list.dl_mutex 687 */ 688 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 689 ibd_state_t::id_tx_list.dl_head)) 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 691 ibd_state_t::id_tx_list.dl_tail)) 692 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 693 ibd_state_t::id_tx_list.dl_pending_sends)) 694 _NOTE(SCHEME_PROTECTS_DATA("atomic 
or dl mutex or single thr", 695 ibd_state_t::id_tx_list.dl_cnt)) 696 697 /* 698 * id_rx_list.dl_mutex 699 */ 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 701 ibd_state_t::id_rx_list.dl_head)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 703 ibd_state_t::id_rx_list.dl_tail)) 704 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 705 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 706 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 707 ibd_state_t::id_rx_list.dl_cnt)) 708 709 710 /* 711 * Items protected by atomic updates 712 */ 713 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 714 ibd_state_s::id_brd_rcv 715 ibd_state_s::id_brd_xmt 716 ibd_state_s::id_multi_rcv 717 ibd_state_s::id_multi_xmt 718 ibd_state_s::id_num_intrs 719 ibd_state_s::id_rcv_bytes 720 ibd_state_s::id_rcv_pkt 721 ibd_state_s::id_tx_short 722 ibd_state_s::id_xmt_bytes 723 ibd_state_s::id_xmt_pkt)) 724 725 /* 726 * Non-mutex protection schemes for data elements. Almost all of 727 * these are non-shared items. 728 */ 729 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 730 callb_cpr 731 ib_gid_s 732 ib_header_info 733 ibd_acache_rq 734 ibd_acache_s::ac_mce 735 ibd_mcache::mc_fullreap 736 ibd_mcache::mc_jstate 737 ibd_mcache::mc_req 738 ibd_rwqe_s 739 ibd_swqe_s 740 ibd_wqe_s 741 ibt_wr_ds_s::ds_va 742 ibt_wr_lso_s 743 ipoib_mac::ipoib_qpn 744 mac_capab_lso_s 745 msgb::b_next 746 msgb::b_rptr 747 msgb::b_wptr)) 748 749 int 750 _init() 751 { 752 int status; 753 754 /* 755 * Sanity check some parameter settings. Tx completion polling 756 * only makes sense with separate CQs for Tx and Rx. 757 */ 758 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 759 cmn_err(CE_NOTE, "!ibd: %s", 760 "Setting ibd_txcomp_poll = 0 for combined CQ"); 761 ibd_txcomp_poll = 0; 762 } 763 764 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 765 if (status != 0) { 766 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 767 return (status); 768 } 769 770 mac_init_ops(&ibd_dev_ops, "ibd"); 771 status = mod_install(&ibd_modlinkage); 772 if (status != 0) { 773 DPRINT(10, "_init:failed in mod_install()"); 774 ddi_soft_state_fini(&ibd_list); 775 mac_fini_ops(&ibd_dev_ops); 776 return (status); 777 } 778 779 #ifdef IBD_LOGGING 780 ibd_log_init(); 781 #endif 782 return (0); 783 } 784 785 int 786 _info(struct modinfo *modinfop) 787 { 788 return (mod_info(&ibd_modlinkage, modinfop)); 789 } 790 791 int 792 _fini() 793 { 794 int status; 795 796 status = mod_remove(&ibd_modlinkage); 797 if (status != 0) 798 return (status); 799 800 mac_fini_ops(&ibd_dev_ops); 801 ddi_soft_state_fini(&ibd_list); 802 #ifdef IBD_LOGGING 803 ibd_log_fini(); 804 #endif 805 return (0); 806 } 807 808 /* 809 * Convert the GID part of the mac address from network byte order 810 * to host order. 811 */ 812 static void 813 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 814 { 815 ib_sn_prefix_t nbopref; 816 ib_guid_t nboguid; 817 818 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 819 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 820 dgid->gid_prefix = b2h64(nbopref); 821 dgid->gid_guid = b2h64(nboguid); 822 } 823 824 /* 825 * Create the IPoIB address in network byte order from host order inputs. 
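 *
 * For example, ibd_get_allroutergroup() below uses this to construct the
 * all-routers redirect address (arguments shown schematically):
 *
 *	ibd_h2n_mac(rmac, IB_MC_QPN, prefix_with_scope_and_pkey,
 *	    INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP);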
826 */
827 static void
828 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
829 ib_guid_t guid)
830 {
831 ib_sn_prefix_t nbopref;
832 ib_guid_t nboguid;
833
834 mac->ipoib_qpn = htonl(qpn);
835 nbopref = h2b64(prefix);
836 nboguid = h2b64(guid);
837 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
838 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
839 }
840
841 /*
842 * Send to the appropriate all-routers group when the IBA multicast group
843 * does not exist, based on whether the target group is v4 or v6.
844 */
845 static boolean_t
846 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
847 ipoib_mac_t *rmac)
848 {
849 boolean_t retval = B_TRUE;
850 uint32_t adjscope = state->id_scope << 16;
851 uint32_t topword;
852
853 /*
854 * Copy the first 4 bytes in without assuming any alignment of
855 * input mac address; this will have IPoIB signature, flags and
856 * scope bits.
857 */
858 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
859 topword = ntohl(topword);
860
861 /*
862 * Generate proper address for IPv4/v6, adding in the Pkey properly.
863 */
864 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
865 (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
866 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
867 ((uint32_t)(state->id_pkey << 16))),
868 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
869 else
870 /*
871 * Does not have proper bits in the mgid address.
872 */
873 retval = B_FALSE;
874
875 return (retval);
876 }
877
878 /*
879 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
880 * the front of the optional src/tgt link layer address. Right now Solaris
881 * inserts the padding at the end by default. The routine that does this is
882 * nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t.
883 * So when the packet comes down from the IP layer to the IBD driver, it is
884 * in the following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
885 * This OPT_ND_HDR_T is 2 bytes in size, followed by [22 bytes of ipoib_machdr].
886 * As a result machdr is not 4 byte aligned and has 2 bytes of padding at the end.
887 *
888 * The send routine in the IBD driver changes this packet as follows:
889 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
890 * followed by [22 bytes of ipoib_machdr], resulting in a 4 byte aligned
891 * machdr.
892 *
893 * At the receiving side, ibd_process_rx again takes the above packet and
894 * moves the two bytes of front padding back to the end. This
895 * is because the IP layer does not understand padding at the front.
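 *
 * In short: on the send side the IPOIB_ADDRL (20) byte link-layer address
 * is shifted two bytes later within the option (the two vacated bytes are
 * zeroed), and on the receive side it is shifted two bytes back; the
 * IBD_PAD_NSNA() macro below implements both directions.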
896 */ 897 #define IBD_PAD_NSNA(ip6h, len, type) { \ 898 uchar_t *nd_lla_ptr; \ 899 icmp6_t *icmp6; \ 900 nd_opt_hdr_t *opt; \ 901 int i; \ 902 \ 903 icmp6 = (icmp6_t *)&ip6h[1]; \ 904 len -= sizeof (nd_neighbor_advert_t); \ 905 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 906 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 907 (len != 0)) { \ 908 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 909 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 910 ASSERT(opt != NULL); \ 911 nd_lla_ptr = (uchar_t *)&opt[1]; \ 912 if (type == IBD_SEND) { \ 913 for (i = IPOIB_ADDRL; i > 0; i--) \ 914 *(nd_lla_ptr + i + 1) = \ 915 *(nd_lla_ptr + i - 1); \ 916 } else { \ 917 for (i = 0; i < IPOIB_ADDRL; i++) \ 918 *(nd_lla_ptr + i) = \ 919 *(nd_lla_ptr + i + 2); \ 920 } \ 921 *(nd_lla_ptr + i) = 0; \ 922 *(nd_lla_ptr + i + 1) = 0; \ 923 } \ 924 } 925 926 /* 927 * Address handle entries maintained by the driver are kept in the 928 * free and active lists. Each entry starts out in the free list; 929 * it migrates to the active list when primed using ibt_get_paths() 930 * and ibt_modify_ud_dest() for transmission to a specific destination. 931 * In the active list, the entry has a reference count indicating the 932 * number of ongoing/uncompleted transmits that reference it. The 933 * entry is left in the active list even after the reference count 934 * goes to 0, since successive transmits can find it there and do 935 * not need to set up another entry (ie the path information is 936 * cached using the active list). Entries on the active list are 937 * also hashed using the destination link address as a key for faster 938 * lookups during transmits. 939 * 940 * For any destination address (unicast or multicast, whatever the 941 * join states), there will be at most one entry in the active list. 942 * Entries with a 0 reference count on the active list can be reused 943 * for a transmit to a new destination, if the free list is empty. 944 * 945 * The AH free list insertion/deletion is protected with the id_ac_mutex, 946 * since the async thread and Tx callback handlers insert/delete. The 947 * active list does not need a lock (all operations are done by the 948 * async thread) but updates to the reference count are atomically 949 * done (increments done by Tx path, decrements by the Tx callback handler). 950 */ 951 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 952 list_insert_head(&state->id_ah_free, ce) 953 #define IBD_ACACHE_GET_FREE(state) \ 954 list_get_head(&state->id_ah_free) 955 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 956 int _ret_; \ 957 list_insert_head(&state->id_ah_active, ce); \ 958 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 959 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 960 ASSERT(_ret_ == 0); \ 961 } 962 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 963 list_remove(&state->id_ah_active, ce); \ 964 (void) mod_hash_remove(state->id_ah_active_hash, \ 965 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 966 } 967 #define IBD_ACACHE_GET_ACTIVE(state) \ 968 list_get_head(&state->id_ah_active) 969 970 /* 971 * Membership states for different mcg's are tracked by two lists: 972 * the "non" list is used for promiscuous mode, when all mcg traffic 973 * needs to be inspected. This type of membership is never used for 974 * transmission, so there can not be an AH in the active list 975 * corresponding to a member in this list. This list does not need 976 * any protection, since all operations are performed by the async 977 * thread. 
978 * 979 * "Full" and "SendOnly" membership is tracked using a single list, 980 * the "full" list. This is because this single list can then be 981 * searched during transmit to a multicast group (if an AH for the 982 * mcg is not found in the active list), since at least one type 983 * of membership must be present before initiating the transmit. 984 * This list is also emptied during driver detach, since sendonly 985 * membership acquired during transmit is dropped at detach time 986 * alongwith ipv4 broadcast full membership. Insert/deletes to 987 * this list are done only by the async thread, but it is also 988 * searched in program context (see multicast disable case), thus 989 * the id_mc_mutex protects the list. The driver detach path also 990 * deconstructs the "full" list, but it ensures that the async 991 * thread will not be accessing the list (by blocking out mcg 992 * trap handling and making sure no more Tx reaping will happen). 993 * 994 * Currently, an IBA attach is done in the SendOnly case too, 995 * although this is not required. 996 */ 997 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 998 list_insert_head(&state->id_mc_full, mce) 999 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1000 list_insert_head(&state->id_mc_non, mce) 1001 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1002 ibd_mcache_find(mgid, &state->id_mc_full) 1003 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1004 ibd_mcache_find(mgid, &state->id_mc_non) 1005 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1006 list_remove(&state->id_mc_full, mce) 1007 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1008 list_remove(&state->id_mc_non, mce) 1009 1010 /* 1011 * AH and MCE active list manipulation: 1012 * 1013 * Multicast disable requests and MCG delete traps are two cases 1014 * where the active AH entry for the mcg (if any unreferenced one exists) 1015 * will be moved to the free list (to force the next Tx to the mcg to 1016 * join the MCG in SendOnly mode). Port up handling will also move AHs 1017 * from active to free list. 1018 * 1019 * In the case when some transmits are still pending on an entry 1020 * for an mcg, but a multicast disable has already been issued on the 1021 * mcg, there are some options to consider to preserve the join state 1022 * to ensure the emitted packet is properly routed on the IBA fabric. 1023 * For the AH, we can 1024 * 1. take out of active list at multicast disable time. 1025 * 2. take out of active list only when last pending Tx completes. 1026 * For the MCE, we can 1027 * 3. take out of active list at multicast disable time. 1028 * 4. take out of active list only when last pending Tx completes. 1029 * 5. move from active list to stale list at multicast disable time. 1030 * We choose to use 2,4. We use option 4 so that if a multicast enable 1031 * is tried before the pending Tx completes, the enable code finds the 1032 * mce in the active list and just has to make sure it will not be reaped 1033 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1034 * a stale list (#5) that would be checked in the enable code would need 1035 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1036 * after the multicast disable would try to put an AH in the active list, 1037 * and associate the mce it finds in the active list to this new AH, 1038 * whereas the mce is already associated with the previous AH (taken off 1039 * the active list), and will be removed once the pending Tx's complete 1040 * (unless a reference count on mce's is implemented). 
One implication of 1041 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1042 * grab new references on the AH, further delaying the leave. 1043 * 1044 * In the case of mcg delete (or create) trap when the port is sendonly 1045 * joined, the AH and MCE handling is different: the AH and MCE has to be 1046 * immediately taken off the active lists (forcing a join and path lookup 1047 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1048 * to an mcg as it is repeatedly created and deleted and goes thru 1049 * reincarnations). 1050 * 1051 * When a port is already sendonly joined, and a multicast enable is 1052 * attempted, the same mce structure is promoted; this ensures only a 1053 * single mce on the active list tracks the most powerful join state. 1054 * 1055 * In the case of port up event handling, the MCE for sendonly membership 1056 * is freed up, and the ACE is put into the free list as soon as possible 1057 * (depending on whether posted Tx's have completed). For fullmembership 1058 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1059 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1060 * done; else the mce is deconstructed (mc_fullreap case). 1061 * 1062 * MCG creation and deletion trap handling: 1063 * 1064 * These traps are unreliable (meaning sometimes the trap might never 1065 * be delivered to the subscribed nodes) and may arrive out-of-order 1066 * since they use UD transport. An alternative to relying on these 1067 * unreliable traps is to poll for mcg presence every so often, but 1068 * instead of doing that, we try to be as conservative as possible 1069 * while handling the traps, and hope that the traps do arrive at 1070 * the subscribed nodes soon. Note that if a node is fullmember 1071 * joined to an mcg, it can not possibly receive a mcg create/delete 1072 * trap for that mcg (by fullmember definition); if it does, it is 1073 * an old trap from a previous incarnation of the mcg. 1074 * 1075 * Whenever a trap is received, the driver cleans up its sendonly 1076 * membership to the group; we choose to do a sendonly leave even 1077 * on a creation trap to handle the case of a prior deletion of the mcg 1078 * having gone unnoticed. Consider an example scenario: 1079 * T1: MCG M is deleted, and fires off deletion trap D1. 1080 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1081 * T3: Node N tries to transmit to M, joining in sendonly mode. 1082 * T4: MCG M is deleted, and fires off deletion trap D2. 1083 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1084 * If the trap is D2, then a LEAVE is not required, since the mcg 1085 * is already deleted; but if it is D1, a LEAVE is required. A safe 1086 * approach is to always LEAVE, but the SM may be confused if it 1087 * receives a LEAVE without a prior JOIN. 1088 * 1089 * Management of the non-membership to an mcg is similar to the above, 1090 * except that if the interface is in promiscuous mode, it is required 1091 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1092 * if the re-join attempt fails (in which case a warning message needs 1093 * to be printed), it is not clear whether it failed due to the mcg not 1094 * existing, or some fabric/hca issues, due to the delayed nature of 1095 * trap delivery. Querying the SA to establish presence/absence of the 1096 * mcg is also racy at best. 
Thus, the driver just prints a warning 1097 * message when it can not rejoin after receiving a create trap, although 1098 * this might be (on rare occassions) a mis-warning if the create trap is 1099 * received after the mcg was deleted. 1100 */ 1101 1102 /* 1103 * Implementation of atomic "recycle" bits and reference count 1104 * on address handles. This utilizes the fact that max reference 1105 * count on any handle is limited by number of send wqes, thus 1106 * high bits in the ac_ref field can be used as the recycle bits, 1107 * and only the low bits hold the number of pending Tx requests. 1108 * This atomic AH reference counting allows the Tx completion 1109 * handler not to acquire the id_ac_mutex to process every completion, 1110 * thus reducing lock contention problems between completion and 1111 * the Tx path. 1112 */ 1113 #define CYCLEVAL 0x80000 1114 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1115 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1116 #define GET_REF(ace) ((ace)->ac_ref) 1117 #define GET_REF_CYCLE(ace) ( \ 1118 /* \ 1119 * Make sure "cycle" bit is set. \ 1120 */ \ 1121 ASSERT(CYCLE_SET(ace)), \ 1122 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1123 ) 1124 #define INC_REF(ace, num) { \ 1125 atomic_add_32(&(ace)->ac_ref, num); \ 1126 } 1127 #define SET_CYCLE_IF_REF(ace) ( \ 1128 CYCLE_SET(ace) ? B_TRUE : \ 1129 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1130 CYCLEVAL ? \ 1131 /* \ 1132 * Clear the "cycle" bit we just set; \ 1133 * ref count known to be 0 from above. \ 1134 */ \ 1135 CLEAR_REFCYCLE(ace), B_FALSE : \ 1136 /* \ 1137 * We set "cycle" bit; let caller know. \ 1138 */ \ 1139 B_TRUE \ 1140 ) 1141 #define DEC_REF_DO_CYCLE(ace) ( \ 1142 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1143 CYCLEVAL ? \ 1144 /* \ 1145 * Ref count known to be 0 from above. \ 1146 */ \ 1147 B_TRUE : \ 1148 B_FALSE \ 1149 ) 1150 1151 static void * 1152 list_get_head(list_t *list) 1153 { 1154 list_node_t *lhead = list_head(list); 1155 1156 if (lhead != NULL) 1157 list_remove(list, lhead); 1158 return (lhead); 1159 } 1160 1161 /* 1162 * This is always guaranteed to be able to queue the work. 1163 */ 1164 static void 1165 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1166 { 1167 /* Initialize request */ 1168 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1169 ptr->rq_op = op; 1170 1171 /* 1172 * Queue provided slot onto request pool. 1173 */ 1174 mutex_enter(&state->id_acache_req_lock); 1175 list_insert_tail(&state->id_req_list, ptr); 1176 1177 /* Go, fetch, async thread */ 1178 cv_signal(&state->id_acache_req_cv); 1179 mutex_exit(&state->id_acache_req_lock); 1180 } 1181 1182 /* 1183 * Main body of the per interface async thread. 1184 */ 1185 static void 1186 ibd_async_work(ibd_state_t *state) 1187 { 1188 ibd_req_t *ptr; 1189 callb_cpr_t cprinfo; 1190 1191 mutex_enter(&state->id_acache_req_lock); 1192 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1193 callb_generic_cpr, "ibd_async_work"); 1194 1195 for (;;) { 1196 ptr = list_get_head(&state->id_req_list); 1197 if (ptr != NULL) { 1198 mutex_exit(&state->id_acache_req_lock); 1199 1200 /* 1201 * Once we have done the operation, there is no 1202 * guarantee the request slot is going to be valid, 1203 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1204 * TRAP). 1205 * 1206 * Perform the request. 
1207 */
1208 switch (ptr->rq_op) {
1209 case IBD_ASYNC_GETAH:
1210 ibd_async_acache(state, &ptr->rq_mac);
1211 break;
1212 case IBD_ASYNC_JOIN:
1213 case IBD_ASYNC_LEAVE:
1214 ibd_async_multicast(state,
1215 ptr->rq_gid, ptr->rq_op);
1216 break;
1217 case IBD_ASYNC_PROMON:
1218 ibd_async_setprom(state);
1219 break;
1220 case IBD_ASYNC_PROMOFF:
1221 ibd_async_unsetprom(state);
1222 break;
1223 case IBD_ASYNC_REAP:
1224 ibd_async_reap_group(state,
1225 ptr->rq_ptr, ptr->rq_gid,
1226 IB_MC_JSTATE_FULL);
1227 /*
1228 * the req buf is part of the mce
1229 * structure, so we do not need
1230 * to free it here.
1231 */
1232 ptr = NULL;
1233 break;
1234 case IBD_ASYNC_TRAP:
1235 ibd_async_trap(state, ptr);
1236 break;
1237 case IBD_ASYNC_SCHED:
1238 ibd_async_txsched(state);
1239 break;
1240 case IBD_ASYNC_LINK:
1241 ibd_async_link(state, ptr);
1242 break;
1243 case IBD_ASYNC_EXIT:
1244 mutex_enter(&state->id_acache_req_lock);
1245 #ifndef __lock_lint
1246 CALLB_CPR_EXIT(&cprinfo);
1247 #else
1248 mutex_exit(&state->id_acache_req_lock);
1249 #endif
1250 return;
1251 }
1252 if (ptr != NULL)
1253 kmem_cache_free(state->id_req_kmc, ptr);
1254
1255 mutex_enter(&state->id_acache_req_lock);
1256 } else {
1257 #ifndef __lock_lint
1258 /*
1259 * Nothing to do: wait till new request arrives.
1260 */
1261 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1262 cv_wait(&state->id_acache_req_cv,
1263 &state->id_acache_req_lock);
1264 CALLB_CPR_SAFE_END(&cprinfo,
1265 &state->id_acache_req_lock);
1266 #endif
1267 }
1268 }
1269
1270 /*NOTREACHED*/
1271 _NOTE(NOT_REACHED)
1272 }
1273
1274 /*
1275 * Return when it is safe to queue requests to the async daemon; primarily
1276 * for subnet trap and async event handling. Disallow requests before the
1277 * daemon is created, and when interface deinitialization starts.
1278 */
1279 static boolean_t
1280 ibd_async_safe(ibd_state_t *state)
1281 {
1282 mutex_enter(&state->id_trap_lock);
1283 if (state->id_trap_stop) {
1284 mutex_exit(&state->id_trap_lock);
1285 return (B_FALSE);
1286 }
1287 state->id_trap_inprog++;
1288 mutex_exit(&state->id_trap_lock);
1289 return (B_TRUE);
1290 }
1291
1292 /*
1293 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1294 * trap or event handling to complete to kill the async thread and deconstruct
1295 * the mcg/ace list.
1296 */
1297 static void
1298 ibd_async_done(ibd_state_t *state)
1299 {
1300 mutex_enter(&state->id_trap_lock);
1301 if (--state->id_trap_inprog == 0)
1302 cv_signal(&state->id_trap_cv);
1303 mutex_exit(&state->id_trap_lock);
1304 }
1305
1306 /*
1307 * Hash functions:
1308 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1309 * ibd_hash_key_cmp: Compares two keys; returns 0 on a match, else 1.
1310 * These operate on mac addresses input into ibd_send, but there is no
1311 * guarantee on the alignment of the ipoib_mac_t structure.
1312 */
1313 /*ARGSUSED*/
1314 static uint_t
1315 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1316 {
1317 ulong_t ptraddr = (ulong_t)key;
1318 uint_t hval;
1319
1320 /*
1321 * If the input address is 4 byte aligned, we can just dereference
1322 * it. This is most common, since IP will send in a 4 byte aligned
1323 * IP header, which implies the 24 byte IPoIB pseudo header will be
1324 * 4 byte aligned too.
1325 */ 1326 if ((ptraddr & 3) == 0) 1327 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1328 1329 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1330 return (hval); 1331 } 1332 1333 static int 1334 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1335 { 1336 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1337 return (0); 1338 else 1339 return (1); 1340 } 1341 1342 /* 1343 * Initialize all the per interface caches and lists; AH cache, 1344 * MCG list etc. 1345 */ 1346 static int 1347 ibd_acache_init(ibd_state_t *state) 1348 { 1349 ibd_ace_t *ce; 1350 int i; 1351 1352 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1353 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1354 1355 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1356 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1357 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1358 offsetof(ibd_ace_t, ac_list)); 1359 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1360 offsetof(ibd_ace_t, ac_list)); 1361 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1362 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1363 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1364 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1365 offsetof(ibd_mce_t, mc_list)); 1366 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1367 offsetof(ibd_mce_t, mc_list)); 1368 list_create(&state->id_req_list, sizeof (ibd_req_t), 1369 offsetof(ibd_req_t, rq_list)); 1370 1371 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1372 IBD_NUM_AH, KM_SLEEP); 1373 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1374 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1375 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1376 ibd_acache_fini(state); 1377 return (DDI_FAILURE); 1378 } else { 1379 CLEAR_REFCYCLE(ce); 1380 ce->ac_mce = NULL; 1381 IBD_ACACHE_INSERT_FREE(state, ce); 1382 } 1383 } 1384 return (DDI_SUCCESS); 1385 } 1386 1387 static void 1388 ibd_acache_fini(ibd_state_t *state) 1389 { 1390 ibd_ace_t *ptr; 1391 1392 mutex_enter(&state->id_ac_mutex); 1393 1394 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1395 ASSERT(GET_REF(ptr) == 0); 1396 (void) ibt_free_ud_dest(ptr->ac_dest); 1397 } 1398 1399 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1400 ASSERT(GET_REF(ptr) == 0); 1401 (void) ibt_free_ud_dest(ptr->ac_dest); 1402 } 1403 1404 list_destroy(&state->id_ah_free); 1405 list_destroy(&state->id_ah_active); 1406 list_destroy(&state->id_mc_full); 1407 list_destroy(&state->id_mc_non); 1408 list_destroy(&state->id_req_list); 1409 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1410 mutex_exit(&state->id_ac_mutex); 1411 mutex_destroy(&state->id_ac_mutex); 1412 mutex_destroy(&state->id_mc_mutex); 1413 mutex_destroy(&state->id_acache_req_lock); 1414 cv_destroy(&state->id_acache_req_cv); 1415 } 1416 1417 /* 1418 * Search AH active hash list for a cached path to input destination. 1419 * If we are "just looking", hold == F. When we are in the Tx path, 1420 * we set hold == T to grab a reference on the AH so that it can not 1421 * be recycled to a new destination while the Tx request is posted. 1422 */ 1423 static ibd_ace_t * 1424 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1425 { 1426 ibd_ace_t *ptr; 1427 1428 ASSERT(mutex_owned(&state->id_ac_mutex)); 1429 1430 /* 1431 * Do hash search. 
1432 */ 1433 if (mod_hash_find(state->id_ah_active_hash, 1434 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1435 if (hold) 1436 INC_REF(ptr, num); 1437 return (ptr); 1438 } 1439 return (NULL); 1440 } 1441 1442 /* 1443 * This is called by the tx side; if an initialized AH is found in 1444 * the active list, it is locked down and can be used; if no entry 1445 * is found, an async request is queued to do path resolution. 1446 */ 1447 static ibd_ace_t * 1448 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1449 { 1450 ibd_ace_t *ptr; 1451 ibd_req_t *req; 1452 1453 /* 1454 * Only attempt to print when we can; in the mdt pattr case, the 1455 * address is not aligned properly. 1456 */ 1457 if (((ulong_t)mac & 3) == 0) { 1458 DPRINT(4, 1459 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1460 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1461 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1462 htonl(mac->ipoib_gidsuff[1])); 1463 } 1464 1465 mutex_enter(&state->id_ac_mutex); 1466 1467 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1468 mutex_exit(&state->id_ac_mutex); 1469 return (ptr); 1470 } 1471 1472 /* 1473 * Implementation of a single outstanding async request; if 1474 * the operation is not started yet, queue a request and move 1475 * to ongoing state. Remember in id_ah_addr for which address 1476 * we are queueing the request, in case we need to flag an error; 1477 * Any further requests, for the same or different address, until 1478 * the operation completes, is sent back to GLDv3 to be retried. 1479 * The async thread will update id_ah_op with an error indication 1480 * or will set it to indicate the next look up can start; either 1481 * way, it will mac_tx_update() so that all blocked requests come 1482 * back here. 1483 */ 1484 *err = EAGAIN; 1485 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1486 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1487 if (req != NULL) { 1488 /* 1489 * We did not even find the entry; queue a request 1490 * for it. 1491 */ 1492 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1493 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1494 state->id_ah_op = IBD_OP_ONGOING; 1495 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1496 } 1497 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1498 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1499 /* 1500 * Check the status of the pathrecord lookup request 1501 * we had queued before. 1502 */ 1503 if (state->id_ah_op == IBD_OP_ERRORED) { 1504 *err = EFAULT; 1505 state->id_ah_error++; 1506 } else { 1507 /* 1508 * IBD_OP_ROUTERED case: We need to send to the 1509 * all-router MCG. If we can find the AH for 1510 * the mcg, the Tx will be attempted. If we 1511 * do not find the AH, we return NORESOURCES 1512 * to retry. 1513 */ 1514 ipoib_mac_t routermac; 1515 1516 (void) ibd_get_allroutergroup(state, mac, &routermac); 1517 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1518 numwqe); 1519 } 1520 state->id_ah_op = IBD_OP_NOTSTARTED; 1521 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1522 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1523 /* 1524 * This case can happen when we get a higher band 1525 * packet. The easiest way is to reset the state machine 1526 * to accommodate the higher priority packet. 
1527 */ 1528 state->id_ah_op = IBD_OP_NOTSTARTED; 1529 } 1530 mutex_exit(&state->id_ac_mutex); 1531 1532 return (ptr); 1533 } 1534 1535 /* 1536 * Grab a not-currently-in-use AH/PathRecord from the active 1537 * list to recycle to a new destination. Only the async thread 1538 * executes this code. 1539 */ 1540 static ibd_ace_t * 1541 ibd_acache_get_unref(ibd_state_t *state) 1542 { 1543 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1544 1545 ASSERT(mutex_owned(&state->id_ac_mutex)); 1546 1547 /* 1548 * Do plain linear search. 1549 */ 1550 while (ptr != NULL) { 1551 /* 1552 * Note that it is possible that the "cycle" bit 1553 * is set on the AH w/o any reference count. The 1554 * mcg must have been deleted, and the tx cleanup 1555 * just decremented the reference count to 0, but 1556 * hasn't gotten around to grabbing the id_ac_mutex 1557 * to move the AH into the free list. 1558 */ 1559 if (GET_REF(ptr) == 0) { 1560 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1561 break; 1562 } 1563 ptr = list_next(&state->id_ah_active, ptr); 1564 } 1565 return (ptr); 1566 } 1567 1568 /* 1569 * Invoked to clean up AH from active list in case of multicast 1570 * disable and to handle sendonly memberships during mcg traps. 1571 * And for port up processing for multicast and unicast AHs. 1572 * Normally, the AH is taken off the active list, and put into 1573 * the free list to be recycled for a new destination. In case 1574 * Tx requests on the AH have not completed yet, the AH is marked 1575 * for reaping (which will put the AH on the free list) once the Tx's 1576 * complete; in this case, depending on the "force" input, we take 1577 * out the AH from the active list right now, or leave it also for 1578 * the reap operation. Returns TRUE if the AH is taken off the active 1579 * list (and either put into the free list right now, or arranged for 1580 * later), FALSE otherwise. 1581 */ 1582 static boolean_t 1583 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1584 { 1585 ibd_ace_t *acactive; 1586 boolean_t ret = B_TRUE; 1587 1588 ASSERT(mutex_owned(&state->id_ac_mutex)); 1589 1590 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1591 1592 /* 1593 * Note that the AH might already have the cycle bit set 1594 * on it; this might happen if sequences of multicast 1595 * enables and disables are coming so fast, that posted 1596 * Tx's to the mcg have not completed yet, and the cycle 1597 * bit is set successively by each multicast disable. 1598 */ 1599 if (SET_CYCLE_IF_REF(acactive)) { 1600 if (!force) { 1601 /* 1602 * The ace is kept on the active list, further 1603 * Tx's can still grab a reference on it; the 1604 * ace is reaped when all pending Tx's 1605 * referencing the AH complete. 1606 */ 1607 ret = B_FALSE; 1608 } else { 1609 /* 1610 * In the mcg trap case, we always pull the 1611 * AH from the active list. And also the port 1612 * up multi/unicast case. 1613 */ 1614 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1615 acactive->ac_mce = NULL; 1616 } 1617 } else { 1618 /* 1619 * Determined the ref count is 0, thus reclaim 1620 * immediately after pulling out the ace from 1621 * the active list. 1622 */ 1623 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1624 acactive->ac_mce = NULL; 1625 IBD_ACACHE_INSERT_FREE(state, acactive); 1626 } 1627 1628 } 1629 return (ret); 1630 } 1631 1632 /* 1633 * Helper function for async path record lookup. If we are trying to 1634 * Tx to a MCG, check our membership, possibly trying to join the 1635 * group if required. 
If that fails, try to send the packet to the 1636 * all router group (indicated by the redirect output), pointing 1637 * the input mac address to the router mcg address. 1638 */ 1639 static ibd_mce_t * 1640 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1641 { 1642 ib_gid_t mgid; 1643 ibd_mce_t *mce; 1644 ipoib_mac_t routermac; 1645 1646 *redirect = B_FALSE; 1647 ibd_n2h_gid(mac, &mgid); 1648 1649 /* 1650 * Check the FullMember+SendOnlyNonMember list. 1651 * Since we are the only one who manipulates the 1652 * id_mc_full list, no locks are needed. 1653 */ 1654 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1655 if (mce != NULL) { 1656 DPRINT(4, "ibd_async_mcache : already joined to group"); 1657 return (mce); 1658 } 1659 1660 /* 1661 * Not found; try to join(SendOnlyNonMember) and attach. 1662 */ 1663 DPRINT(4, "ibd_async_mcache : not joined to group"); 1664 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1665 NULL) { 1666 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1667 return (mce); 1668 } 1669 1670 /* 1671 * MCGroup not present; try to join the all-router group. If 1672 * any of the following steps succeed, we will be redirecting 1673 * to the all router group. 1674 */ 1675 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1676 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1677 return (NULL); 1678 *redirect = B_TRUE; 1679 ibd_n2h_gid(&routermac, &mgid); 1680 bcopy(&routermac, mac, IPOIB_ADDRL); 1681 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1682 mgid.gid_prefix, mgid.gid_guid); 1683 1684 /* 1685 * Are we already joined to the router group? 1686 */ 1687 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1688 DPRINT(4, "ibd_async_mcache : using already joined router" 1689 "group\n"); 1690 return (mce); 1691 } 1692 1693 /* 1694 * Can we join(SendOnlyNonMember) the router group? 1695 */ 1696 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1697 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1698 NULL) { 1699 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1700 return (mce); 1701 } 1702 1703 return (NULL); 1704 } 1705 1706 /* 1707 * Async path record lookup code. 1708 */ 1709 static void 1710 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1711 { 1712 ibd_ace_t *ce; 1713 ibd_mce_t *mce = NULL; 1714 ibt_path_attr_t path_attr; 1715 ibt_path_info_t path_info; 1716 ib_gid_t destgid; 1717 char ret = IBD_OP_NOTSTARTED; 1718 1719 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1720 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1721 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1722 htonl(mac->ipoib_gidsuff[1])); 1723 1724 /* 1725 * Check whether we are trying to transmit to a MCG. 1726 * In that case, we need to make sure we are a member of 1727 * the MCG. 1728 */ 1729 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1730 boolean_t redirected; 1731 1732 /* 1733 * If we can not find or join the group or even 1734 * redirect, error out. 1735 */ 1736 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1737 NULL) { 1738 state->id_ah_op = IBD_OP_ERRORED; 1739 return; 1740 } 1741 1742 /* 1743 * If we got redirected, we need to determine whether 1744 * the AH for the new mcg is in the cache already, and 1745 * not pull it in then; otherwise proceed to get the 1746 * path for the new mcg. 
There is no guarantee that 1747 * if the AH is currently in the cache, it will still be 1748 * there when we look in ibd_acache_lookup(), but that's 1749 * okay, we will come back here. 1750 */ 1751 if (redirected) { 1752 ret = IBD_OP_ROUTERED; 1753 DPRINT(4, "ibd_async_acache : redirected to " 1754 "%08X:%08X:%08X:%08X:%08X", 1755 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1756 htonl(mac->ipoib_gidpref[1]), 1757 htonl(mac->ipoib_gidsuff[0]), 1758 htonl(mac->ipoib_gidsuff[1])); 1759 1760 mutex_enter(&state->id_ac_mutex); 1761 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1762 state->id_ah_op = IBD_OP_ROUTERED; 1763 mutex_exit(&state->id_ac_mutex); 1764 DPRINT(4, "ibd_async_acache : router AH found"); 1765 return; 1766 } 1767 mutex_exit(&state->id_ac_mutex); 1768 } 1769 } 1770 1771 /* 1772 * Get an AH from the free list. 1773 */ 1774 mutex_enter(&state->id_ac_mutex); 1775 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1776 /* 1777 * No free ones; try to grab an unreferenced active 1778 * one. Maybe we need to make the active list LRU, 1779 * but that will create more work for Tx callbacks. 1780 * Is there a way of not having to pull out the 1781 * entry from the active list, but just indicate it 1782 * is being recycled? Yes, but that creates one more 1783 * check in the fast lookup path. 1784 */ 1785 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1786 /* 1787 * Pretty serious shortage now. 1788 */ 1789 state->id_ah_op = IBD_OP_NOTSTARTED; 1790 mutex_exit(&state->id_ac_mutex); 1791 DPRINT(10, "ibd_async_acache : failed to find AH " 1792 "slot\n"); 1793 return; 1794 } 1795 /* 1796 * We could check whether ac_mce points to a SendOnly 1797 * member and drop that membership now. Or do it lazily 1798 * at detach time. 1799 */ 1800 ce->ac_mce = NULL; 1801 } 1802 mutex_exit(&state->id_ac_mutex); 1803 ASSERT(ce->ac_mce == NULL); 1804 1805 /* 1806 * Update the entry. 1807 */ 1808 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1809 1810 bzero(&path_info, sizeof (path_info)); 1811 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1812 path_attr.pa_sgid = state->id_sgid; 1813 path_attr.pa_num_dgids = 1; 1814 ibd_n2h_gid(&ce->ac_mac, &destgid); 1815 path_attr.pa_dgids = &destgid; 1816 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1817 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1818 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1819 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1820 goto error; 1821 } 1822 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1823 ntohl(ce->ac_mac.ipoib_qpn), 1824 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1825 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1826 goto error; 1827 } 1828 1829 /* 1830 * mce is set whenever an AH is being associated with a 1831 * MCG; this will come in handy when we leave the MCG. The 1832 * lock protects Tx fastpath from scanning the active list. 1833 */ 1834 if (mce != NULL) 1835 ce->ac_mce = mce; 1836 mutex_enter(&state->id_ac_mutex); 1837 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1838 state->id_ah_op = ret; 1839 mutex_exit(&state->id_ac_mutex); 1840 return; 1841 error: 1842 /* 1843 * We might want to drop SendOnly membership here if we 1844 * joined above. The lock protects Tx callbacks inserting 1845 * into the free list. 
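 * Either way, the unused ace goes back on the free list below and
 * id_ah_op is marked IBD_OP_ERRORED.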
1846 */ 1847 mutex_enter(&state->id_ac_mutex); 1848 state->id_ah_op = IBD_OP_ERRORED; 1849 IBD_ACACHE_INSERT_FREE(state, ce); 1850 mutex_exit(&state->id_ac_mutex); 1851 } 1852 1853 /* 1854 * While restoring port's presence on the subnet on a port up, it is possible 1855 * that the port goes down again. 1856 */ 1857 static void 1858 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1859 { 1860 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1861 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1862 LINK_STATE_UP; 1863 ibd_mce_t *mce, *pmce; 1864 ibd_ace_t *ace, *pace; 1865 1866 DPRINT(10, "ibd_async_link(): %d", opcode); 1867 1868 /* 1869 * On a link up, revalidate the link speed/width. No point doing 1870 * this on a link down, since we will be unable to do SA operations, 1871 * defaulting to the lowest speed. Also notice that we update our 1872 * notion of speed before calling mac_link_update(), which will do 1873 * necessary higher level notifications for speed changes. 1874 */ 1875 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1876 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1877 state->id_link_speed = ibd_get_portspeed(state); 1878 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1879 } 1880 1881 /* 1882 * Do all the work required to establish our presence on 1883 * the subnet. 1884 */ 1885 if (opcode == IBD_LINK_UP_ABSENT) { 1886 /* 1887 * If in promiscuous mode ... 1888 */ 1889 if (state->id_prom_op == IBD_OP_COMPLETED) { 1890 /* 1891 * Drop all nonmembership. 1892 */ 1893 ibd_async_unsetprom(state); 1894 1895 /* 1896 * Then, try to regain nonmembership to all mcg's. 1897 */ 1898 ibd_async_setprom(state); 1899 1900 } 1901 1902 /* 1903 * Drop all sendonly membership (which also gets rid of the 1904 * AHs); try to reacquire all full membership. 1905 */ 1906 mce = list_head(&state->id_mc_full); 1907 while ((pmce = mce) != NULL) { 1908 mce = list_next(&state->id_mc_full, mce); 1909 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1910 ibd_leave_group(state, 1911 pmce->mc_info.mc_adds_vect.av_dgid, 1912 IB_MC_JSTATE_SEND_ONLY_NON); 1913 else 1914 ibd_reacquire_group(state, pmce); 1915 } 1916 1917 /* 1918 * Recycle all active AHs to free list (and if there are 1919 * pending posts, make sure they will go into the free list 1920 * once the Tx's complete). Grab the lock to prevent 1921 * concurrent Tx's as well as Tx cleanups. 1922 */ 1923 mutex_enter(&state->id_ac_mutex); 1924 ace = list_head(&state->id_ah_active); 1925 while ((pace = ace) != NULL) { 1926 boolean_t cycled; 1927 1928 ace = list_next(&state->id_ah_active, ace); 1929 mce = pace->ac_mce; 1930 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1931 B_TRUE); 1932 /* 1933 * If this is for an mcg, it must be for a fullmember, 1934 * since we got rid of send-only members above when 1935 * processing the mce list. 1936 */ 1937 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1938 IB_MC_JSTATE_FULL))); 1939 1940 /* 1941 * Check if the fullmember mce needs to be torn down, 1942 * i.e., whether the DLPI disable has already been done. 1943 * If so, do some of the work of tx_cleanup, namely 1944 * causing leave (which will fail), detach and 1945 * mce-freeing. tx_cleanup will put the AH into free 1946 * list. The reason to duplicate some of this 1947 * tx_cleanup work is because we want to delete the 1948 * AH right now instead of waiting for tx_cleanup, to 1949 * force subsequent Tx's to reacquire an AH.
1950 */ 1951 if ((mce != NULL) && (mce->mc_fullreap)) 1952 ibd_async_reap_group(state, mce, 1953 mce->mc_info.mc_adds_vect.av_dgid, 1954 mce->mc_jstate); 1955 } 1956 mutex_exit(&state->id_ac_mutex); 1957 } 1958 1959 /* 1960 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1961 * (which stops further events from being delivered) before 1962 * mac_unregister(). At this point, it is guaranteed that mac_register 1963 * has already been done. 1964 */ 1965 mutex_enter(&state->id_link_mutex); 1966 state->id_link_state = lstate; 1967 mac_link_update(state->id_mh, lstate); 1968 mutex_exit(&state->id_link_mutex); 1969 1970 ibd_async_done(state); 1971 } 1972 1973 /* 1974 * Check the pkey table to see if we can find the pkey we're looking for. 1975 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1976 * failure. 1977 */ 1978 static int 1979 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1980 uint16_t *pkix) 1981 { 1982 uint16_t ndx; 1983 1984 ASSERT(pkix != NULL); 1985 1986 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1987 if (pkey_tbl[ndx] == pkey) { 1988 *pkix = ndx; 1989 return (0); 1990 } 1991 } 1992 return (-1); 1993 } 1994 1995 /* 1996 * When the link is notified up, we need to do a few things, based 1997 * on the port's current p_init_type_reply claiming a reinit has been 1998 * done or not. The reinit steps are: 1999 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2000 * the old Pkey and GID0 are correct. 2001 * 2. Register for mcg traps (already done by ibmf). 2002 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2003 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2004 * 4. Give up all sendonly memberships. 2005 * 5. Acquire all full memberships. 2006 * 6. In promiscuous mode, acquire all non memberships. 2007 * 7. Recycle all AHs to free list. 2008 */ 2009 static void 2010 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2011 { 2012 ibt_hca_portinfo_t *port_infop = NULL; 2013 ibt_status_t ibt_status; 2014 uint_t psize, port_infosz; 2015 ibd_link_op_t opcode; 2016 ibd_req_t *req; 2017 link_state_t new_link_state = LINK_STATE_UP; 2018 uint8_t itreply; 2019 uint16_t pkix; 2020 int ret; 2021 2022 /* 2023 * Let's not race with a plumb or an unplumb; if we detect a 2024 * pkey relocation event later on here, we may have to restart. 2025 */ 2026 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2027 2028 mutex_enter(&state->id_link_mutex); 2029 2030 /* 2031 * If the init code in ibd_m_start hasn't yet set up the 2032 * pkey/gid, nothing to do; that code will set the link state. 2033 */ 2034 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2035 mutex_exit(&state->id_link_mutex); 2036 goto link_mod_return; 2037 } 2038 2039 /* 2040 * If this routine was called in response to a port down event, 2041 * we just need to see if this should be informed. 2042 */ 2043 if (code == IBT_ERROR_PORT_DOWN) { 2044 new_link_state = LINK_STATE_DOWN; 2045 goto update_link_state; 2046 } 2047 2048 /* 2049 * If it's not a port down event we've received, try to get the port 2050 * attributes first. If we fail here, the port is as good as down. 2051 * Otherwise, if the link went down by the time the handler gets 2052 * here, give up - we cannot even validate the pkey/gid since those 2053 * are not valid and this is as bad as a port down anyway. 
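 * Either way, the net effect of the check below is the same: anything
 * other than a successful query of this port, with p_linkstate
 * reported as IBT_PORT_ACTIVE, is treated as a link down.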
2054 */ 2055 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2056 &port_infop, &psize, &port_infosz); 2057 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2058 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2059 new_link_state = LINK_STATE_DOWN; 2060 goto update_link_state; 2061 } 2062 2063 /* 2064 * Check the SM InitTypeReply flags. If both NoLoadReply and 2065 * PreserveContentReply are 0, we don't know anything about the 2066 * data loaded into the port attributes, so we need to verify 2067 * if gid0 and pkey are still valid. 2068 */ 2069 itreply = port_infop->p_init_type_reply; 2070 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2071 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2072 /* 2073 * Check to see if the subnet part of GID0 has changed. If 2074 * not, check the simple case first to see if the pkey 2075 * index is the same as before; finally check to see if the 2076 * pkey has been relocated to a different index in the table. 2077 */ 2078 if (bcmp(port_infop->p_sgid_tbl, 2079 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2080 2081 new_link_state = LINK_STATE_DOWN; 2082 2083 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2084 state->id_pkey) { 2085 2086 new_link_state = LINK_STATE_UP; 2087 2088 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2089 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2090 2091 ibt_free_portinfo(port_infop, port_infosz); 2092 mutex_exit(&state->id_link_mutex); 2093 2094 /* 2095 * Currently a restart is required if our pkey has moved 2096 * in the pkey table. If we get the ibt_recycle_ud() to 2097 * work as documented (expected), we may be able to 2098 * avoid a complete restart. Note that we've already 2099 * marked both the start and stop 'in-progress' flags, 2100 * so it is ok to go ahead and do this restart. 2101 */ 2102 ibd_undo_start(state, LINK_STATE_DOWN); 2103 if ((ret = ibd_start(state)) != 0) { 2104 DPRINT(10, "ibd_restart: cannot restart, " 2105 "ret=%d", ret); 2106 } 2107 2108 goto link_mod_return; 2109 } else { 2110 new_link_state = LINK_STATE_DOWN; 2111 } 2112 } 2113 2114 update_link_state: 2115 if (port_infop) { 2116 ibt_free_portinfo(port_infop, port_infosz); 2117 } 2118 2119 /* 2120 * If the old state is the same as the new state, nothing to do 2121 */ 2122 if (state->id_link_state == new_link_state) { 2123 mutex_exit(&state->id_link_mutex); 2124 goto link_mod_return; 2125 } 2126 2127 /* 2128 * Ok, so there was a link state change; see if it's safe to ask 2129 * the async thread to do the work 2130 */ 2131 if (!ibd_async_safe(state)) { 2132 state->id_link_state = new_link_state; 2133 mutex_exit(&state->id_link_mutex); 2134 goto link_mod_return; 2135 } 2136 2137 mutex_exit(&state->id_link_mutex); 2138 2139 /* 2140 * If we're reporting a link up, check InitTypeReply to see if 2141 * the SM has ensured that the port's presence in mcg, traps, 2142 * etc. is intact. 
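 * For the link-up case, the decision below reduces to a single flag
 * test (a condensed sketch of the code that follows):
 *
 *	if (itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
 *		opcode = IBD_LINK_UP;		(presence preserved by SM)
 *	else
 *		opcode = IBD_LINK_UP_ABSENT;	(rejoin/recycle on async daemon)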
2143 */ 2144 if (new_link_state == LINK_STATE_DOWN) { 2145 opcode = IBD_LINK_DOWN; 2146 } else { 2147 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2148 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2149 opcode = IBD_LINK_UP; 2150 } else { 2151 opcode = IBD_LINK_UP_ABSENT; 2152 } 2153 } 2154 2155 /* 2156 * Queue up a request for ibd_async_link() to handle this link 2157 * state change event 2158 */ 2159 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2160 req->rq_ptr = (void *)opcode; 2161 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2162 2163 link_mod_return: 2164 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2165 } 2166 2167 /* 2168 * For the port up/down events, IBTL guarantees there will not be concurrent 2169 * invocations of the handler. IBTL might coalesce link transition events, 2170 * and not invoke the handler for _each_ up/down transition, but it will 2171 * invoke the handler with last known state 2172 */ 2173 static void 2174 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2175 ibt_async_code_t code, ibt_async_event_t *event) 2176 { 2177 ibd_state_t *state = (ibd_state_t *)clnt_private; 2178 2179 switch (code) { 2180 case IBT_ERROR_CATASTROPHIC_CHAN: 2181 ibd_print_warn(state, "catastrophic channel error"); 2182 break; 2183 case IBT_ERROR_CQ: 2184 ibd_print_warn(state, "completion queue error"); 2185 break; 2186 case IBT_PORT_CHANGE_EVENT: 2187 /* 2188 * Events will be delivered to all instances that have 2189 * done ibt_open_hca() but not yet done ibt_close_hca(). 2190 * Only need to do work for our port; IBTF will deliver 2191 * events for other ports on the hca we have ibt_open_hca'ed 2192 * too. Note that id_port is initialized in ibd_attach() 2193 * before we do an ibt_open_hca() in ibd_attach(). 2194 */ 2195 ASSERT(state->id_hca_hdl == hca_hdl); 2196 if (state->id_port != event->ev_port) 2197 break; 2198 2199 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2200 IBT_PORT_CHANGE_PKEY) { 2201 ibd_link_mod(state, code); 2202 } 2203 break; 2204 case IBT_ERROR_PORT_DOWN: 2205 case IBT_CLNT_REREG_EVENT: 2206 case IBT_EVENT_PORT_UP: 2207 /* 2208 * Events will be delivered to all instances that have 2209 * done ibt_open_hca() but not yet done ibt_close_hca(). 2210 * Only need to do work for our port; IBTF will deliver 2211 * events for other ports on the hca we have ibt_open_hca'ed 2212 * too. Note that id_port is initialized in ibd_attach() 2213 * before we do an ibt_open_hca() in ibd_attach(). 2214 */ 2215 ASSERT(state->id_hca_hdl == hca_hdl); 2216 if (state->id_port != event->ev_port) 2217 break; 2218 2219 ibd_link_mod(state, code); 2220 break; 2221 2222 case IBT_HCA_ATTACH_EVENT: 2223 case IBT_HCA_DETACH_EVENT: 2224 /* 2225 * When a new card is plugged to the system, attach_event is 2226 * invoked. Additionally, a cfgadm needs to be run to make the 2227 * card known to the system, and an ifconfig needs to be run to 2228 * plumb up any ibd interfaces on the card. In the case of card 2229 * unplug, a cfgadm is run that will trigger any RCM scripts to 2230 * unplumb the ibd interfaces on the card; when the card is 2231 * actually unplugged, the detach_event is invoked; 2232 * additionally, if any ibd instances are still active on the 2233 * card (eg there were no associated RCM scripts), driver's 2234 * detach routine is invoked. 
2235 */ 2236 break; 2237 default: 2238 break; 2239 } 2240 } 2241 2242 static int 2243 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2244 { 2245 mac_register_t *macp; 2246 int ret; 2247 2248 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2249 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2250 return (DDI_FAILURE); 2251 } 2252 2253 /* 2254 * Note that when we register with mac during attach, we don't 2255 * have the id_macaddr yet, so we'll simply be registering a 2256 * zero macaddr that we'll overwrite later during plumb (in 2257 * ibd_m_start()). Similar is the case with id_mtu - we'll 2258 * update the mac layer with the correct mtu during plumb. 2259 */ 2260 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2261 macp->m_driver = state; 2262 macp->m_dip = dip; 2263 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2264 macp->m_callbacks = &ibd_m_callbacks; 2265 macp->m_min_sdu = 0; 2266 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2267 2268 /* 2269 * Register ourselves with the GLDv3 interface 2270 */ 2271 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2272 mac_free(macp); 2273 DPRINT(10, 2274 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2275 return (DDI_FAILURE); 2276 } 2277 2278 mac_free(macp); 2279 return (DDI_SUCCESS); 2280 } 2281 2282 static int 2283 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2284 { 2285 ibt_hca_attr_t hca_attrs; 2286 ibt_status_t ibt_status; 2287 2288 /* 2289 * Query the HCA and fetch its attributes 2290 */ 2291 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2292 ASSERT(ibt_status == IBT_SUCCESS); 2293 2294 /* 2295 * 1. Set the Hardware Checksum capability. Currently we only consider 2296 * full checksum offload. 2297 */ 2298 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2299 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2300 } 2301 2302 /* 2303 * 2. Set LSO policy, capability and maximum length 2304 */ 2305 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2306 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2307 state->id_lso_policy = B_TRUE; 2308 } else { 2309 state->id_lso_policy = B_FALSE; 2310 } 2311 2312 /* 2313 * Work-around for Bug 6866957. Ignore policy from ibd.conf. 2314 * Turn off LSO forcibly. Remove it when the work-around is no longer 2315 * needed. 2316 */ 2317 if (ibd_force_lso_disable) { 2318 state->id_lso_policy = B_FALSE; 2319 } 2320 /* End of Workaround */ 2321 2322 if (hca_attrs.hca_max_lso_size > 0) { 2323 state->id_lso_capable = B_TRUE; 2324 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2325 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2326 else 2327 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2328 } else { 2329 state->id_lso_capable = B_FALSE; 2330 state->id_lso_maxlen = 0; 2331 } 2332 2333 /* 2334 * 3. Set Reserved L_Key capability 2335 */ 2336 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2337 state->id_hca_res_lkey_capab = 1; 2338 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2339 } 2340 2341 /* 2342 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2343 * size information is provided by the hca 2344 */ 2345 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2346 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2347 } else { 2348 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2349 } 2350 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2351 state->id_max_sqseg = IBD_MAX_SQSEG; 2352 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2353 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2354 state->id_max_sqseg, IBD_MAX_SQSEG); 2355 } 2356 2357 /* 2358 * 5. Set number of recv and send wqes after checking hca maximum 2359 * channel size 2360 */ 2361 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2362 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2363 } else { 2364 state->id_num_rwqe = IBD_NUM_RWQE; 2365 } 2366 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2367 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2368 } else { 2369 state->id_num_swqe = IBD_NUM_SWQE; 2370 } 2371 2372 return (DDI_SUCCESS); 2373 } 2374 2375 static int 2376 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2377 { 2378 int instance; 2379 uint32_t progress = state->id_mac_state; 2380 ibt_status_t ret; 2381 2382 if (progress & IBD_DRV_MAC_REGISTERED) { 2383 (void) mac_unregister(state->id_mh); 2384 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2385 } 2386 2387 if (progress & IBD_DRV_PD_ALLOCD) { 2388 if ((ret = ibt_free_pd(state->id_hca_hdl, 2389 state->id_pd_hdl)) != IBT_SUCCESS) { 2390 ibd_print_warn(state, "failed to free " 2391 "protection domain, ret=%d", ret); 2392 } 2393 state->id_pd_hdl = NULL; 2394 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2395 } 2396 2397 if (progress & IBD_DRV_HCA_OPENED) { 2398 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2399 IBT_SUCCESS) { 2400 ibd_print_warn(state, "failed to close " 2401 "HCA device, ret=%d", ret); 2402 } 2403 state->id_hca_hdl = NULL; 2404 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2405 } 2406 2407 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2408 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2409 ibd_print_warn(state, 2410 "ibt_detach() failed, ret=%d", ret); 2411 } 2412 state->id_ibt_hdl = NULL; 2413 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2414 } 2415 2416 if (progress & IBD_DRV_TXINTR_ADDED) { 2417 ddi_remove_softintr(state->id_tx); 2418 state->id_tx = NULL; 2419 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2420 } 2421 2422 if (progress & IBD_DRV_RXINTR_ADDED) { 2423 ddi_remove_softintr(state->id_rx); 2424 state->id_rx = NULL; 2425 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2426 } 2427 2428 if (progress & IBD_DRV_STATE_INITIALIZED) { 2429 ibd_state_fini(state); 2430 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2431 } 2432 2433 instance = ddi_get_instance(dip); 2434 ddi_soft_state_free(ibd_list, instance); 2435 2436 return (DDI_SUCCESS); 2437 } 2438 2439 /* 2440 * Attach device to the IO framework. 
2441 */ 2442 static int 2443 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2444 { 2445 ibd_state_t *state = NULL; 2446 ib_guid_t hca_guid; 2447 int instance; 2448 ibt_status_t ret; 2449 int rv; 2450 2451 /* 2452 * IBD doesn't support suspend/resume 2453 */ 2454 if (cmd != DDI_ATTACH) 2455 return (DDI_FAILURE); 2456 2457 /* 2458 * Allocate softstate structure 2459 */ 2460 instance = ddi_get_instance(dip); 2461 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2462 return (DDI_FAILURE); 2463 state = ddi_get_soft_state(ibd_list, instance); 2464 2465 /* 2466 * Initialize mutexes and condition variables 2467 */ 2468 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2469 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2470 goto attach_fail; 2471 } 2472 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2473 2474 /* 2475 * Allocate rx,tx softintr 2476 */ 2477 if (ibd_rx_softintr == 1) { 2478 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2479 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2480 DPRINT(10, "ibd_attach: failed in " 2481 "ddi_add_softintr(id_rx), ret=%d", rv); 2482 goto attach_fail; 2483 } 2484 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2485 } 2486 if (ibd_tx_softintr == 1) { 2487 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2488 NULL, NULL, ibd_tx_recycle, 2489 (caddr_t)state)) != DDI_SUCCESS) { 2490 DPRINT(10, "ibd_attach: failed in " 2491 "ddi_add_softintr(id_tx), ret=%d", rv); 2492 goto attach_fail; 2493 } 2494 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2495 } 2496 2497 /* 2498 * Obtain IBA P_Key, port number and HCA guid and validate 2499 * them (for P_Key, only full members are allowed as per 2500 * IPoIB specification; neither port number nor HCA guid 2501 * can be zero) 2502 */ 2503 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2504 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2505 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2506 state->id_pkey); 2507 goto attach_fail; 2508 } 2509 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2510 "port-number", 0)) == 0) { 2511 DPRINT(10, "ibd_attach: invalid port number (%d)", 2512 state->id_port); 2513 goto attach_fail; 2514 } 2515 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2516 "hca-guid", 0)) == 0) { 2517 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2518 hca_guid); 2519 goto attach_fail; 2520 } 2521 2522 /* 2523 * Attach to IBTL 2524 */ 2525 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2526 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2527 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2528 goto attach_fail; 2529 } 2530 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2531 2532 /* 2533 * Open the HCA 2534 */ 2535 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2536 &state->id_hca_hdl)) != IBT_SUCCESS) { 2537 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2538 goto attach_fail; 2539 } 2540 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2541 2542 /* 2543 * Record capabilities 2544 */ 2545 (void) ibd_record_capab(state, dip); 2546 2547 /* 2548 * Allocate a protection domain on the HCA 2549 */ 2550 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2551 &state->id_pd_hdl)) != IBT_SUCCESS) { 2552 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2553 goto attach_fail; 2554 } 2555 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2556 2557 2558 /* 2559 * Register ibd interfaces with the Nemo framework 2560 */ 2561 if 
(ibd_register_mac(state, dip) != IBT_SUCCESS) { 2562 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2563 goto attach_fail; 2564 } 2565 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2566 2567 /* 2568 * We're done with everything we could to make the attach 2569 * succeed. All the buffer allocations and IPoIB broadcast 2570 * group joins are deferred to when the interface instance 2571 * is actually plumbed to avoid wasting memory. 2572 */ 2573 return (DDI_SUCCESS); 2574 2575 attach_fail: 2576 (void) ibd_unattach(state, dip); 2577 return (DDI_FAILURE); 2578 } 2579 2580 /* 2581 * Detach device from the IO framework. 2582 */ 2583 static int 2584 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2585 { 2586 ibd_state_t *state; 2587 int instance; 2588 2589 /* 2590 * IBD doesn't support suspend/resume 2591 */ 2592 if (cmd != DDI_DETACH) 2593 return (DDI_FAILURE); 2594 2595 /* 2596 * Get the instance softstate 2597 */ 2598 instance = ddi_get_instance(dip); 2599 state = ddi_get_soft_state(ibd_list, instance); 2600 2601 /* 2602 * Release all resources we're holding still. Note that if we'd 2603 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2604 * so far, we should find all the flags we need in id_mac_state. 2605 */ 2606 (void) ibd_unattach(state, dip); 2607 2608 return (DDI_SUCCESS); 2609 } 2610 2611 /* 2612 * Pre ibt_attach() driver initialization 2613 */ 2614 static int 2615 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2616 { 2617 char buf[64]; 2618 2619 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2620 state->id_link_state = LINK_STATE_UNKNOWN; 2621 2622 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2623 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2624 state->id_trap_stop = B_TRUE; 2625 state->id_trap_inprog = 0; 2626 2627 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2628 state->id_dip = dip; 2629 2630 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2631 2632 state->id_tx_list.dl_head = NULL; 2633 state->id_tx_list.dl_tail = NULL; 2634 state->id_tx_list.dl_pending_sends = B_FALSE; 2635 state->id_tx_list.dl_cnt = 0; 2636 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2637 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2638 state->id_tx_busy = 0; 2639 2640 state->id_rx_list.dl_head = NULL; 2641 state->id_rx_list.dl_tail = NULL; 2642 state->id_rx_list.dl_bufs_outstanding = 0; 2643 state->id_rx_list.dl_cnt = 0; 2644 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2645 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2646 2647 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2648 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2649 0, NULL, NULL, NULL, NULL, NULL, 0); 2650 2651 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2652 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2653 2654 return (DDI_SUCCESS); 2655 } 2656 2657 /* 2658 * Post ibt_detach() driver deconstruction 2659 */ 2660 static void 2661 ibd_state_fini(ibd_state_t *state) 2662 { 2663 cv_destroy(&state->id_macst_cv); 2664 mutex_destroy(&state->id_macst_lock); 2665 2666 kmem_cache_destroy(state->id_req_kmc); 2667 2668 mutex_destroy(&state->id_rxpost_lock); 2669 mutex_destroy(&state->id_rx_list.dl_mutex); 2670 2671 mutex_destroy(&state->id_txpost_lock); 2672 mutex_destroy(&state->id_tx_list.dl_mutex); 2673 2674 mutex_destroy(&state->id_sched_lock); 2675 mutex_destroy(&state->id_cq_poll_lock); 2676 2677 
cv_destroy(&state->id_trap_cv); 2678 mutex_destroy(&state->id_trap_lock); 2679 mutex_destroy(&state->id_link_mutex); 2680 } 2681 2682 /* 2683 * Fetch link speed from SA for snmp ifspeed reporting. 2684 */ 2685 static uint64_t 2686 ibd_get_portspeed(ibd_state_t *state) 2687 { 2688 int ret; 2689 ibt_path_info_t path; 2690 ibt_path_attr_t path_attr; 2691 uint8_t num_paths; 2692 uint64_t ifspeed; 2693 2694 /* 2695 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2696 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2697 * 2000000000. Start with that as default. 2698 */ 2699 ifspeed = 2000000000; 2700 2701 bzero(&path_attr, sizeof (path_attr)); 2702 2703 /* 2704 * Get the port speed from Loopback path information. 2705 */ 2706 path_attr.pa_dgids = &state->id_sgid; 2707 path_attr.pa_num_dgids = 1; 2708 path_attr.pa_sgid = state->id_sgid; 2709 2710 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2711 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2712 goto earlydone; 2713 2714 if (num_paths < 1) 2715 goto earlydone; 2716 2717 /* 2718 * In case SA does not return an expected value, report the default 2719 * speed as 1X. 2720 */ 2721 ret = 1; 2722 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2723 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2724 ret = 1; 2725 break; 2726 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2727 ret = 4; 2728 break; 2729 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2730 ret = 12; 2731 break; 2732 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2733 ret = 2; 2734 break; 2735 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2736 ret = 8; 2737 break; 2738 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2739 ret = 16; 2740 break; 2741 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2742 ret = 24; 2743 break; 2744 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2745 ret = 32; 2746 break; 2747 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2748 ret = 48; 2749 break; 2750 } 2751 2752 ifspeed *= ret; 2753 2754 earlydone: 2755 return (ifspeed); 2756 } 2757 2758 /* 2759 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2760 * representing the input mcg mgid. 2761 */ 2762 static ibd_mce_t * 2763 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2764 { 2765 ibd_mce_t *ptr = list_head(mlist); 2766 2767 /* 2768 * Do plain linear search. 2769 */ 2770 while (ptr != NULL) { 2771 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2772 sizeof (ib_gid_t)) == 0) 2773 return (ptr); 2774 ptr = list_next(mlist, ptr); 2775 } 2776 return (NULL); 2777 } 2778 2779 /* 2780 * Execute IBA JOIN. 2781 */ 2782 static ibt_status_t 2783 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2784 { 2785 ibt_mcg_attr_t mcg_attr; 2786 2787 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2788 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2789 mcg_attr.mc_mgid = mgid; 2790 mcg_attr.mc_join_state = mce->mc_jstate; 2791 mcg_attr.mc_scope = state->id_scope; 2792 mcg_attr.mc_pkey = state->id_pkey; 2793 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2794 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2795 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2796 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2797 NULL, NULL)); 2798 } 2799 2800 /* 2801 * This code JOINs the port in the proper way (depending on the join 2802 * state) so that IBA fabric will forward mcg packets to/from the port. 2803 * It also attaches the QPN to the mcg so it can receive those mcg 2804 * packets. 
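 * (The join itself is a subnet administration operation, issued via
 * ibt_join_mcg() from ibd_iba_join(); the attach is a separate verbs
 * operation, ibt_attach_mcg(), on the UD channel.)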
This code makes sure not to attach the mcg to the QP if 2805 * that has been previously done due to the mcg being joined with a 2806 * different join state, even though this is not required by SWG_0216, 2807 * refid 3610. 2808 */ 2809 static ibd_mce_t * 2810 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2811 { 2812 ibt_status_t ibt_status; 2813 ibd_mce_t *mce, *tmce, *omce = NULL; 2814 boolean_t do_attach = B_TRUE; 2815 2816 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2817 jstate, mgid.gid_prefix, mgid.gid_guid); 2818 2819 /* 2820 * For enable_multicast Full member joins, we need to do some 2821 * extra work. If there is already an mce on the list that 2822 * indicates full membership, that means the membership has 2823 * not yet been dropped (since the disable_multicast was issued) 2824 * because there are pending Tx's to the mcg; in that case, just 2825 * mark the mce not to be reaped when the Tx completion queues 2826 * an async reap operation. 2827 * 2828 * If there is already an mce on the list indicating sendonly 2829 * membership, try to promote to full membership. Be careful 2830 * not to deallocate the old mce, since there might be an AH 2831 * pointing to it; instead, update the old mce with new data 2832 * that tracks the full membership. 2833 */ 2834 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2835 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2836 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2837 ASSERT(omce->mc_fullreap); 2838 omce->mc_fullreap = B_FALSE; 2839 return (omce); 2840 } else { 2841 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2842 } 2843 } 2844 2845 /* 2846 * Allocate the ibd_mce_t to track this JOIN. 2847 */ 2848 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2849 mce->mc_fullreap = B_FALSE; 2850 mce->mc_jstate = jstate; 2851 2852 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2853 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2854 ibt_status); 2855 kmem_free(mce, sizeof (ibd_mce_t)); 2856 return (NULL); 2857 } 2858 2859 /* 2860 * Is an IBA attach required? Not if the interface is already joined 2861 * to the mcg in a different appropriate join state. 2862 */ 2863 if (jstate == IB_MC_JSTATE_NON) { 2864 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2865 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2866 do_attach = B_FALSE; 2867 } else if (jstate == IB_MC_JSTATE_FULL) { 2868 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2869 do_attach = B_FALSE; 2870 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2871 do_attach = B_FALSE; 2872 } 2873 2874 if (do_attach) { 2875 /* 2876 * Do the IBA attach. 2877 */ 2878 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2879 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2880 &mce->mc_info)) != IBT_SUCCESS) { 2881 DPRINT(10, "ibd_join_group : failed qp attachment " 2882 "%d\n", ibt_status); 2883 /* 2884 * NOTE that we should probably preserve the join info 2885 * in the list and later try to leave again at detach 2886 * time. 2887 */ 2888 (void) ibt_leave_mcg(state->id_sgid, mgid, 2889 state->id_sgid, jstate); 2890 kmem_free(mce, sizeof (ibd_mce_t)); 2891 return (NULL); 2892 } 2893 } 2894 2895 /* 2896 * Insert the ibd_mce_t in the proper list. 2897 */ 2898 if (jstate == IB_MC_JSTATE_NON) { 2899 IBD_MCACHE_INSERT_NON(state, mce); 2900 } else { 2901 /* 2902 * Set up the mc_req fields used for reaping the 2903 * mcg in case of delayed tx completion (see 2904 * ibd_tx_cleanup()). 
Also done for sendonly join in 2905 * case we are promoted to fullmembership later and 2906 * keep using the same mce. 2907 */ 2908 mce->mc_req.rq_gid = mgid; 2909 mce->mc_req.rq_ptr = mce; 2910 /* 2911 * Check whether this is the case of trying to join 2912 * full member, and we were already joined send only. 2913 * We try to drop our SendOnly membership, but it is 2914 * possible that the mcg does not exist anymore (and 2915 * the subnet trap never reached us), so the leave 2916 * operation might fail. 2917 */ 2918 if (omce != NULL) { 2919 (void) ibt_leave_mcg(state->id_sgid, mgid, 2920 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2921 omce->mc_jstate = IB_MC_JSTATE_FULL; 2922 bcopy(&mce->mc_info, &omce->mc_info, 2923 sizeof (ibt_mcg_info_t)); 2924 kmem_free(mce, sizeof (ibd_mce_t)); 2925 return (omce); 2926 } 2927 mutex_enter(&state->id_mc_mutex); 2928 IBD_MCACHE_INSERT_FULL(state, mce); 2929 mutex_exit(&state->id_mc_mutex); 2930 } 2931 2932 return (mce); 2933 } 2934 2935 /* 2936 * Called during port up event handling to attempt to reacquire full 2937 * membership to an mcg. Stripped down version of ibd_join_group(). 2938 * Note that it is possible that the mcg might have gone away, and 2939 * gets recreated at this point. 2940 */ 2941 static void 2942 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2943 { 2944 ib_gid_t mgid; 2945 2946 /* 2947 * If the mc_fullreap flag is set, or this join fails, a subsequent 2948 * reap/leave is going to try to leave the group. We could prevent 2949 * that by adding a boolean flag into ibd_mce_t, if required. 2950 */ 2951 if (mce->mc_fullreap) 2952 return; 2953 2954 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2955 2956 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2957 mgid.gid_guid); 2958 2959 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2960 ibd_print_warn(state, "Failure on port up to rejoin " 2961 "multicast gid %016llx:%016llx", 2962 (u_longlong_t)mgid.gid_prefix, 2963 (u_longlong_t)mgid.gid_guid); 2964 } 2965 2966 /* 2967 * This code handles delayed Tx completion cleanups for mcg's to which 2968 * disable_multicast has been issued, regular mcg related cleanups during 2969 * disable_multicast, disable_promiscuous and mcg traps, as well as 2970 * cleanups during driver detach time. Depending on the join state, 2971 * it deletes the mce from the appropriate list and issues the IBA 2972 * leave/detach; except in the disable_multicast case when the mce 2973 * is left on the active list for a subsequent Tx completion cleanup. 2974 */ 2975 static void 2976 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2977 uint8_t jstate) 2978 { 2979 ibd_mce_t *tmce; 2980 boolean_t do_detach = B_TRUE; 2981 2982 /* 2983 * Before detaching, we must check whether the other list 2984 * contains the mcg; if we detach blindly, the consumer 2985 * who set up the other list will also stop receiving 2986 * traffic. 2987 */ 2988 if (jstate == IB_MC_JSTATE_FULL) { 2989 /* 2990 * The following check is only relevant while coming 2991 * from the Tx completion path in the reap case.
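 * If mc_fullreap was never set, or was cleared again because
 * ibd_join_group() re-enabled the full membership before the
 * pending Tx's drained, the deferred reap request is stale and
 * we simply return.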
2992 */ 2993 if (!mce->mc_fullreap) 2994 return; 2995 mutex_enter(&state->id_mc_mutex); 2996 IBD_MCACHE_PULLOUT_FULL(state, mce); 2997 mutex_exit(&state->id_mc_mutex); 2998 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2999 do_detach = B_FALSE; 3000 } else if (jstate == IB_MC_JSTATE_NON) { 3001 IBD_MCACHE_PULLOUT_NON(state, mce); 3002 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3003 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3004 do_detach = B_FALSE; 3005 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3006 mutex_enter(&state->id_mc_mutex); 3007 IBD_MCACHE_PULLOUT_FULL(state, mce); 3008 mutex_exit(&state->id_mc_mutex); 3009 do_detach = B_FALSE; 3010 } 3011 3012 /* 3013 * If we are reacting to a mcg trap and leaving our sendonly or 3014 * non membership, the mcg is possibly already gone, so attempting 3015 * to leave might fail. On the other hand, we must try to leave 3016 * anyway, since this might be a trap from long ago, and we could 3017 * have potentially sendonly joined to a recent incarnation of 3018 * the mcg and are about to lose track of this information. 3019 */ 3020 if (do_detach) { 3021 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3022 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3023 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3024 } 3025 3026 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3027 kmem_free(mce, sizeof (ibd_mce_t)); 3028 } 3029 3030 /* 3031 * Async code executed due to multicast and promiscuous disable requests 3032 * and mcg trap handling; also executed during driver detach. Mostly, a 3033 * leave and detach is done; except for the fullmember case when Tx 3034 * requests are pending, whence arrangements are made for subsequent 3035 * cleanup on Tx completion. 3036 */ 3037 static void 3038 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3039 { 3040 ipoib_mac_t mcmac; 3041 boolean_t recycled; 3042 ibd_mce_t *mce; 3043 3044 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3045 jstate, mgid.gid_prefix, mgid.gid_guid); 3046 3047 if (jstate == IB_MC_JSTATE_NON) { 3048 recycled = B_TRUE; 3049 mce = IBD_MCACHE_FIND_NON(state, mgid); 3050 /* 3051 * In case we are handling a mcg trap, we might not find 3052 * the mcg in the non list. 3053 */ 3054 if (mce == NULL) { 3055 return; 3056 } 3057 } else { 3058 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3059 3060 /* 3061 * In case we are handling a mcg trap, make sure the trap 3062 * is not arriving late; if we have an mce that indicates 3063 * that we are already a fullmember, that would be a clear 3064 * indication that the trap arrived late (ie, is for a 3065 * previous incarnation of the mcg). 3066 */ 3067 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3068 if ((mce == NULL) || (mce->mc_jstate == 3069 IB_MC_JSTATE_FULL)) { 3070 return; 3071 } 3072 } else { 3073 ASSERT(jstate == IB_MC_JSTATE_FULL); 3074 3075 /* 3076 * If join group failed, mce will be NULL here. 3077 * This is because in a GLDv3 driver, set multicast 3078 * will always return success. 3079 */ 3080 if (mce == NULL) { 3081 return; 3082 } 3083 3084 mce->mc_fullreap = B_TRUE; 3085 } 3086 3087 /* 3088 * If no pending Tx's remain that reference the AH 3089 * for the mcg, recycle it from active to free list.
3090 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3091 * so the last completing Tx will cause an async reap 3092 * operation to be invoked, at which time we will drop our 3093 * membership to the mcg so that the pending Tx's complete 3094 * successfully. Refer to comments on "AH and MCE active 3095 * list manipulation" at top of this file. The lock protects 3096 * against Tx fast path and Tx cleanup code. 3097 */ 3098 mutex_enter(&state->id_ac_mutex); 3099 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3100 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3101 IB_MC_JSTATE_SEND_ONLY_NON)); 3102 mutex_exit(&state->id_ac_mutex); 3103 } 3104 3105 if (recycled) { 3106 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3107 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3108 ibd_async_reap_group(state, mce, mgid, jstate); 3109 } 3110 } 3111 3112 /* 3113 * Find the broadcast address as defined by IPoIB; implicitly 3114 * determines the IBA scope, mtu, tclass etc of the link the 3115 * interface is going to be a member of. 3116 */ 3117 static ibt_status_t 3118 ibd_find_bgroup(ibd_state_t *state) 3119 { 3120 ibt_mcg_attr_t mcg_attr; 3121 uint_t numg; 3122 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3123 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3124 IB_MC_SCOPE_GLOBAL }; 3125 int i, mcgmtu; 3126 boolean_t found = B_FALSE; 3127 int ret; 3128 ibt_mcg_info_t mcg_info; 3129 3130 state->id_bgroup_created = B_FALSE; 3131 3132 query_bcast_grp: 3133 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3134 mcg_attr.mc_pkey = state->id_pkey; 3135 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3136 3137 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3138 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3139 3140 /* 3141 * Look for the IPoIB broadcast group. 3142 */ 3143 state->id_mgid.gid_prefix = 3144 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3145 ((uint64_t)state->id_scope << 48) | 3146 ((uint32_t)(state->id_pkey << 16))); 3147 mcg_attr.mc_mgid = state->id_mgid; 3148 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3149 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3150 found = B_TRUE; 3151 break; 3152 } 3153 } 3154 3155 if (!found) { 3156 if (ibd_create_broadcast_group) { 3157 /* 3158 * If we created the broadcast group, but failed to 3159 * find it, we can't do anything except leave the 3160 * one we created and return failure. 3161 */ 3162 if (state->id_bgroup_created) { 3163 ibd_print_warn(state, "IPoIB broadcast group " 3164 "absent. 
Unable to query after create."); 3165 goto find_bgroup_fail; 3166 } 3167 3168 /* 3169 * Create the ipoib broadcast group if it didn't exist 3170 */ 3171 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3172 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3173 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3174 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3175 mcg_attr.mc_pkey = state->id_pkey; 3176 mcg_attr.mc_flow = 0; 3177 mcg_attr.mc_sl = 0; 3178 mcg_attr.mc_tclass = 0; 3179 state->id_mgid.gid_prefix = 3180 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3181 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3182 ((uint32_t)(state->id_pkey << 16))); 3183 mcg_attr.mc_mgid = state->id_mgid; 3184 3185 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3186 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3187 ibd_print_warn(state, "IPoIB broadcast group " 3188 "absent, create failed: ret = %d\n", ret); 3189 state->id_bgroup_created = B_FALSE; 3190 return (IBT_FAILURE); 3191 } 3192 state->id_bgroup_created = B_TRUE; 3193 goto query_bcast_grp; 3194 } else { 3195 ibd_print_warn(state, "IPoIB broadcast group absent"); 3196 return (IBT_FAILURE); 3197 } 3198 } 3199 3200 /* 3201 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3202 */ 3203 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3204 if (state->id_mtu < mcgmtu) { 3205 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3206 "greater than port's maximum MTU %d", mcgmtu, 3207 state->id_mtu); 3208 ibt_free_mcg_info(state->id_mcinfo, 1); 3209 goto find_bgroup_fail; 3210 } 3211 state->id_mtu = mcgmtu; 3212 3213 return (IBT_SUCCESS); 3214 3215 find_bgroup_fail: 3216 if (state->id_bgroup_created) { 3217 (void) ibt_leave_mcg(state->id_sgid, 3218 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3219 IB_MC_JSTATE_FULL); 3220 } 3221 3222 return (IBT_FAILURE); 3223 } 3224 3225 static int 3226 ibd_alloc_tx_copybufs(ibd_state_t *state) 3227 { 3228 ibt_mr_attr_t mem_attr; 3229 3230 /* 3231 * Allocate one big chunk for all regular tx copy bufs 3232 */ 3233 state->id_tx_buf_sz = state->id_mtu; 3234 if (state->id_lso_policy && state->id_lso_capable && 3235 (IBD_TX_BUF_SZ > state->id_mtu)) { 3236 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3237 } 3238 3239 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3240 state->id_tx_buf_sz, KM_SLEEP); 3241 3242 /* 3243 * Do one memory registration on the entire txbuf area 3244 */ 3245 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3246 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3247 mem_attr.mr_as = NULL; 3248 mem_attr.mr_flags = IBT_MR_SLEEP; 3249 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3250 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3251 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3252 kmem_free(state->id_tx_bufs, 3253 state->id_num_swqe * state->id_tx_buf_sz); 3254 state->id_tx_bufs = NULL; 3255 return (DDI_FAILURE); 3256 } 3257 3258 return (DDI_SUCCESS); 3259 } 3260 3261 static int 3262 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3263 { 3264 ibt_mr_attr_t mem_attr; 3265 ibd_lsobuf_t *buflist; 3266 ibd_lsobuf_t *lbufp; 3267 ibd_lsobuf_t *tail; 3268 ibd_lsobkt_t *bktp; 3269 uint8_t *membase; 3270 uint8_t *memp; 3271 uint_t memsz; 3272 int i; 3273 3274 /* 3275 * Allocate the lso bucket 3276 */ 3277 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3278 3279 /* 3280 * Allocate the entire lso memory and register it 3281 */ 3282 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3283 membase = kmem_zalloc(memsz, KM_SLEEP); 3284 3285 mem_attr.mr_vaddr = 
(uint64_t)(uintptr_t)membase; 3286 mem_attr.mr_len = memsz; 3287 mem_attr.mr_as = NULL; 3288 mem_attr.mr_flags = IBT_MR_SLEEP; 3289 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3290 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3291 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3292 kmem_free(membase, memsz); 3293 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3294 return (DDI_FAILURE); 3295 } 3296 3297 /* 3298 * Now allocate the buflist. Note that the elements in the buflist and 3299 * the buffers in the lso memory have a permanent 1-1 relation, so we 3300 * can always derive the address of a buflist entry from the address of 3301 * an lso buffer. 3302 */ 3303 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3304 KM_SLEEP); 3305 3306 /* 3307 * Set up the lso buf chain 3308 */ 3309 memp = membase; 3310 lbufp = buflist; 3311 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3312 lbufp->lb_isfree = 1; 3313 lbufp->lb_buf = memp; 3314 lbufp->lb_next = lbufp + 1; 3315 3316 tail = lbufp; 3317 3318 memp += IBD_LSO_BUFSZ; 3319 lbufp++; 3320 } 3321 tail->lb_next = NULL; 3322 3323 /* 3324 * Set up the LSO buffer information in ibd state 3325 */ 3326 bktp->bkt_bufl = buflist; 3327 bktp->bkt_free_head = buflist; 3328 bktp->bkt_mem = membase; 3329 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3330 bktp->bkt_nfree = bktp->bkt_nelem; 3331 3332 state->id_lso = bktp; 3333 3334 return (DDI_SUCCESS); 3335 } 3336 3337 /* 3338 * Statically allocate Tx buffer list(s). 3339 */ 3340 static int 3341 ibd_init_txlist(ibd_state_t *state) 3342 { 3343 ibd_swqe_t *swqe; 3344 ibt_lkey_t lkey; 3345 int i; 3346 3347 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3348 return (DDI_FAILURE); 3349 3350 if (state->id_lso_policy && state->id_lso_capable) { 3351 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3352 state->id_lso_policy = B_FALSE; 3353 } 3354 3355 /* 3356 * Allocate and setup the swqe list 3357 */ 3358 lkey = state->id_tx_mr_desc.md_lkey; 3359 for (i = 0; i < state->id_num_swqe; i++) { 3360 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3361 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3362 ibd_fini_txlist(state); 3363 return (DDI_FAILURE); 3364 } 3365 3366 /* add to list */ 3367 state->id_tx_list.dl_cnt++; 3368 if (state->id_tx_list.dl_head == NULL) { 3369 swqe->swqe_prev = NULL; 3370 swqe->swqe_next = NULL; 3371 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3372 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3373 } else { 3374 swqe->swqe_prev = state->id_tx_list.dl_tail; 3375 swqe->swqe_next = NULL; 3376 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3377 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3378 } 3379 } 3380 3381 return (DDI_SUCCESS); 3382 } 3383 3384 static int 3385 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3386 uint32_t *nds_p) 3387 { 3388 ibd_lsobkt_t *bktp; 3389 ibd_lsobuf_t *lbufp; 3390 ibd_lsobuf_t *nextp; 3391 ibt_lkey_t lso_lkey; 3392 uint_t frag_sz; 3393 uint_t num_needed; 3394 int i; 3395 3396 ASSERT(sgl_p != NULL); 3397 ASSERT(nds_p != NULL); 3398 ASSERT(req_sz != 0); 3399 3400 /* 3401 * Determine how many bufs we'd need for the size requested 3402 */ 3403 num_needed = req_sz / IBD_LSO_BUFSZ; 3404 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3405 num_needed++; 3406 3407 mutex_enter(&state->id_lso_lock); 3408 3409 /* 3410 * If we don't have enough lso bufs, return failure 3411 */ 3412 ASSERT(state->id_lso != NULL); 3413 bktp = state->id_lso; 3414 if (bktp->bkt_nfree < num_needed) { 3415 
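		/*
		 * Not enough free lso bufs to satisfy this request, so
		 * fail it. As an illustration (using a purely hypothetical
		 * IBD_LSO_BUFSZ of 64KB), a req_sz of 70000 bytes gives
		 * num_needed = 2 and frag_sz = 4464, so at least two
		 * buffers would have to be free at this point.
		 */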
mutex_exit(&state->id_lso_lock); 3416 return (-1); 3417 } 3418 3419 /* 3420 * Pick the first 'num_needed' bufs from the free list 3421 */ 3422 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3423 lbufp = bktp->bkt_free_head; 3424 for (i = 0; i < num_needed; i++) { 3425 ASSERT(lbufp->lb_isfree != 0); 3426 ASSERT(lbufp->lb_buf != NULL); 3427 3428 nextp = lbufp->lb_next; 3429 3430 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3431 sgl_p[i].ds_key = lso_lkey; 3432 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3433 3434 lbufp->lb_isfree = 0; 3435 lbufp->lb_next = NULL; 3436 3437 lbufp = nextp; 3438 } 3439 bktp->bkt_free_head = lbufp; 3440 3441 /* 3442 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3443 * to adjust the last sgl entry's length. Since we know we need at least 3444 * one, the i-1 use below is ok. 3445 */ 3446 if (frag_sz) { 3447 sgl_p[i-1].ds_len = frag_sz; 3448 } 3449 3450 /* 3451 * Update nfree count and return 3452 */ 3453 bktp->bkt_nfree -= num_needed; 3454 3455 mutex_exit(&state->id_lso_lock); 3456 3457 *nds_p = num_needed; 3458 3459 return (0); 3460 } 3461 3462 static void 3463 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3464 { 3465 ibd_lsobkt_t *bktp; 3466 ibd_lsobuf_t *lbufp; 3467 uint8_t *lso_mem_end; 3468 uint_t ndx; 3469 int i; 3470 3471 mutex_enter(&state->id_lso_lock); 3472 3473 bktp = state->id_lso; 3474 ASSERT(bktp != NULL); 3475 3476 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3477 for (i = 0; i < nds; i++) { 3478 uint8_t *va; 3479 3480 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3481 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3482 3483 /* 3484 * Figure out the buflist element this sgl buffer corresponds 3485 * to and put it back at the head 3486 */ 3487 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3488 lbufp = bktp->bkt_bufl + ndx; 3489 3490 ASSERT(lbufp->lb_isfree == 0); 3491 ASSERT(lbufp->lb_buf == va); 3492 3493 lbufp->lb_isfree = 1; 3494 lbufp->lb_next = bktp->bkt_free_head; 3495 bktp->bkt_free_head = lbufp; 3496 } 3497 bktp->bkt_nfree += nds; 3498 3499 mutex_exit(&state->id_lso_lock); 3500 } 3501 3502 static void 3503 ibd_free_tx_copybufs(ibd_state_t *state) 3504 { 3505 /* 3506 * Unregister txbuf mr 3507 */ 3508 if (ibt_deregister_mr(state->id_hca_hdl, 3509 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3510 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3511 } 3512 state->id_tx_mr_hdl = NULL; 3513 3514 /* 3515 * Free txbuf memory 3516 */ 3517 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3518 state->id_tx_bufs = NULL; 3519 } 3520 3521 static void 3522 ibd_free_tx_lsobufs(ibd_state_t *state) 3523 { 3524 ibd_lsobkt_t *bktp; 3525 3526 mutex_enter(&state->id_lso_lock); 3527 3528 if ((bktp = state->id_lso) == NULL) { 3529 mutex_exit(&state->id_lso_lock); 3530 return; 3531 } 3532 3533 /* 3534 * First, free the buflist 3535 */ 3536 ASSERT(bktp->bkt_bufl != NULL); 3537 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3538 3539 /* 3540 * Unregister the LSO memory and free it 3541 */ 3542 ASSERT(bktp->bkt_mr_hdl != NULL); 3543 if (ibt_deregister_mr(state->id_hca_hdl, 3544 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3545 DPRINT(10, 3546 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3547 } 3548 ASSERT(bktp->bkt_mem); 3549 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3550 3551 /* 3552 * Finally free the bucket 3553 */ 3554 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3555 state->id_lso = NULL; 3556 3557 mutex_exit(&state->id_lso_lock); 3558 } 3559 3560 /* 3561
* Free the statically allocated Tx buffer list. 3562 */ 3563 static void 3564 ibd_fini_txlist(ibd_state_t *state) 3565 { 3566 ibd_swqe_t *node; 3567 3568 /* 3569 * Free the allocated swqes 3570 */ 3571 mutex_enter(&state->id_tx_list.dl_mutex); 3572 while (state->id_tx_list.dl_head != NULL) { 3573 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3574 state->id_tx_list.dl_head = node->swqe_next; 3575 ASSERT(state->id_tx_list.dl_cnt > 0); 3576 state->id_tx_list.dl_cnt--; 3577 ibd_free_swqe(state, node); 3578 } 3579 mutex_exit(&state->id_tx_list.dl_mutex); 3580 3581 ibd_free_tx_lsobufs(state); 3582 ibd_free_tx_copybufs(state); 3583 } 3584 3585 /* 3586 * Allocate a single send wqe and register it so it is almost 3587 * ready to be posted to the hardware. 3588 */ 3589 static int 3590 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3591 { 3592 ibd_swqe_t *swqe; 3593 3594 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3595 *wqe = swqe; 3596 3597 swqe->swqe_type = IBD_WQE_SEND; 3598 swqe->swqe_next = NULL; 3599 swqe->swqe_prev = NULL; 3600 swqe->swqe_im_mblk = NULL; 3601 3602 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3603 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3604 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3605 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3606 3607 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3608 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3609 swqe->w_swr.wr_trans = IBT_UD_SRV; 3610 3611 /* These are set in send */ 3612 swqe->w_swr.wr_nds = 0; 3613 swqe->w_swr.wr_sgl = NULL; 3614 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3615 3616 return (DDI_SUCCESS); 3617 } 3618 3619 /* 3620 * Free an allocated send wqe. 3621 */ 3622 /*ARGSUSED*/ 3623 static void 3624 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3625 { 3626 kmem_free(swqe, sizeof (ibd_swqe_t)); 3627 } 3628 3629 /* 3630 * Post a rwqe to the hardware and add it to the Rx list. The 3631 * "recycle" parameter indicates whether an old rwqe is being 3632 * recycled, or this is a new one. 3633 */ 3634 static int 3635 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3636 { 3637 ibt_status_t ibt_status; 3638 3639 if (recycle == B_FALSE) { 3640 mutex_enter(&state->id_rx_list.dl_mutex); 3641 if (state->id_rx_list.dl_head == NULL) { 3642 rwqe->rwqe_prev = NULL; 3643 rwqe->rwqe_next = NULL; 3644 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3645 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3646 } else { 3647 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3648 rwqe->rwqe_next = NULL; 3649 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3650 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3651 } 3652 mutex_exit(&state->id_rx_list.dl_mutex); 3653 } 3654 3655 mutex_enter(&state->id_rxpost_lock); 3656 if (state->id_rx_busy) { 3657 rwqe->w_post_link = NULL; 3658 if (state->id_rx_head) 3659 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3660 else 3661 state->id_rx_head = rwqe; 3662 state->id_rx_tailp = &(rwqe->w_post_link); 3663 } else { 3664 state->id_rx_busy = 1; 3665 do { 3666 mutex_exit(&state->id_rxpost_lock); 3667 3668 /* 3669 * Here we should add dl_cnt before post recv, because 3670 * we would have to make sure dl_cnt is updated before 3671 * the corresponding ibd_process_rx() is called. 
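 * If the increment were done only after ibt_post_recv(), a completion
 * that fires immediately could find a stale dl_cnt; on a posting
 * failure the increment is simply backed out with atomic_add_32_nv().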
3672 */ 3673 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3674 3675 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3676 &rwqe->w_rwr, 1, NULL); 3677 if (ibt_status != IBT_SUCCESS) { 3678 (void) atomic_add_32_nv( 3679 &state->id_rx_list.dl_cnt, -1); 3680 ibd_print_warn(state, "ibd_post_recv: " 3681 "posting failed, ret=%d", ibt_status); 3682 return (DDI_FAILURE); 3683 } 3684 3685 mutex_enter(&state->id_rxpost_lock); 3686 rwqe = state->id_rx_head; 3687 if (rwqe) { 3688 state->id_rx_head = 3689 (ibd_rwqe_t *)(rwqe->w_post_link); 3690 } 3691 } while (rwqe); 3692 state->id_rx_busy = 0; 3693 } 3694 mutex_exit(&state->id_rxpost_lock); 3695 3696 return (DDI_SUCCESS); 3697 } 3698 3699 /* 3700 * Allocate the statically allocated Rx buffer list. 3701 */ 3702 static int 3703 ibd_init_rxlist(ibd_state_t *state) 3704 { 3705 ibd_rwqe_t *rwqe; 3706 int i; 3707 3708 for (i = 0; i < state->id_num_rwqe; i++) { 3709 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3710 ibd_fini_rxlist(state); 3711 return (DDI_FAILURE); 3712 } 3713 3714 if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { 3715 ibd_free_rwqe(state, rwqe); 3716 ibd_fini_rxlist(state); 3717 return (DDI_FAILURE); 3718 } 3719 } 3720 3721 return (DDI_SUCCESS); 3722 } 3723 3724 /* 3725 * Free the statically allocated Rx buffer list. 3726 * 3727 */ 3728 static void 3729 ibd_fini_rxlist(ibd_state_t *state) 3730 { 3731 ibd_rwqe_t *node; 3732 3733 mutex_enter(&state->id_rx_list.dl_mutex); 3734 while (state->id_rx_list.dl_head != NULL) { 3735 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3736 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3737 ASSERT(state->id_rx_list.dl_cnt > 0); 3738 state->id_rx_list.dl_cnt--; 3739 3740 ibd_free_rwqe(state, node); 3741 } 3742 mutex_exit(&state->id_rx_list.dl_mutex); 3743 } 3744 3745 /* 3746 * Allocate a single recv wqe and register it so it is almost 3747 * ready to be posted to the hardware. 
3748 */ 3749 static int 3750 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3751 { 3752 ibt_mr_attr_t mem_attr; 3753 ibd_rwqe_t *rwqe; 3754 3755 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3756 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3757 return (DDI_FAILURE); 3758 } 3759 *wqe = rwqe; 3760 rwqe->rwqe_type = IBD_WQE_RECV; 3761 rwqe->w_state = state; 3762 rwqe->rwqe_next = NULL; 3763 rwqe->rwqe_prev = NULL; 3764 rwqe->w_freeing_wqe = B_FALSE; 3765 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3766 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3767 3768 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3769 IPOIB_GRH_SIZE, KM_NOSLEEP); 3770 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3771 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3772 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3773 return (DDI_FAILURE); 3774 } 3775 3776 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3777 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3778 NULL) { 3779 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3780 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3781 state->id_mtu + IPOIB_GRH_SIZE); 3782 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3783 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3784 return (DDI_FAILURE); 3785 } 3786 3787 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3788 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3789 mem_attr.mr_as = NULL; 3790 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3791 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3792 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3793 IBT_SUCCESS) { 3794 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3795 rwqe->w_freeing_wqe = B_TRUE; 3796 freemsg(rwqe->rwqe_im_mblk); 3797 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3798 state->id_mtu + IPOIB_GRH_SIZE); 3799 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3800 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3801 return (DDI_FAILURE); 3802 } 3803 3804 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3805 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3806 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3807 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3808 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3809 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3810 rwqe->w_rwr.wr_nds = 1; 3811 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3812 3813 return (DDI_SUCCESS); 3814 } 3815 3816 /* 3817 * Free an allocated recv wqe. 3818 */ 3819 static void 3820 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3821 { 3822 if (ibt_deregister_mr(state->id_hca_hdl, 3823 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3824 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3825 return; 3826 } 3827 3828 /* 3829 * Indicate to the callback function that this rwqe/mblk 3830 * should not be recycled. The freemsg() will invoke 3831 * ibd_freemsg_cb(). 3832 */ 3833 if (rwqe->rwqe_im_mblk != NULL) { 3834 rwqe->w_freeing_wqe = B_TRUE; 3835 freemsg(rwqe->rwqe_im_mblk); 3836 } 3837 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3838 state->id_mtu + IPOIB_GRH_SIZE); 3839 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3840 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3841 } 3842 3843 /* 3844 * Delete the rwqe being freed from the rx list. 
 */
static void
ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	mutex_enter(&state->id_rx_list.dl_mutex);
	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
		state->id_rx_list.dl_head = rwqe->rwqe_next;
	else
		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
	else
		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
	mutex_exit(&state->id_rx_list.dl_mutex);
}

/*
 * IBA Rx/Tx completion queue handler. Guaranteed to be single
 * threaded and nonreentrant for this CQ. When using combined CQ,
 * this handles Tx and Rx completions. With separate CQs, this handles
 * only Rx completions.
 */
/* ARGSUSED */
static void
ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	atomic_add_64(&state->id_num_intrs, 1);

	if (ibd_rx_softintr == 1)
		ddi_trigger_softintr(state->id_rx);
	else
		(void) ibd_intr((char *)state);
}

/*
 * Separate CQ handler for Tx completions, when the Tx CQ is in
 * interrupt driven mode.
 */
/* ARGSUSED */
static void
ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	atomic_add_64(&state->id_num_intrs, 1);

	if (ibd_tx_softintr == 1)
		ddi_trigger_softintr(state->id_tx);
	else
		(void) ibd_tx_recycle((char *)state);
}

/*
 * Multicast group create/delete trap handler. These will be delivered
 * on a kernel thread (handling can thus block) and can be invoked
 * concurrently. The handler can be invoked anytime after it is
 * registered and before ibt_detach().
 */
/* ARGSUSED */
static void
ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
    ibt_subnet_event_t *event)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ibd_req_t *req;

	/*
	 * The trap handler will get invoked once for every event for
	 * every port. The input "gid" is the GID0 of the port the
	 * trap came in on; we just need to act on traps that came
	 * to our port, meaning the port on which the ipoib interface
	 * resides. Since ipoib uses GID0 of the port, we just match
	 * the gids to check whether we need to handle the trap.
	 */
	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
		return;

	DPRINT(10, "ibd_notices_handler : %d\n", code);

	switch (code) {
	case IBT_SM_EVENT_UNAVAILABLE:
		/*
		 * If we are in promiscuous mode or have
		 * sendnonmembers, we need to print a warning
		 * message right now. Else, just store the
		 * information and print it when we enter promiscuous
		 * mode or attempt a nonmember send. We might
		 * also want to stop caching sendnonmember.
		 */
		ibd_print_warn(state, "IBA multicast support "
		    "degraded due to unavailability of multicast "
		    "traps");
		break;
	case IBT_SM_EVENT_AVAILABLE:
		/*
		 * If we printed a warning message above or
		 * while trying to nonmember send or get into
		 * promiscuous mode, print an okay message.
3945 */ 3946 ibd_print_warn(state, "IBA multicast support " 3947 "restored due to availability of multicast " 3948 "traps"); 3949 break; 3950 case IBT_SM_EVENT_MCG_CREATED: 3951 case IBT_SM_EVENT_MCG_DELETED: 3952 /* 3953 * Common processing of creation/deletion traps. 3954 * First check if the instance is being 3955 * [de]initialized; back off then, without doing 3956 * anything more, since we are not sure if the 3957 * async thread is around, or whether we might 3958 * be racing with the detach code in ibd_m_stop() 3959 * that scans the mcg list. 3960 */ 3961 if (!ibd_async_safe(state)) 3962 return; 3963 3964 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 3965 req->rq_gid = event->sm_notice_gid; 3966 req->rq_ptr = (void *)code; 3967 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 3968 break; 3969 } 3970 } 3971 3972 static void 3973 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 3974 { 3975 ib_gid_t mgid = req->rq_gid; 3976 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 3977 3978 DPRINT(10, "ibd_async_trap : %d\n", code); 3979 3980 /* 3981 * Atomically search the nonmember and sendonlymember lists and 3982 * delete. 3983 */ 3984 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 3985 3986 if (state->id_prom_op == IBD_OP_COMPLETED) { 3987 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 3988 3989 /* 3990 * If in promiscuous mode, try to join/attach to the new 3991 * mcg. Given the unreliable out-of-order mode of trap 3992 * delivery, we can never be sure whether it is a problem 3993 * if the join fails. Thus, we warn the admin of a failure 3994 * if this was a creation trap. Note that the trap might 3995 * actually be reporting a long past event, and the mcg 3996 * might already have been deleted, thus we might be warning 3997 * in vain. 3998 */ 3999 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4000 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4001 ibd_print_warn(state, "IBA promiscuous mode missed " 4002 "new multicast gid %016llx:%016llx", 4003 (u_longlong_t)mgid.gid_prefix, 4004 (u_longlong_t)mgid.gid_guid); 4005 } 4006 4007 /* 4008 * Free the request slot allocated by the subnet event thread. 4009 */ 4010 ibd_async_done(state); 4011 } 4012 4013 /* 4014 * GLDv3 entry point to get capabilities. 4015 */ 4016 static boolean_t 4017 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4018 { 4019 ibd_state_t *state = arg; 4020 4021 switch (cap) { 4022 case MAC_CAPAB_HCKSUM: { 4023 uint32_t *txflags = cap_data; 4024 4025 /* 4026 * We either do full checksum or not do it at all 4027 */ 4028 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4029 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4030 else 4031 return (B_FALSE); 4032 break; 4033 } 4034 4035 case MAC_CAPAB_LSO: { 4036 mac_capab_lso_t *cap_lso = cap_data; 4037 4038 /* 4039 * In addition to the capability and policy, since LSO 4040 * relies on hw checksum, we'll not enable LSO if we 4041 * don't have hw checksum. Of course, if the HCA doesn't 4042 * provide the reserved lkey capability, enabling LSO will 4043 * actually affect performance adversely, so we'll disable 4044 * LSO even for that case. 
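		 * (Note that ibd_setup_ud_channel() applies the same
		 * policy/capability check when deciding whether to request
		 * IBT_USES_LSO on the UD channel.)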
4045 */ 4046 if (!state->id_lso_policy || !state->id_lso_capable) 4047 return (B_FALSE); 4048 4049 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4050 return (B_FALSE); 4051 4052 if (state->id_hca_res_lkey_capab == 0) { 4053 ibd_print_warn(state, "no reserved-lkey capability, " 4054 "disabling LSO"); 4055 return (B_FALSE); 4056 } 4057 4058 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4059 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4060 break; 4061 } 4062 4063 default: 4064 return (B_FALSE); 4065 } 4066 4067 return (B_TRUE); 4068 } 4069 4070 static int 4071 ibd_get_port_details(ibd_state_t *state) 4072 { 4073 ibt_hca_portinfo_t *port_infop; 4074 ibt_status_t ret; 4075 uint_t psize, port_infosz; 4076 4077 mutex_enter(&state->id_link_mutex); 4078 4079 /* 4080 * Query for port information 4081 */ 4082 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4083 &port_infop, &psize, &port_infosz); 4084 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4085 mutex_exit(&state->id_link_mutex); 4086 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4087 "failed, ret=%d", ret); 4088 return (ENETDOWN); 4089 } 4090 4091 /* 4092 * If the link already went down by the time we get here, 4093 * give up 4094 */ 4095 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4096 mutex_exit(&state->id_link_mutex); 4097 ibt_free_portinfo(port_infop, port_infosz); 4098 DPRINT(10, "ibd_get_port_details: port is not active"); 4099 return (ENETDOWN); 4100 } 4101 4102 /* 4103 * If the link is active, verify the pkey 4104 */ 4105 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4106 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4107 mutex_exit(&state->id_link_mutex); 4108 ibt_free_portinfo(port_infop, port_infosz); 4109 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4110 "failed, ret=%d", ret); 4111 return (ENONET); 4112 } 4113 4114 state->id_mtu = (128 << port_infop->p_mtu); 4115 state->id_sgid = *port_infop->p_sgid_tbl; 4116 state->id_link_state = LINK_STATE_UP; 4117 4118 mutex_exit(&state->id_link_mutex); 4119 ibt_free_portinfo(port_infop, port_infosz); 4120 4121 /* 4122 * Now that the port is active, record the port speed 4123 */ 4124 state->id_link_speed = ibd_get_portspeed(state); 4125 4126 return (0); 4127 } 4128 4129 static int 4130 ibd_alloc_cqs(ibd_state_t *state) 4131 { 4132 ibt_hca_attr_t hca_attrs; 4133 ibt_cq_attr_t cq_attr; 4134 ibt_status_t ret; 4135 uint32_t real_size; 4136 4137 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4138 ASSERT(ret == IBT_SUCCESS); 4139 4140 /* 4141 * Allocate Rx/combined CQ: 4142 * Theoretically, there is no point in having more than #rwqe 4143 * plus #swqe cqe's, except that the CQ will be signalled for 4144 * overflow when the last wqe completes, if none of the previous 4145 * cqe's have been polled. Thus, we allocate just a few less wqe's 4146 * to make sure such overflow does not occur. 4147 */ 4148 cq_attr.cq_sched = NULL; 4149 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4150 4151 if (ibd_separate_cqs == 1) { 4152 /* 4153 * Allocate Receive CQ. 
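		 * The CQ is sized for one entry per rwqe plus a spare;
		 * if the HCA cannot support that many entries, id_num_rwqe
		 * is trimmed down to fit below.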
4154 */ 4155 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4156 cq_attr.cq_size = state->id_num_rwqe + 1; 4157 } else { 4158 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4159 state->id_num_rwqe = cq_attr.cq_size - 1; 4160 } 4161 4162 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4163 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4164 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4165 "failed, ret=%d\n", ret); 4166 return (DDI_FAILURE); 4167 } 4168 4169 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4170 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4171 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4172 "moderation failed, ret=%d\n", ret); 4173 } 4174 4175 state->id_rxwcs_size = state->id_num_rwqe + 1; 4176 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4177 state->id_rxwcs_size, KM_SLEEP); 4178 4179 /* 4180 * Allocate Send CQ. 4181 */ 4182 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4183 cq_attr.cq_size = state->id_num_swqe + 1; 4184 } else { 4185 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4186 state->id_num_swqe = cq_attr.cq_size - 1; 4187 } 4188 4189 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4190 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4191 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4192 "failed, ret=%d\n", ret); 4193 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4194 state->id_rxwcs_size); 4195 (void) ibt_free_cq(state->id_rcq_hdl); 4196 return (DDI_FAILURE); 4197 } 4198 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4199 IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { 4200 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4201 "moderation failed, ret=%d\n", ret); 4202 } 4203 4204 state->id_txwcs_size = state->id_num_swqe + 1; 4205 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4206 state->id_txwcs_size, KM_SLEEP); 4207 } else { 4208 /* 4209 * Allocate combined Send/Receive CQ. 4210 */ 4211 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 4212 state->id_num_swqe + 1)) { 4213 cq_attr.cq_size = state->id_num_rwqe + 4214 state->id_num_swqe + 1; 4215 } else { 4216 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4217 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 4218 state->id_num_rwqe) / (state->id_num_rwqe + 4219 state->id_num_swqe); 4220 state->id_num_swqe = cq_attr.cq_size - 1 - 4221 state->id_num_rwqe; 4222 } 4223 4224 state->id_rxwcs_size = cq_attr.cq_size; 4225 state->id_txwcs_size = state->id_rxwcs_size; 4226 4227 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4228 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4229 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " 4230 "failed, ret=%d\n", ret); 4231 return (DDI_FAILURE); 4232 } 4233 state->id_scq_hdl = state->id_rcq_hdl; 4234 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4235 state->id_rxwcs_size, KM_SLEEP); 4236 state->id_txwcs = state->id_rxwcs; 4237 } 4238 4239 /* 4240 * Print message in case we could not allocate as many wqe's 4241 * as was requested. 
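	 * For example, assuming a hypothetical hca_max_cq_sz of 4096 with
	 * the default 4000/4000 wqe settings in combined-CQ mode, the
	 * proportional split above would leave id_num_rwqe at 2047 and
	 * id_num_swqe at 2048, and both messages below would be printed.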
4242 */ 4243 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4244 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4245 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4246 } 4247 if (state->id_num_swqe != IBD_NUM_SWQE) { 4248 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4249 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4250 } 4251 4252 return (DDI_SUCCESS); 4253 } 4254 4255 static int 4256 ibd_setup_ud_channel(ibd_state_t *state) 4257 { 4258 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4259 ibt_ud_chan_query_attr_t ud_chan_attr; 4260 ibt_status_t ret; 4261 4262 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 4263 if (state->id_hca_res_lkey_capab) 4264 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4265 if (state->id_lso_policy && state->id_lso_capable) 4266 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4267 4268 ud_alloc_attr.ud_hca_port_num = state->id_port; 4269 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4270 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4271 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4272 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4273 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4274 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4275 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4276 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4277 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4278 ud_alloc_attr.ud_clone_chan = NULL; 4279 4280 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4281 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4282 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4283 "failed, ret=%d\n", ret); 4284 return (DDI_FAILURE); 4285 } 4286 4287 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4288 &ud_chan_attr)) != IBT_SUCCESS) { 4289 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4290 "failed, ret=%d\n", ret); 4291 (void) ibt_free_channel(state->id_chnl_hdl); 4292 return (DDI_FAILURE); 4293 } 4294 4295 state->id_qpnum = ud_chan_attr.ud_qpn; 4296 4297 return (DDI_SUCCESS); 4298 } 4299 4300 static int 4301 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4302 { 4303 uint32_t progress = state->id_mac_state; 4304 uint_t attempts; 4305 ibt_status_t ret; 4306 ib_gid_t mgid; 4307 ibd_mce_t *mce; 4308 uint8_t jstate; 4309 4310 /* 4311 * Before we try to stop/undo whatever we did in ibd_start(), 4312 * we need to mark the link state appropriately to prevent the 4313 * ip layer from using this instance for any new transfers. Note 4314 * that if the original state of the link was "up" when we're 4315 * here, we'll set the final link state to "unknown", to behave 4316 * in the same fashion as other ethernet drivers. 4317 */ 4318 mutex_enter(&state->id_link_mutex); 4319 if (cur_link_state == LINK_STATE_DOWN) { 4320 state->id_link_state = cur_link_state; 4321 } else { 4322 state->id_link_state = LINK_STATE_UNKNOWN; 4323 } 4324 mutex_exit(&state->id_link_mutex); 4325 mac_link_update(state->id_mh, state->id_link_state); 4326 4327 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4328 if (progress & IBD_DRV_STARTED) { 4329 state->id_mac_state &= (~IBD_DRV_STARTED); 4330 } 4331 4332 /* 4333 * First, stop receive interrupts; this stops the driver from 4334 * handing up buffers to higher layers. Wait for receive buffers 4335 * to be returned and give up after 5 seconds. 
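	 * (The loop below makes up to 50 passes with a 100ms delay each,
	 * which is where the 5 second bound comes from.)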
4336 */ 4337 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4338 4339 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4340 4341 attempts = 50; 4342 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4343 delay(drv_usectohz(100000)); 4344 if (--attempts == 0) { 4345 /* 4346 * There are pending bufs with the network 4347 * layer and we have no choice but to wait 4348 * for them to be done with. Reap all the 4349 * Tx/Rx completions that were posted since 4350 * we turned off the notification and 4351 * return failure. 4352 */ 4353 DPRINT(2, "ibd_undo_start: " 4354 "reclaiming failed"); 4355 ibd_poll_compq(state, state->id_rcq_hdl); 4356 ibt_set_cq_handler(state->id_rcq_hdl, 4357 ibd_rcq_handler, state); 4358 return (DDI_FAILURE); 4359 } 4360 } 4361 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4362 } 4363 4364 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4365 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4366 4367 mutex_enter(&state->id_trap_lock); 4368 state->id_trap_stop = B_TRUE; 4369 while (state->id_trap_inprog > 0) 4370 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4371 mutex_exit(&state->id_trap_lock); 4372 4373 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4374 } 4375 4376 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4377 /* 4378 * Flushing the channel ensures that all pending WQE's 4379 * are marked with flush_error and handed to the CQ. It 4380 * does not guarantee the invocation of the CQ handler. 4381 * This call is guaranteed to return successfully for 4382 * UD QPNs. 4383 */ 4384 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4385 IBT_SUCCESS) { 4386 DPRINT(10, "ibd_undo_start: flush_channel " 4387 "failed, ret=%d", ret); 4388 } 4389 4390 /* 4391 * Turn off Tx interrupts and poll. By the time the polling 4392 * returns an empty indicator, we are sure we have seen all 4393 * pending Tx callbacks. Note that after the call to 4394 * ibt_set_cq_handler() returns, the old handler is 4395 * guaranteed not to be invoked anymore. 4396 */ 4397 if (ibd_separate_cqs == 1) { 4398 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4399 } 4400 ibd_poll_compq(state, state->id_scq_hdl); 4401 4402 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4403 } 4404 4405 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4406 /* 4407 * No new async requests will be posted since the device 4408 * link state has been marked as unknown; completion handlers 4409 * have been turned off, so Tx handler will not cause any 4410 * more IBD_ASYNC_REAP requests. 4411 * 4412 * Queue a request for the async thread to exit, which will 4413 * be serviced after any pending ones. This can take a while, 4414 * specially if the SM is unreachable, since IBMF will slowly 4415 * timeout each SM request issued by the async thread. Reap 4416 * the thread before continuing on, we do not want it to be 4417 * lingering in modunloaded code (or we could move the reap 4418 * to ibd_detach(), provided we keep track of the current 4419 * id_async_thrid somewhere safe). 4420 */ 4421 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4422 thread_join(state->id_async_thrid); 4423 4424 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4425 } 4426 4427 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4428 /* 4429 * Drop all residual full/non membership. This includes full 4430 * membership to the broadcast group, and any nonmembership 4431 * acquired during transmits. 
We do this after the Tx completion 4432 * handlers are done, since those might result in some late 4433 * leaves; this also eliminates a potential race with that 4434 * path wrt the mc full list insert/delete. Trap handling 4435 * has also been suppressed at this point. Thus, no locks 4436 * are required while traversing the mc full list. 4437 */ 4438 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4439 mce = list_head(&state->id_mc_full); 4440 while (mce != NULL) { 4441 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4442 jstate = mce->mc_jstate; 4443 mce = list_next(&state->id_mc_full, mce); 4444 ibd_leave_group(state, mgid, jstate); 4445 } 4446 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4447 } 4448 4449 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4450 ibd_fini_rxlist(state); 4451 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4452 } 4453 4454 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4455 ibd_fini_txlist(state); 4456 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4457 } 4458 4459 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4460 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4461 IBT_SUCCESS) { 4462 DPRINT(10, "ibd_undo_start: free_channel " 4463 "failed, ret=%d", ret); 4464 } 4465 4466 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4467 } 4468 4469 if (progress & IBD_DRV_CQS_ALLOCD) { 4470 if (ibd_separate_cqs == 1) { 4471 kmem_free(state->id_txwcs, 4472 sizeof (ibt_wc_t) * state->id_txwcs_size); 4473 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4474 IBT_SUCCESS) { 4475 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4476 "failed, ret=%d", ret); 4477 } 4478 } 4479 4480 kmem_free(state->id_rxwcs, 4481 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4482 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4483 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4484 "ret=%d", ret); 4485 } 4486 4487 state->id_txwcs = NULL; 4488 state->id_rxwcs = NULL; 4489 state->id_scq_hdl = NULL; 4490 state->id_rcq_hdl = NULL; 4491 4492 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4493 } 4494 4495 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4496 mod_hash_destroy_hash(state->id_ah_active_hash); 4497 ibd_acache_fini(state); 4498 4499 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4500 } 4501 4502 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4503 /* 4504 * If we'd created the ipoib broadcast group and had 4505 * successfully joined it, leave it now 4506 */ 4507 if (state->id_bgroup_created) { 4508 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4509 jstate = IB_MC_JSTATE_FULL; 4510 (void) ibt_leave_mcg(state->id_sgid, mgid, 4511 state->id_sgid, jstate); 4512 } 4513 ibt_free_mcg_info(state->id_mcinfo, 1); 4514 4515 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4516 } 4517 4518 return (DDI_SUCCESS); 4519 } 4520 4521 /* 4522 * These pair of routines are used to set/clear the condition that 4523 * the caller is likely to do something to change the id_mac_state. 4524 * If there's already someone doing either a start or a stop (possibly 4525 * due to the async handler detecting a pkey relocation event, a plumb 4526 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4527 * that's done. 
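 * ibd_set_mac_progress() blocks on id_macst_cv until no other start/stop
 * is in progress and then sets the given flag; ibd_clr_mac_progress()
 * clears the flag and signals one waiter.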
4528 */ 4529 static void 4530 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4531 { 4532 mutex_enter(&state->id_macst_lock); 4533 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4534 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4535 4536 state->id_mac_state |= flag; 4537 mutex_exit(&state->id_macst_lock); 4538 } 4539 4540 static void 4541 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4542 { 4543 mutex_enter(&state->id_macst_lock); 4544 state->id_mac_state &= (~flag); 4545 cv_signal(&state->id_macst_cv); 4546 mutex_exit(&state->id_macst_lock); 4547 } 4548 4549 /* 4550 * GLDv3 entry point to start hardware. 4551 */ 4552 /*ARGSUSED*/ 4553 static int 4554 ibd_m_start(void *arg) 4555 { 4556 ibd_state_t *state = arg; 4557 int ret; 4558 4559 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4560 4561 ret = ibd_start(state); 4562 4563 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4564 4565 return (ret); 4566 } 4567 4568 static int 4569 ibd_start(ibd_state_t *state) 4570 { 4571 kthread_t *kht; 4572 int err; 4573 ibt_status_t ret; 4574 4575 if (state->id_mac_state & IBD_DRV_STARTED) 4576 return (DDI_SUCCESS); 4577 4578 /* 4579 * Get port details; if we fail here, very likely the port 4580 * state is inactive or the pkey can't be verified. 4581 */ 4582 if ((err = ibd_get_port_details(state)) != 0) { 4583 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4584 goto start_fail; 4585 } 4586 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4587 4588 /* 4589 * Find the IPoIB broadcast group 4590 */ 4591 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4592 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4593 err = ENOTACTIVE; 4594 goto start_fail; 4595 } 4596 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4597 4598 /* 4599 * Initialize per-interface caches and lists; if we fail here, 4600 * it is most likely due to a lack of resources 4601 */ 4602 if (ibd_acache_init(state) != DDI_SUCCESS) { 4603 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4604 err = ENOMEM; 4605 goto start_fail; 4606 } 4607 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4608 4609 /* 4610 * Allocate send and receive completion queues 4611 */ 4612 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4613 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4614 err = ENOMEM; 4615 goto start_fail; 4616 } 4617 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4618 4619 /* 4620 * Setup a UD channel 4621 */ 4622 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4623 err = ENOMEM; 4624 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4625 goto start_fail; 4626 } 4627 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4628 4629 /* 4630 * Allocate and initialize the tx buffer list 4631 */ 4632 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4633 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4634 err = ENOMEM; 4635 goto start_fail; 4636 } 4637 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4638 4639 /* 4640 * If we have separate cqs, create the send cq handler here 4641 */ 4642 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 4643 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4644 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4645 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4646 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4647 "failed, ret=%d", ret); 4648 err = EINVAL; 4649 goto start_fail; 4650 } 4651 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4652 } 4653 4654 /* 4655 * Allocate and initialize the rx buffer list 4656 */ 4657 if 
(ibd_init_rxlist(state) != DDI_SUCCESS) { 4658 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4659 err = ENOMEM; 4660 goto start_fail; 4661 } 4662 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4663 4664 /* 4665 * Join IPoIB broadcast group 4666 */ 4667 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4668 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4669 err = ENOTACTIVE; 4670 goto start_fail; 4671 } 4672 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4673 4674 /* 4675 * Create the async thread; thread_create never fails. 4676 */ 4677 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4678 TS_RUN, minclsyspri); 4679 state->id_async_thrid = kht->t_did; 4680 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4681 4682 /* 4683 * When we did mac_register() in ibd_attach(), we didn't register 4684 * the real macaddr and we didn't have the true port mtu. Now that 4685 * we're almost ready, set the local mac address and broadcast 4686 * addresses and update gldv3 about the real values of these 4687 * parameters. 4688 */ 4689 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4690 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4691 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4692 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4693 4694 mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4695 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4696 4697 /* 4698 * Setup the receive cq handler 4699 */ 4700 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4701 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4702 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4703 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4704 "failed, ret=%d", ret); 4705 err = EINVAL; 4706 goto start_fail; 4707 } 4708 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4709 4710 /* 4711 * Setup the subnet notices handler after we've initialized the acache/ 4712 * mcache and started the async thread, both of which are required for 4713 * the trap handler to function properly. 4714 * 4715 * Now that the async thread has been started (and we've already done 4716 * a mac_register() during attach so mac_tx_update() can be called 4717 * if necessary without any problem), we can enable the trap handler 4718 * to queue requests to the async thread. 4719 */ 4720 ibt_register_subnet_notices(state->id_ibt_hdl, 4721 ibd_snet_notices_handler, state); 4722 mutex_enter(&state->id_trap_lock); 4723 state->id_trap_stop = B_FALSE; 4724 mutex_exit(&state->id_trap_lock); 4725 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4726 4727 /* 4728 * Indicate link status to GLDv3 and higher layers. By default, 4729 * we assume we are in up state (which must have been true at 4730 * least at the time the broadcast mcg's were probed); if there 4731 * were any up/down transitions till the time we come here, the 4732 * async handler will have updated last known state, which we 4733 * use to tell GLDv3. The async handler will not send any 4734 * notifications to GLDv3 till we reach here in the initialization 4735 * sequence. 4736 */ 4737 state->id_mac_state |= IBD_DRV_STARTED; 4738 mac_link_update(state->id_mh, state->id_link_state); 4739 4740 return (DDI_SUCCESS); 4741 4742 start_fail: 4743 /* 4744 * If we ran into a problem during ibd_start() and ran into 4745 * some other problem during undoing our partial work, we can't 4746 * do anything about it. Ignore any errors we might get from 4747 * ibd_undo_start() and just return the original error we got. 
	 */
	(void) ibd_undo_start(state, LINK_STATE_DOWN);
	return (err);
}

/*
 * GLDv3 entry point to stop hardware from receiving packets.
 */
/*ARGSUSED*/
static void
ibd_m_stop(void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);

	(void) ibd_undo_start(state, state->id_link_state);

	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
}

/*
 * GLDv3 entry point to modify device's mac address. We do not
 * allow address modifications.
 */
static int
ibd_m_unicst(void *arg, const uint8_t *macaddr)
{
	ibd_state_t *state = arg;

	/*
	 * Don't bother even comparing the macaddr if we haven't
	 * completed ibd_m_start().
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (EINVAL);
}

/*
 * The blocking part of the IBA join/leave operations is done out
 * of here on the async thread.
 */
static void
ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
{
	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);

	if (op == IBD_ASYNC_JOIN) {
		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
			ibd_print_warn(state, "Join multicast group failed :"
			    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
		}
	} else {
		/*
		 * Here, we must search for the proper mcg_info and
		 * use that to leave the group.
		 */
		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
	}
}

/*
 * GLDv3 entry point for multicast enable/disable requests.
 * This function queues the operation to the async thread and
 * returns success for a valid multicast address.
 */
static int
ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ipoib_mac_t maddr, *mcast;
	ib_gid_t mgid;
	ibd_req_t *req;

	/*
	 * If we haven't completed ibd_m_start(), the async thread wouldn't
	 * have been started and id_bcaddr wouldn't be set, so there's
	 * no point in continuing.
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	/*
	 * The incoming multicast address might not be aligned properly
	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
	 * it to look like one though, to get the offsets of the mc gid,
	 * since we know we are not going to dereference any values with
	 * the ipoib_mac_t pointer.
	 */
	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
	mcast = &maddr;

	/*
	 * Check validity of MCG address. We could additionally check
	 * that an enable/disable is not being issued on the "broadcast"
	 * mcg, but since this operation is only invokable by privileged
	 * programs anyway, we allow the flexibility to those dlpi apps.
	 * Note that we do not validate the "scope" of the IBA mcg.
	 */
	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
		return (EINVAL);

	/*
	 * fill in multicast pkey and scope
	 */
	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);

	/*
	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
	 * nothing (i.e.
we stay JOINed to the broadcast group done in 4864 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 4865 * requires to be joined to broadcast groups at all times. 4866 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4867 * depends on this. 4868 */ 4869 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4870 return (0); 4871 4872 ibd_n2h_gid(mcast, &mgid); 4873 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4874 if (req == NULL) 4875 return (ENOMEM); 4876 4877 req->rq_gid = mgid; 4878 4879 if (add) { 4880 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4881 mgid.gid_prefix, mgid.gid_guid); 4882 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4883 } else { 4884 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4885 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4886 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4887 } 4888 return (0); 4889 } 4890 4891 /* 4892 * The blocking part of the IBA promiscuous operations are done 4893 * out of here on the async thread. The dlpireq parameter indicates 4894 * whether this invocation is due to a dlpi request or due to 4895 * a port up/down event. 4896 */ 4897 static void 4898 ibd_async_unsetprom(ibd_state_t *state) 4899 { 4900 ibd_mce_t *mce = list_head(&state->id_mc_non); 4901 ib_gid_t mgid; 4902 4903 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4904 4905 while (mce != NULL) { 4906 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4907 mce = list_next(&state->id_mc_non, mce); 4908 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4909 } 4910 state->id_prom_op = IBD_OP_NOTSTARTED; 4911 } 4912 4913 /* 4914 * The blocking part of the IBA promiscuous operations are done 4915 * out of here on the async thread. The dlpireq parameter indicates 4916 * whether this invocation is due to a dlpi request or due to 4917 * a port up/down event. 4918 */ 4919 static void 4920 ibd_async_setprom(ibd_state_t *state) 4921 { 4922 ibt_mcg_attr_t mcg_attr; 4923 ibt_mcg_info_t *mcg_info; 4924 ib_gid_t mgid; 4925 uint_t numg; 4926 int i; 4927 char ret = IBD_OP_COMPLETED; 4928 4929 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4930 4931 /* 4932 * Obtain all active MC groups on the IB fabric with 4933 * specified criteria (scope + Pkey + Qkey + mtu). 4934 */ 4935 bzero(&mcg_attr, sizeof (mcg_attr)); 4936 mcg_attr.mc_pkey = state->id_pkey; 4937 mcg_attr.mc_scope = state->id_scope; 4938 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4939 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4940 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4941 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4942 IBT_SUCCESS) { 4943 ibd_print_warn(state, "Could not get list of IBA multicast " 4944 "groups"); 4945 ret = IBD_OP_ERRORED; 4946 goto done; 4947 } 4948 4949 /* 4950 * Iterate over the returned mcg's and join as NonMember 4951 * to the IP mcg's. 4952 */ 4953 for (i = 0; i < numg; i++) { 4954 /* 4955 * Do a NonMember JOIN on the MC group. 4956 */ 4957 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4958 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4959 ibd_print_warn(state, "IBA promiscuous mode missed " 4960 "multicast gid %016llx:%016llx", 4961 (u_longlong_t)mgid.gid_prefix, 4962 (u_longlong_t)mgid.gid_guid); 4963 } 4964 4965 ibt_free_mcg_info(mcg_info, numg); 4966 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4967 done: 4968 state->id_prom_op = ret; 4969 } 4970 4971 /* 4972 * GLDv3 entry point for multicast promiscuous enable/disable requests. 
4973 * GLDv3 assumes phys state receives more packets than multi state, 4974 * which is not true for IPoIB. Thus, treat the multi and phys 4975 * promiscuous states the same way to work with GLDv3's assumption. 4976 */ 4977 static int 4978 ibd_m_promisc(void *arg, boolean_t on) 4979 { 4980 ibd_state_t *state = (ibd_state_t *)arg; 4981 ibd_req_t *req; 4982 4983 /* 4984 * Async thread wouldn't have been started if we haven't 4985 * passed ibd_m_start() 4986 */ 4987 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4988 return (0); 4989 4990 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4991 if (req == NULL) 4992 return (ENOMEM); 4993 if (on) { 4994 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4995 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4996 } else { 4997 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4998 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 4999 } 5000 5001 return (0); 5002 } 5003 5004 /* 5005 * GLDv3 entry point for gathering statistics. 5006 */ 5007 static int 5008 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5009 { 5010 ibd_state_t *state = (ibd_state_t *)arg; 5011 5012 switch (stat) { 5013 case MAC_STAT_IFSPEED: 5014 *val = state->id_link_speed; 5015 break; 5016 case MAC_STAT_MULTIRCV: 5017 *val = state->id_multi_rcv; 5018 break; 5019 case MAC_STAT_BRDCSTRCV: 5020 *val = state->id_brd_rcv; 5021 break; 5022 case MAC_STAT_MULTIXMT: 5023 *val = state->id_multi_xmt; 5024 break; 5025 case MAC_STAT_BRDCSTXMT: 5026 *val = state->id_brd_xmt; 5027 break; 5028 case MAC_STAT_RBYTES: 5029 *val = state->id_rcv_bytes; 5030 break; 5031 case MAC_STAT_IPACKETS: 5032 *val = state->id_rcv_pkt; 5033 break; 5034 case MAC_STAT_OBYTES: 5035 *val = state->id_xmt_bytes; 5036 break; 5037 case MAC_STAT_OPACKETS: 5038 *val = state->id_xmt_pkt; 5039 break; 5040 case MAC_STAT_OERRORS: 5041 *val = state->id_ah_error; /* failed AH translation */ 5042 break; 5043 case MAC_STAT_IERRORS: 5044 *val = 0; 5045 break; 5046 case MAC_STAT_NOXMTBUF: 5047 *val = state->id_tx_short; 5048 break; 5049 case MAC_STAT_NORCVBUF: 5050 default: 5051 return (ENOTSUP); 5052 } 5053 5054 return (0); 5055 } 5056 5057 static void 5058 ibd_async_txsched(ibd_state_t *state) 5059 { 5060 ibd_req_t *req; 5061 int ret; 5062 5063 if (ibd_txcomp_poll) 5064 ibd_poll_compq(state, state->id_scq_hdl); 5065 5066 ret = ibd_resume_transmission(state); 5067 if (ret && ibd_txcomp_poll) { 5068 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 5069 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5070 else { 5071 ibd_print_warn(state, "ibd_async_txsched: " 5072 "no memory, can't schedule work slot"); 5073 } 5074 } 5075 } 5076 5077 static int 5078 ibd_resume_transmission(ibd_state_t *state) 5079 { 5080 int flag; 5081 int met_thresh = 0; 5082 int ret = -1; 5083 5084 mutex_enter(&state->id_sched_lock); 5085 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5086 met_thresh = (state->id_tx_list.dl_cnt > 5087 IBD_FREE_SWQES_THRESH); 5088 flag = IBD_RSRC_SWQE; 5089 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5090 ASSERT(state->id_lso != NULL); 5091 met_thresh = (state->id_lso->bkt_nfree > 5092 IBD_FREE_LSOS_THRESH); 5093 flag = IBD_RSRC_LSOBUF; 5094 } 5095 if (met_thresh) { 5096 state->id_sched_needed &= ~flag; 5097 ret = 0; 5098 } 5099 mutex_exit(&state->id_sched_lock); 5100 5101 if (ret == 0) 5102 mac_tx_update(state->id_mh); 5103 5104 return (ret); 5105 } 5106 5107 /* 5108 * Release the send wqe back into free list. 
5109 */ 5110 static void 5111 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 5112 { 5113 /* 5114 * Add back on Tx list for reuse. 5115 */ 5116 swqe->swqe_next = NULL; 5117 mutex_enter(&state->id_tx_list.dl_mutex); 5118 if (state->id_tx_list.dl_pending_sends) { 5119 state->id_tx_list.dl_pending_sends = B_FALSE; 5120 } 5121 if (state->id_tx_list.dl_head == NULL) { 5122 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 5123 } else { 5124 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 5125 } 5126 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 5127 state->id_tx_list.dl_cnt++; 5128 mutex_exit(&state->id_tx_list.dl_mutex); 5129 } 5130 5131 /* 5132 * Acquire a send wqe from free list. 5133 * Returns error number and send wqe pointer. 5134 */ 5135 static int 5136 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 5137 { 5138 int rc = 0; 5139 ibd_swqe_t *wqe; 5140 5141 /* 5142 * Check and reclaim some of the completed Tx requests. 5143 * If someone else is already in this code and pulling Tx 5144 * completions, no need to poll, since the current lock holder 5145 * will do the work anyway. Normally, we poll for completions 5146 * every few Tx attempts, but if we are short on Tx descriptors, 5147 * we always try to poll. 5148 */ 5149 if ((ibd_txcomp_poll == 1) && 5150 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 5151 ibd_poll_compq(state, state->id_scq_hdl); 5152 } 5153 5154 /* 5155 * Grab required transmit wqes. 5156 */ 5157 mutex_enter(&state->id_tx_list.dl_mutex); 5158 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5159 if (wqe != NULL) { 5160 state->id_tx_list.dl_cnt -= 1; 5161 state->id_tx_list.dl_head = wqe->swqe_next; 5162 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 5163 state->id_tx_list.dl_tail = NULL; 5164 } else { 5165 /* 5166 * If we did not find the number we were looking for, flag 5167 * no resource. Adjust list appropriately in either case. 5168 */ 5169 rc = ENOENT; 5170 state->id_tx_list.dl_pending_sends = B_TRUE; 5171 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5172 atomic_add_64(&state->id_tx_short, 1); 5173 } 5174 mutex_exit(&state->id_tx_list.dl_mutex); 5175 *swqe = wqe; 5176 5177 return (rc); 5178 } 5179 5180 static int 5181 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5182 ibt_ud_dest_hdl_t ud_dest) 5183 { 5184 mblk_t *nmp; 5185 int iph_len, tcph_len; 5186 ibt_wr_lso_t *lso; 5187 uintptr_t ip_start, tcp_start; 5188 uint8_t *dst; 5189 uint_t pending, mblen; 5190 5191 /* 5192 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5193 * we need to adjust it here for lso. 5194 */ 5195 lso = &(node->w_swr.wr.ud_lso); 5196 lso->lso_ud_dest = ud_dest; 5197 lso->lso_mss = mss; 5198 5199 /* 5200 * Calculate the LSO header size and set it in the UD LSO structure. 5201 * Note that the only assumption we make is that each of the IPoIB, 5202 * IP and TCP headers will be contained in a single mblk fragment; 5203 * together, the headers may span multiple mblk fragments. 
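	 * For example, the IPoIB header in one fragment with the IP and
	 * TCP headers in the next is handled below, but an IP or TCP
	 * header that is itself split across two fragments is not.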
5204 */ 5205 nmp = mp; 5206 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5207 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5208 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5209 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5210 nmp = nmp->b_cont; 5211 5212 } 5213 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5214 5215 tcp_start = ip_start + iph_len; 5216 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5217 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5218 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5219 nmp = nmp->b_cont; 5220 } 5221 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5222 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5223 5224 /* 5225 * If the lso header fits entirely within a single mblk fragment, 5226 * we'll avoid an additional copy of the lso header here and just 5227 * pass the b_rptr of the mblk directly. 5228 * 5229 * If this isn't true, we'd have to allocate for it explicitly. 5230 */ 5231 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5232 lso->lso_hdr = mp->b_rptr; 5233 } else { 5234 /* On work completion, remember to free this allocated hdr */ 5235 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5236 if (lso->lso_hdr == NULL) { 5237 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5238 "sz = %d", lso->lso_hdr_sz); 5239 lso->lso_hdr_sz = 0; 5240 lso->lso_mss = 0; 5241 return (-1); 5242 } 5243 } 5244 5245 /* 5246 * Copy in the lso header only if we need to 5247 */ 5248 if (lso->lso_hdr != mp->b_rptr) { 5249 dst = lso->lso_hdr; 5250 pending = lso->lso_hdr_sz; 5251 5252 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5253 mblen = MBLKL(nmp); 5254 if (pending > mblen) { 5255 bcopy(nmp->b_rptr, dst, mblen); 5256 dst += mblen; 5257 pending -= mblen; 5258 } else { 5259 bcopy(nmp->b_rptr, dst, pending); 5260 break; 5261 } 5262 } 5263 } 5264 5265 return (0); 5266 } 5267 5268 static void 5269 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5270 { 5271 ibt_wr_lso_t *lso; 5272 5273 if ((!node) || (!mp)) 5274 return; 5275 5276 /* 5277 * Free any header space that we might've allocated if we 5278 * did an LSO 5279 */ 5280 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5281 lso = &(node->w_swr.wr.ud_lso); 5282 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5283 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5284 lso->lso_hdr = NULL; 5285 lso->lso_hdr_sz = 0; 5286 } 5287 } 5288 } 5289 5290 static void 5291 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5292 { 5293 uint_t i; 5294 uint_t num_posted; 5295 uint_t n_wrs; 5296 ibt_status_t ibt_status; 5297 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 5298 ibd_swqe_t *elem; 5299 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 5300 5301 node->swqe_next = NULL; 5302 5303 mutex_enter(&state->id_txpost_lock); 5304 5305 /* 5306 * Enqueue the new node in chain of wqes to send 5307 */ 5308 if (state->id_tx_head) { 5309 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 5310 } else { 5311 state->id_tx_head = node; 5312 } 5313 state->id_tx_tailp = &(node->swqe_next); 5314 5315 /* 5316 * If someone else is helping out with the sends, 5317 * just go back 5318 */ 5319 if (state->id_tx_busy) { 5320 mutex_exit(&state->id_txpost_lock); 5321 return; 5322 } 5323 5324 /* 5325 * Otherwise, mark the flag to indicate that we'll be 5326 * doing the dispatch of what's there in the wqe chain 5327 */ 5328 state->id_tx_busy = 1; 5329 5330 while (state->id_tx_head) { 5331 /* 5332 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 5333 * at a time if possible, and keep posting them. 
5334 */ 5335 for (n_wrs = 0, elem = state->id_tx_head; 5336 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 5337 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5338 5339 nodes[n_wrs] = elem; 5340 wrs[n_wrs] = elem->w_swr; 5341 } 5342 state->id_tx_head = elem; 5343 5344 /* 5345 * Release the txpost lock before posting the 5346 * send request to the hca; if the posting fails 5347 * for some reason, we'll never receive completion 5348 * intimation, so we'll need to cleanup. 5349 */ 5350 mutex_exit(&state->id_txpost_lock); 5351 5352 ASSERT(n_wrs != 0); 5353 5354 /* 5355 * If posting fails for some reason, we'll never receive 5356 * completion intimation, so we'll need to cleanup. But 5357 * we need to make sure we don't clean up nodes whose 5358 * wrs have been successfully posted. We assume that the 5359 * hca driver returns on the first failure to post and 5360 * therefore the first 'num_posted' entries don't need 5361 * cleanup here. 5362 */ 5363 num_posted = 0; 5364 ibt_status = ibt_post_send(state->id_chnl_hdl, 5365 wrs, n_wrs, &num_posted); 5366 if (ibt_status != IBT_SUCCESS) { 5367 5368 ibd_print_warn(state, "ibd_post_send: " 5369 "posting multiple wrs failed: " 5370 "requested=%d, done=%d, ret=%d", 5371 n_wrs, num_posted, ibt_status); 5372 5373 for (i = num_posted; i < n_wrs; i++) 5374 ibd_tx_cleanup(state, nodes[i]); 5375 } 5376 5377 /* 5378 * Grab the mutex before we go and check the tx Q again 5379 */ 5380 mutex_enter(&state->id_txpost_lock); 5381 } 5382 5383 state->id_tx_busy = 0; 5384 mutex_exit(&state->id_txpost_lock); 5385 } 5386 5387 static int 5388 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5389 uint_t lsohdr_sz) 5390 { 5391 ibt_wr_ds_t *sgl; 5392 ibt_status_t ibt_status; 5393 mblk_t *nmp; 5394 mblk_t *data_mp; 5395 uchar_t *bufp; 5396 size_t blksize; 5397 size_t skip; 5398 size_t avail; 5399 uint_t pktsize; 5400 uint_t frag_len; 5401 uint_t pending_hdr; 5402 uint_t hiwm; 5403 int nmblks; 5404 int i; 5405 5406 /* 5407 * Let's skip ahead to the data if this is LSO 5408 */ 5409 data_mp = mp; 5410 pending_hdr = 0; 5411 if (lsohdr_sz) { 5412 pending_hdr = lsohdr_sz; 5413 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5414 frag_len = nmp->b_wptr - nmp->b_rptr; 5415 if (frag_len > pending_hdr) 5416 break; 5417 pending_hdr -= frag_len; 5418 } 5419 data_mp = nmp; /* start of data past lso header */ 5420 ASSERT(data_mp != NULL); 5421 } 5422 5423 /* 5424 * Calculate the size of message data and number of msg blocks 5425 */ 5426 pktsize = 0; 5427 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5428 nmp = nmp->b_cont, nmblks++) { 5429 pktsize += MBLKL(nmp); 5430 } 5431 pktsize -= pending_hdr; 5432 5433 /* 5434 * Translating the virtual address regions into physical regions 5435 * for using the Reserved LKey feature results in a wr sgl that 5436 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5437 * we'll fix a high-water mark (65%) for when we should stop. 5438 */ 5439 hiwm = (state->id_max_sqseg * 65) / 100; 5440 5441 /* 5442 * We only do ibt_map_mem_iov() if the pktsize is above the 5443 * "copy-threshold", and if the number of mp fragments is less than 5444 * the maximum acceptable. 
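	 * For instance, with a hypothetical id_max_sqseg of 60, the 65%
	 * high-water mark computed above works out to 39 fragments.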
5445 */ 5446 if ((state->id_hca_res_lkey_capab) && 5447 (pktsize > IBD_TX_COPY_THRESH) && 5448 (nmblks < hiwm)) { 5449 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5450 ibt_iov_attr_t iov_attr; 5451 5452 iov_attr.iov_as = NULL; 5453 iov_attr.iov = iov_arr; 5454 iov_attr.iov_buf = NULL; 5455 iov_attr.iov_list_len = nmblks; 5456 iov_attr.iov_wr_nds = state->id_max_sqseg; 5457 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5458 iov_attr.iov_flags = IBT_IOV_SLEEP; 5459 5460 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5461 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5462 iov_arr[i].iov_len = MBLKL(nmp); 5463 if (i == 0) { 5464 iov_arr[i].iov_addr += pending_hdr; 5465 iov_arr[i].iov_len -= pending_hdr; 5466 } 5467 } 5468 5469 node->w_buftype = IBD_WQE_MAPPED; 5470 node->w_swr.wr_sgl = node->w_sgl; 5471 5472 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5473 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5474 if (ibt_status != IBT_SUCCESS) { 5475 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5476 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5477 goto ibd_copy_path; 5478 } 5479 5480 return (0); 5481 } 5482 5483 ibd_copy_path: 5484 if (pktsize <= state->id_tx_buf_sz) { 5485 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5486 node->w_swr.wr_nds = 1; 5487 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5488 node->w_buftype = IBD_WQE_TXBUF; 5489 5490 /* 5491 * Even though this is the copy path for transfers less than 5492 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5493 * is possible the first data mblk fragment (data_mp) still 5494 * contains part of the LSO header that we need to skip. 5495 */ 5496 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5497 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5498 blksize = MBLKL(nmp) - pending_hdr; 5499 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5500 bufp += blksize; 5501 pending_hdr = 0; 5502 } 5503 5504 return (0); 5505 } 5506 5507 /* 5508 * Copy path for transfers greater than id_tx_buf_sz 5509 */ 5510 node->w_swr.wr_sgl = node->w_sgl; 5511 if (ibd_acquire_lsobufs(state, pktsize, 5512 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5513 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5514 return (-1); 5515 } 5516 node->w_buftype = IBD_WQE_LSOBUF; 5517 5518 /* 5519 * Copy the larger-than-id_tx_buf_sz packet into a set of 5520 * fixed-sized, pre-mapped LSO buffers. Note that we might 5521 * need to skip part of the LSO header in the first fragment 5522 * as before. 5523 */ 5524 nmp = data_mp; 5525 skip = pending_hdr; 5526 for (i = 0; i < node->w_swr.wr_nds; i++) { 5527 sgl = node->w_swr.wr_sgl + i; 5528 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5529 avail = IBD_LSO_BUFSZ; 5530 while (nmp && avail) { 5531 blksize = MBLKL(nmp) - skip; 5532 if (blksize > avail) { 5533 bcopy(nmp->b_rptr + skip, bufp, avail); 5534 skip += avail; 5535 avail = 0; 5536 } else { 5537 bcopy(nmp->b_rptr + skip, bufp, blksize); 5538 skip = 0; 5539 avail -= blksize; 5540 bufp += blksize; 5541 nmp = nmp->b_cont; 5542 } 5543 } 5544 } 5545 5546 return (0); 5547 } 5548 5549 /* 5550 * Schedule a completion queue polling to reap the resource we're 5551 * short on. If we implement the change to reap tx completions 5552 * in a separate thread, we'll need to wake up that thread here. 
5553 */ 5554 static int 5555 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5556 { 5557 ibd_req_t *req; 5558
5559 mutex_enter(&state->id_sched_lock); 5560 state->id_sched_needed |= resource_type; 5561 mutex_exit(&state->id_sched_lock); 5562
5563 /* 5564 * If we are asked to queue a work entry, we need to do it 5565 */ 5566 if (q_flag) { 5567 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5568 if (req == NULL) 5569 return (-1); 5570
5571 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5572 } 5573
5574 return (0); 5575 } 5576
5577 /* 5578 * The passed-in packet has this format: 5579 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5580 */ 5581 static boolean_t 5582 ibd_send(ibd_state_t *state, mblk_t *mp) 5583 { 5584 ibd_ace_t *ace; 5585 ibd_swqe_t *node; 5586 ipoib_mac_t *dest; 5587 ib_header_info_t *ipibp; 5588 ip6_t *ip6h; 5589 uint_t pktsize; 5590 uint32_t mss; 5591 uint32_t hckflags; 5592 uint32_t lsoflags = 0; 5593 uint_t lsohdr_sz = 0; 5594 int ret, len; 5595 boolean_t dofree = B_FALSE; 5596 boolean_t rc; 5597
5598 /* 5599 * If we aren't done with the device initialization and start, 5600 * we shouldn't be here. 5601 */ 5602 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5603 return (B_FALSE); 5604
5605 node = NULL; 5606 if (ibd_acquire_swqe(state, &node) != 0) { 5607 /* 5608 * If we don't have an swqe available, schedule a transmit 5609 * completion queue cleanup and hold off on sending 5610 * more packets until we have some free swqes 5611 */ 5612 if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0) 5613 return (B_FALSE); 5614
5615 /* 5616 * If a poll cannot be scheduled, we have no choice but 5617 * to drop this packet 5618 */ 5619 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5620 return (B_TRUE); 5621 } 5622
5623 /* 5624 * Initialize the commonly used fields in swqe to NULL to protect 5625 * against ibd_tx_cleanup accidentally misinterpreting these on a 5626 * failure. 5627 */ 5628 node->swqe_im_mblk = NULL; 5629 node->w_swr.wr_nds = 0; 5630 node->w_swr.wr_sgl = NULL; 5631 node->w_swr.wr_opcode = IBT_WRC_SEND; 5632
5633 /* 5634 * Obtain an address handle for the destination. 5635 */ 5636 ipibp = (ib_header_info_t *)mp->b_rptr; 5637 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5638 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5639 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5640
5641 pktsize = msgsize(mp); 5642
5643 atomic_add_64(&state->id_xmt_bytes, pktsize); 5644 atomic_inc_64(&state->id_xmt_pkt); 5645 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5646 atomic_inc_64(&state->id_brd_xmt); 5647 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5648 atomic_inc_64(&state->id_multi_xmt); 5649
5650 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 5651 node->w_ahandle = ace; 5652 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5653 } else { 5654 DPRINT(5, 5655 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5656 ((ret == EFAULT) ? "failed" : "queued"), 5657 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5658 htonl(dest->ipoib_gidpref[1]), 5659 htonl(dest->ipoib_gidsuff[0]), 5660 htonl(dest->ipoib_gidsuff[1])); 5661 node->w_ahandle = NULL; 5662
5663 /* 5664 * In poll mode, there are probably CQEs pending in the 5665 * CQ, so ibd has to poll the CQ here; otherwise the acache 5666 * entry may never be recycled.
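 * (It is ibd_tx_cleanup(), run when send completions are reaped,
 * that drops the AH reference and lets the acache entry move back
 * to the free list; see ibd_tx_cleanup() further below.)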
5667 */ 5668 if (ibd_txcomp_poll == 1) 5669 ibd_poll_compq(state, state->id_scq_hdl); 5670
5671 /* 5672 * If ibd_acache_lookup() returns EFAULT, it means ibd 5673 * cannot find a path for the specified dest address, so we 5674 * should drop this packet. We should also drop the packet 5675 * if we cannot schedule a poll via the 5676 * async thread. In the normal case, ibd returns the 5677 * packet to the upper layer and waits for the AH to be created. 5678 * 5679 * Note that we always queue a work slot entry for the async 5680 * thread when we fail AH lookup (even in intr mode); this is 5681 * due to the convoluted way the code currently looks for AH. 5682 */ 5683 if (ret == EFAULT) { 5684 dofree = B_TRUE; 5685 rc = B_TRUE; 5686 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5687 dofree = B_TRUE; 5688 rc = B_TRUE; 5689 } else { 5690 dofree = B_FALSE; 5691 rc = B_FALSE; 5692 } 5693 goto ibd_send_fail; 5694 } 5695
5696 /* 5697 * For ND6 packets, padding is at the front of the source lladdr. 5698 * Insert the padding at front. 5699 */ 5700 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 5701 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 5702 if (!pullupmsg(mp, IPV6_HDR_LEN + 5703 sizeof (ib_header_info_t))) { 5704 DPRINT(10, "ibd_send: pullupmsg failure "); 5705 dofree = B_TRUE; 5706 rc = B_TRUE; 5707 goto ibd_send_fail; 5708 } 5709 ipibp = (ib_header_info_t *)mp->b_rptr; 5710 } 5711 ip6h = (ip6_t *)((uchar_t *)ipibp + 5712 sizeof (ib_header_info_t)); 5713 len = ntohs(ip6h->ip6_plen); 5714 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5715 mblk_t *pad; 5716
5717 pad = allocb(4, 0); if (pad == NULL) { /* allocb can fail under memory pressure */ DPRINT(10, "ibd_send: allocb failure "); dofree = B_TRUE; rc = B_TRUE; goto ibd_send_fail; } 5718 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 5719 linkb(mp, pad); 5720 if (MBLKL(mp) < sizeof (ib_header_info_t) + 5721 IPV6_HDR_LEN + len + 4) { 5722 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 5723 IPV6_HDR_LEN + len + 4)) { 5724 DPRINT(10, "ibd_send: pullupmsg " 5725 "failure "); 5726 dofree = B_TRUE; 5727 rc = B_TRUE; 5728 goto ibd_send_fail; 5729 } 5730 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5731 sizeof (ib_header_info_t)); 5732 } 5733
5734 /* LINTED: E_CONSTANT_CONDITION */ 5735 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5736 } 5737 } 5738
5739 mp->b_rptr += sizeof (ib_addrs_t); 5740
5741 /* 5742 * Do LSO and checksum related work here. For an LSO send, set the 5743 * ud destination, the opcode and the LSO header information in the 5744 * work request. 5745 */ 5746 lso_info_get(mp, &mss, &lsoflags); 5747 if ((lsoflags & HW_LSO) != HW_LSO) { 5748 node->w_swr.wr_opcode = IBT_WRC_SEND; 5749 lsohdr_sz = 0; 5750 } else { 5751 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 5752 /* 5753 * The routine can only fail if there's no memory; we 5754 * can only drop the packet if this happens 5755 */ 5756 ibd_print_warn(state, 5757 "ibd_send: no memory, lso posting failed"); 5758 dofree = B_TRUE; 5759 rc = B_TRUE; 5760 goto ibd_send_fail; 5761 } 5762
5763 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 5764 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 5765 } 5766
5767 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 5768 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 5769 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 5770 else 5771 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 5772
5773 /* 5774 * Prepare the sgl for posting; the routine can only fail if there's 5775 * no lso buf available for posting. If this is the case, we should 5776 * probably reschedule for lso bufs to become available and then try again.
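 *
 * To summarize, ibd_prepare_sgl() picks one of three buffer strategies
 * for this wqe: IBD_WQE_MAPPED (ibt_map_mem_iov() of the mblk chain),
 * IBD_WQE_TXBUF (bcopy into the swqe's pre-mapped copybuf) or
 * IBD_WQE_LSOBUF (bcopy into pre-mapped LSO buffers); only the last of
 * these can fail, and only for want of LSO buffers.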
5777 */ 5778 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 5779 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 5780 dofree = B_TRUE; 5781 rc = B_TRUE; 5782 } else { 5783 dofree = B_FALSE; 5784 rc = B_FALSE; 5785 } 5786 goto ibd_send_fail; 5787 } 5788 node->swqe_im_mblk = mp; 5789 5790 /* 5791 * Queue the wqe to hardware; since we can now simply queue a 5792 * post instead of doing it serially, we cannot assume anything 5793 * about the 'node' after ibd_post_send() returns. 5794 */ 5795 ibd_post_send(state, node); 5796 5797 return (B_TRUE); 5798 5799 ibd_send_fail: 5800 if (node && mp) 5801 ibd_free_lsohdr(node, mp); 5802 5803 if (dofree) 5804 freemsg(mp); 5805 5806 if (node != NULL) 5807 ibd_tx_cleanup(state, node); 5808 5809 return (rc); 5810 } 5811 5812 /* 5813 * GLDv3 entry point for transmitting datagram. 5814 */ 5815 static mblk_t * 5816 ibd_m_tx(void *arg, mblk_t *mp) 5817 { 5818 ibd_state_t *state = (ibd_state_t *)arg; 5819 mblk_t *next; 5820 5821 if (state->id_link_state != LINK_STATE_UP) { 5822 freemsgchain(mp); 5823 mp = NULL; 5824 } 5825 5826 while (mp != NULL) { 5827 next = mp->b_next; 5828 mp->b_next = NULL; 5829 if (ibd_send(state, mp) == B_FALSE) { 5830 /* Send fail */ 5831 mp->b_next = next; 5832 break; 5833 } 5834 mp = next; 5835 } 5836 5837 return (mp); 5838 } 5839 5840 /* 5841 * this handles Tx and Rx completions. With separate CQs, this handles 5842 * only Rx completions. 5843 */ 5844 static uint_t 5845 ibd_intr(char *arg) 5846 { 5847 ibd_state_t *state = (ibd_state_t *)arg; 5848 5849 ibd_poll_compq(state, state->id_rcq_hdl); 5850 5851 return (DDI_INTR_CLAIMED); 5852 } 5853 5854 /* 5855 * Poll and drain the cq 5856 */ 5857 static uint_t 5858 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, 5859 uint_t numwcs) 5860 { 5861 ibd_wqe_t *wqe; 5862 ibt_wc_t *wc; 5863 uint_t total_polled = 0; 5864 uint_t num_polled; 5865 int i; 5866 5867 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5868 total_polled += num_polled; 5869 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5870 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5871 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5872 (wqe->w_type == IBD_WQE_RECV)); 5873 if (wc->wc_status != IBT_WC_SUCCESS) { 5874 /* 5875 * Channel being torn down. 5876 */ 5877 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5878 DPRINT(5, "ibd_drain_cq: flush error"); 5879 /* 5880 * Only invoke the Tx handler to 5881 * release possibly held resources 5882 * like AH refcount etc. Can not 5883 * invoke Rx handler because it might 5884 * try adding buffers to the Rx pool 5885 * when we are trying to deinitialize. 5886 */ 5887 if (wqe->w_type == IBD_WQE_RECV) { 5888 continue; 5889 } else { 5890 DPRINT(10, "ibd_drain_cq: Bad " 5891 "status %d", wc->wc_status); 5892 } 5893 } 5894 } 5895 if (wqe->w_type == IBD_WQE_SEND) { 5896 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); 5897 } else { 5898 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5899 } 5900 } 5901 } 5902 5903 return (total_polled); 5904 } 5905 5906 /* 5907 * Common code for interrupt handling as well as for polling 5908 * for all completed wqe's while detaching. 
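 *
 * The id_cq_poll_busy bits below serialize pollers of a given CQ: a
 * caller that finds the 'busy' flag already set just sets the matching
 * 'redo' flag and returns, while the active poller notices the redo
 * bit and drains the CQ once more before finally clearing 'busy'.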
5909 */ 5910 static void 5911 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5912 { 5913 ibt_wc_t *wcs; 5914 uint_t numwcs; 5915 int flag, redo_flag; 5916 int redo = 1; 5917 uint_t num_polled = 0; 5918 5919 if (ibd_separate_cqs == 1) { 5920 if (cq_hdl == state->id_rcq_hdl) { 5921 flag = IBD_RX_CQ_POLLING; 5922 redo_flag = IBD_REDO_RX_CQ_POLLING; 5923 } else { 5924 flag = IBD_TX_CQ_POLLING; 5925 redo_flag = IBD_REDO_TX_CQ_POLLING; 5926 } 5927 } else { 5928 flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING; 5929 redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING; 5930 } 5931 5932 mutex_enter(&state->id_cq_poll_lock); 5933 if (state->id_cq_poll_busy & flag) { 5934 state->id_cq_poll_busy |= redo_flag; 5935 mutex_exit(&state->id_cq_poll_lock); 5936 return; 5937 } 5938 state->id_cq_poll_busy |= flag; 5939 mutex_exit(&state->id_cq_poll_lock); 5940 5941 /* 5942 * In some cases (eg detaching), this code can be invoked on 5943 * any cpu after disabling cq notification (thus no concurrency 5944 * exists). Apart from that, the following applies normally: 5945 * The receive completion handling is always on the Rx interrupt 5946 * cpu. Transmit completion handling could be from any cpu if 5947 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5948 * is interrupt driven. Combined completion handling is always 5949 * on the interrupt cpu. Thus, lock accordingly and use the 5950 * proper completion array. 5951 */ 5952 if (ibd_separate_cqs == 1) { 5953 if (cq_hdl == state->id_rcq_hdl) { 5954 wcs = state->id_rxwcs; 5955 numwcs = state->id_rxwcs_size; 5956 } else { 5957 wcs = state->id_txwcs; 5958 numwcs = state->id_txwcs_size; 5959 } 5960 } else { 5961 wcs = state->id_rxwcs; 5962 numwcs = state->id_rxwcs_size; 5963 } 5964 5965 /* 5966 * Poll and drain the CQ 5967 */ 5968 num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5969 5970 /* 5971 * Enable CQ notifications and redrain the cq to catch any 5972 * completions we might have missed after the ibd_drain_cq() 5973 * above and before the ibt_enable_cq_notify() that follows. 5974 * Finally, service any new requests to poll the cq that 5975 * could've come in after the ibt_enable_cq_notify(). 5976 */ 5977 do { 5978 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 5979 IBT_SUCCESS) { 5980 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5981 } 5982 5983 num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5984 5985 mutex_enter(&state->id_cq_poll_lock); 5986 if (state->id_cq_poll_busy & redo_flag) 5987 state->id_cq_poll_busy &= ~redo_flag; 5988 else { 5989 state->id_cq_poll_busy &= ~flag; 5990 redo = 0; 5991 } 5992 mutex_exit(&state->id_cq_poll_lock); 5993 5994 } while (redo); 5995 5996 /* 5997 * If we polled the receive cq and found anything, we need to flush 5998 * it out to the nw layer here. 5999 */ 6000 if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) { 6001 ibd_flush_rx(state, NULL); 6002 } 6003 } 6004 6005 /* 6006 * Unmap the memory area associated with a given swqe. 
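 * This undoes the ibt_map_mem_iov() mapping set up by ibd_prepare_sgl()
 * for the IBD_WQE_MAPPED case.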
6007 */ 6008 static void 6009 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6010 { 6011 ibt_status_t stat; 6012
6013 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6014
6015 if (swqe->w_mi_hdl) { 6016 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6017 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6018 DPRINT(10, 6019 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6020 } 6021 swqe->w_mi_hdl = NULL; 6022 } 6023 swqe->w_swr.wr_nds = 0; 6024 } 6025
6026 /* 6027 * Common code that deals with cleanups after a successful or 6028 * erroneous transmission attempt. 6029 */ 6030 static void 6031 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6032 { 6033 ibd_ace_t *ace = swqe->w_ahandle; 6034
6035 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6036
6037 /* 6038 * If this was a dynamic mapping in ibd_send(), we need to 6039 * unmap here. If this was an lso buffer we'd used for sending, 6040 * we need to release the lso buf to the pool, since the resource 6041 * is scarce. However, if this was simply a normal send using 6042 * the copybuf (present in each swqe), we don't need to release it. 6043 */ 6044 if (swqe->swqe_im_mblk != NULL) { 6045 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6046 ibd_unmap_mem(state, swqe); 6047 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6048 ibd_release_lsobufs(state, 6049 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6050 } 6051 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6052 freemsg(swqe->swqe_im_mblk); 6053 swqe->swqe_im_mblk = NULL; 6054 } 6055
6056 /* 6057 * Drop the reference count on the AH; it can be reused 6058 * now for a different destination if there are no more 6059 * posted sends that will use it. This can be eliminated 6060 * if we can always associate each Tx buffer with an AH. 6061 * The ace can be null if we are cleaning up from the 6062 * ibd_send() error path. 6063 */ 6064 if (ace != NULL) { 6065 /* 6066 * The recycling logic can be eliminated from here 6067 * and put into the async thread if we create another 6068 * list to hold ACE's for unjoined mcg's. 6069 */ 6070 if (DEC_REF_DO_CYCLE(ace)) { 6071 ibd_mce_t *mce; 6072
6073 /* 6074 * Check with the lock taken: we decremented the 6075 * reference count without the lock, and some 6076 * transmitter might already have bumped the 6077 * reference count (possible in case of multicast 6078 * disable when we leave the AH on the active 6079 * list). If it is not still 0, get out, leaving the 6080 * recycle bit intact. 6081 * 6082 * Atomically transition the AH from active 6083 * to free list, and queue a work request to 6084 * leave the group and destroy the mce. No 6085 * transmitter can be looking at the AH or 6086 * the MCE in between, since we have the 6087 * ac_mutex lock. In the SendOnly reap case, 6088 * it is not necessary to hold the ac_mutex 6089 * and recheck the ref count (since the AH was 6090 * taken off the active list), we just do it 6091 * to have uniform processing with the Full 6092 * reap case. 6093 */ 6094 mutex_enter(&state->id_ac_mutex); 6095 mce = ace->ac_mce; 6096 if (GET_REF_CYCLE(ace) == 0) { 6097 CLEAR_REFCYCLE(ace); 6098 /* 6099 * Identify the case of fullmember reap as 6100 * opposed to mcg trap reap. Also, port up 6101 * might set ac_mce to NULL to indicate Tx 6102 * cleanup should do no more than put the 6103 * AH in the free list (see ibd_async_link). 6104 */ 6105 if (mce != NULL) { 6106 ace->ac_mce = NULL; 6107 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6108 /* 6109 * mc_req was initialized at mce 6110 * creation time.
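 * Queueing it with IBD_ASYNC_REAP asks the
 * async thread to leave the mcg and destroy
 * the mce on our behalf.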
6111 */ 6112 ibd_queue_work_slot(state, 6113 &mce->mc_req, IBD_ASYNC_REAP); 6114 } 6115 IBD_ACACHE_INSERT_FREE(state, ace); 6116 } 6117 mutex_exit(&state->id_ac_mutex); 6118 } 6119 } 6120
6121 /* 6122 * Release the send wqe for reuse. 6123 */ 6124 ibd_release_swqe(state, swqe); 6125 } 6126
6127 /* 6128 * Hand off the processed rx mp chain to mac_rx() 6129 */ 6130 static void 6131 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc) 6132 { 6133 if (mpc == NULL) { 6134 mutex_enter(&state->id_rx_lock); 6135
6136 mpc = state->id_rx_mp; 6137
6138 state->id_rx_mp = NULL; 6139 state->id_rx_mp_tail = NULL; 6140 state->id_rx_mp_len = 0; 6141
6142 mutex_exit(&state->id_rx_lock); 6143 } 6144
6145 if (mpc) { 6146 mac_rx(state->id_mh, state->id_rh, mpc); 6147 } 6148 } 6149
6150 /* 6151 * Processing to be done after receipt of a packet; hand off to GLD 6152 * in the format expected by GLD. The received packet has this 6153 * format: 2b sap :: 00 :: data. 6154 */ 6155 static void 6156 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6157 { 6158 ib_header_info_t *phdr; 6159 mblk_t *mp; 6160 mblk_t *mpc = NULL; 6161 ipoib_hdr_t *ipibp; 6162 ipha_t *iphap; 6163 ip6_t *ip6h; 6164 int rxcnt, len; 6165
6166 /* 6167 * Track number handed to upper layer, and number still 6168 * available to receive packets. 6169 */ 6170 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 6171 ASSERT(rxcnt >= 0); 6172 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 6173
6174 /* 6175 * Adjust write pointer depending on how much data came in. 6176 */ 6177 mp = rwqe->rwqe_im_mblk; 6178 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 6179
6180 /* 6181 * Make sure this is NULL or we're in trouble. 6182 */ 6183 if (mp->b_next != NULL) { 6184 ibd_print_warn(state, 6185 "ibd_process_rx: got duplicate mp from rcq?"); 6186 mp->b_next = NULL; 6187 } 6188
6189 /* 6190 * The IB link will deliver one of the IB link layer 6191 * headers, called the Global Routing Header (GRH). 6192 * The ibd driver uses the information in the GRH to build the 6193 * Header_info structure and passes it with the datagram up 6194 * to GLDv3. 6195 * If the GRH is not valid, indicate this to GLDv3 by setting 6196 * the VerTcFlow field to 0. 6197 */ 6198 phdr = (ib_header_info_t *)mp->b_rptr; 6199 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6200 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6201
6202 /* if it is a loopback packet, just drop it. */ 6203 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6204 IPOIB_ADDRL) == 0) { 6205 freemsg(mp); 6206 return; 6207 } 6208
6209 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6210 sizeof (ipoib_mac_t)); 6211 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6212 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6213 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6214 } else { 6215 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6216 } 6217 } else { 6218 /* 6219 * It cannot be an IBA multicast packet. It must have been 6220 * unicast to us. Just copy the interface address to dst. 6221 */ 6222 phdr->ib_grh.ipoib_vertcflow = 0; 6223 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6224 sizeof (ipoib_mac_t)); 6225 } 6226
6227 /* 6228 * For ND6 packets, padding is at the front of the source/target 6229 * lladdr. However, the inet6 layer is not aware of it, hence remove 6230 * the padding from such packets.
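 * (This mirrors the send side in ibd_send() above, where
 * IBD_PAD_NSNA(ip6h, len, IBD_SEND) inserts the padding; here
 * IBD_PAD_NSNA(ip6h, len, IBD_RECV) strips it.)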
6231 */ 6232 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6233 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 6234 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 6235 if (!pullupmsg(mp, IPV6_HDR_LEN + 6236 sizeof (ipoib_hdr_t))) { 6237 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 6238 freemsg(mp); 6239 return; 6240 } 6241 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + 6242 sizeof (ipoib_pgrh_t)); 6243 } 6244 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6245 len = ntohs(ip6h->ip6_plen); 6246 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6247 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 6248 IPV6_HDR_LEN + len) { 6249 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 6250 IPV6_HDR_LEN + len)) { 6251 DPRINT(10, "ibd_process_rx: pullupmsg" 6252 " failed"); 6253 freemsg(mp); 6254 return; 6255 } 6256 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 6257 sizeof (ipoib_pgrh_t) + 6258 sizeof (ipoib_hdr_t)); 6259 } 6260 /* LINTED: E_CONSTANT_CONDITION */ 6261 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6262 } 6263 } 6264 6265 /* 6266 * Update statistics 6267 */ 6268 atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer); 6269 atomic_inc_64(&state->id_rcv_pkt); 6270 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6271 atomic_inc_64(&state->id_brd_rcv); 6272 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6273 atomic_inc_64(&state->id_multi_rcv); 6274 6275 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6276 /* 6277 * Set receive checksum status in mp 6278 * Hardware checksumming can be considered valid only if: 6279 * 1. CQE.IP_OK bit is set 6280 * 2. CQE.CKSUM = 0xffff 6281 * 3. IPv6 routing header is not present in the packet 6282 * 4. If there are no IP_OPTIONS in the IP HEADER 6283 */ 6284 6285 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6286 (wc->wc_cksum == 0xFFFF) && 6287 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6288 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 6289 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 6290 } 6291 6292 /* 6293 * Add this mp to the list of processed mp's to send to 6294 * the nw layer 6295 */ 6296 mutex_enter(&state->id_rx_lock); 6297 if (state->id_rx_mp) { 6298 ASSERT(state->id_rx_mp_tail != NULL); 6299 state->id_rx_mp_tail->b_next = mp; 6300 } else { 6301 ASSERT(state->id_rx_mp_tail == NULL); 6302 state->id_rx_mp = mp; 6303 } 6304 6305 state->id_rx_mp_tail = mp; 6306 state->id_rx_mp_len++; 6307 6308 if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) { 6309 mpc = state->id_rx_mp; 6310 6311 state->id_rx_mp = NULL; 6312 state->id_rx_mp_tail = NULL; 6313 state->id_rx_mp_len = 0; 6314 } 6315 6316 mutex_exit(&state->id_rx_lock); 6317 6318 if (mpc) { 6319 ibd_flush_rx(state, mpc); 6320 } 6321 } 6322 6323 /* 6324 * Callback code invoked from STREAMs when the receive data buffer is 6325 * free for recycling. 6326 */ 6327 static void 6328 ibd_freemsg_cb(char *arg) 6329 { 6330 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6331 ibd_state_t *state = rwqe->w_state; 6332 6333 /* 6334 * If the wqe is being destructed, do not attempt recycling. 6335 */ 6336 if (rwqe->w_freeing_wqe == B_TRUE) { 6337 DPRINT(6, "ibd_freemsg: wqe being freed"); 6338 return; 6339 } else { 6340 /* 6341 * Upper layer has released held mblk, so we have 6342 * no more use for keeping the old pointer in 6343 * our rwqe. 
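 * Below, the same copy buffer is wrapped in a fresh mblk via
 * desballoc() and the rwqe is reposted; if either step fails, the
 * rwqe is deleted and freed instead of being recycled.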
6344 */ 6345 rwqe->rwqe_im_mblk = NULL; 6346 } 6347 6348 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6349 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6350 if (rwqe->rwqe_im_mblk == NULL) { 6351 ibd_delete_rwqe(state, rwqe); 6352 ibd_free_rwqe(state, rwqe); 6353 DPRINT(6, "ibd_freemsg: desballoc failed"); 6354 return; 6355 } 6356 6357 if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) { 6358 ibd_delete_rwqe(state, rwqe); 6359 ibd_free_rwqe(state, rwqe); 6360 return; 6361 } 6362 6363 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 6364 } 6365 6366 static uint_t 6367 ibd_tx_recycle(char *arg) 6368 { 6369 ibd_state_t *state = (ibd_state_t *)arg; 6370 6371 /* 6372 * Poll for completed entries 6373 */ 6374 ibd_poll_compq(state, state->id_scq_hdl); 6375 6376 /* 6377 * Resume any blocked transmissions if possible 6378 */ 6379 (void) ibd_resume_transmission(state); 6380 6381 return (DDI_INTR_CLAIMED); 6382 } 6383 6384 #ifdef IBD_LOGGING 6385 static void 6386 ibd_log_init(void) 6387 { 6388 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 6389 ibd_lbuf_ndx = 0; 6390 6391 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 6392 } 6393 6394 static void 6395 ibd_log_fini(void) 6396 { 6397 if (ibd_lbuf) 6398 kmem_free(ibd_lbuf, IBD_LOG_SZ); 6399 ibd_lbuf_ndx = 0; 6400 ibd_lbuf = NULL; 6401 6402 mutex_destroy(&ibd_lbuf_lock); 6403 } 6404 6405 static void 6406 ibd_log(const char *fmt, ...) 6407 { 6408 va_list ap; 6409 uint32_t off; 6410 uint32_t msglen; 6411 char tmpbuf[IBD_DMAX_LINE]; 6412 6413 if (ibd_lbuf == NULL) 6414 return; 6415 6416 va_start(ap, fmt); 6417 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 6418 va_end(ap); 6419 6420 if (msglen >= IBD_DMAX_LINE) 6421 msglen = IBD_DMAX_LINE - 1; 6422 6423 mutex_enter(&ibd_lbuf_lock); 6424 6425 off = ibd_lbuf_ndx; /* current msg should go here */ 6426 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6427 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6428 6429 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6430 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6431 6432 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6433 ibd_lbuf_ndx = 0; 6434 6435 mutex_exit(&ibd_lbuf_lock); 6436 6437 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6438 } 6439 #endif 6440