xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision b88604e3d674a4c82f27e5c41d05b8774a7547e4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip6.h>		/* for ip6_t */
54 #include <inet/tcp.h>		/* for tcph_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/multidata.h>
63 
64 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
65 
66 /*
67  * Per-interface tunables (for developers)
68  *
69  * ibd_tx_copy_thresh
70  *     This sets the threshold at which ibd will attempt to do a bcopy of the
71  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
72  *     is restricted by various parameters, so setting of this value must be
73  *     made after careful considerations only.  For instance, IB HCAs currently
74  *     impose a relatively small limit (when compared to ethernet NICs) on the
75  *     length of the SGL for transmit. On the other hand, the ip stack could
76  *     send down mp chains that are quite long when LSO is enabled.
77  *
78  * ibd_num_swqe
79  *     Number of "send WQE" elements that will be allocated and used by ibd.
80  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
81  *     buffer in each of these send wqes must be taken into account. This
82  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
83  *     currently set to the same value of ibd_tx_copy_thresh, but may be
84  *     changed independently if needed).
85  *
86  * ibd_num_rwqe
87  *     Number of "receive WQE" elements that will be allocated and used by
88  *     ibd. This parameter is limited by the maximum channel size of the HCA.
89  *     Each buffer in the receive wqe will be of MTU size.
90  *
91  * ibd_num_lso_bufs
92  *     Number of "larger-than-MTU" copy buffers to use for cases when the
93  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
94  *     and too large to be used with regular MTU-sized copy buffers. It is
95  *     not recommended to tune this variable without understanding the
96  *     application environment and/or memory resources. The size of each of
97  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
98  *
99  * ibd_num_ah
100  *     Number of AH cache entries to allocate
101  *
102  * ibd_hash_size
103  *     Hash table size for the active AH list
104  *
105  * ibd_tx_softintr
106  * ibd_rx_softintr
107  *     The softintr mechanism allows ibd to avoid event queue overflows if
108  *     the receive/completion handlers are to be expensive. These are enabled
109  *     by default.
110  *
111  * ibd_log_sz
112  *     This specifies the size of the ibd log buffer in bytes. The buffer is
113  *     allocated and logging is enabled only when IBD_LOGGING is defined.
114  *
115  */
116 uint_t ibd_tx_copy_thresh = 0x1000;
117 uint_t ibd_num_swqe = 4000;
118 uint_t ibd_num_rwqe = 4000;
119 uint_t ibd_num_lso_bufs = 0x400;
120 uint_t ibd_num_ah = 256;
121 uint_t ibd_hash_size = 32;
122 uint_t ibd_rx_softintr = 1;
123 uint_t ibd_tx_softintr = 1;
124 uint_t ibd_create_broadcast_group = 1;
125 #ifdef IBD_LOGGING
126 uint_t ibd_log_sz = 0x20000;
127 #endif
128 
129 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
130 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
131 #define	IBD_NUM_SWQE			ibd_num_swqe
132 #define	IBD_NUM_RWQE			ibd_num_rwqe
133 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
134 #define	IBD_NUM_AH			ibd_num_ah
135 #define	IBD_HASH_SIZE			ibd_hash_size
136 #ifdef IBD_LOGGING
137 #define	IBD_LOG_SZ			ibd_log_sz
138 #endif
139 
140 /*
141  * Receive CQ moderation parameters: tunable (for developers)
142  */
143 uint_t ibd_rxcomp_count = 4;
144 uint_t ibd_rxcomp_usec = 10;
145 
146 /*
147  * Send CQ moderation parameters: tunable (for developers)
148  */
149 uint_t ibd_txcomp_count = 16;
150 uint_t ibd_txcomp_usec = 300;
151 
152 /*
153  * Thresholds
154  *
155  * When waiting for resources (swqes or lso buffers) to become available,
156  * the first two thresholds below determine how long to wait before informing
157  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
158  * determines how low the available swqes should go before we start polling
159  * the completion queue.
160  */
161 #define	IBD_FREE_LSOS_THRESH		8
162 #define	IBD_FREE_SWQES_THRESH		20
163 #define	IBD_TX_POLL_THRESH		80
164 
165 /*
166  * When doing multiple-send-wr, this value determines how many to do at
167  * a time (in a single ibt_post_send).
168  */
169 #define	IBD_MAX_TX_POST_MULTIPLE	4
170 
171 /* Post IBD_RX_POST_CNT receive work requests at a time. */
172 #define	IBD_RX_POST_CNT			8
173 
174 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
175 #define	IBD_LOG_RX_POST			4
176 
177 /* Minimum number of receive work requests driver needs to always have */
178 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
179 
180 /*
181  * Maximum length for returning chained mps back to crossbow.
182  * Also used as the maximum number of rx wc's polled at a time.
183  */
184 #define	IBD_MAX_RX_MP_LEN		16
185 
186 /*
187  * LSO parameters
188  */
189 #define	IBD_LSO_MAXLEN			65536
190 #define	IBD_LSO_BUFSZ			8192
191 #define	IBD_PROP_LSO_POLICY		"lso-policy"
192 
193 /*
194  * Completion queue polling control
195  */
196 #define	IBD_CQ_POLLING			0x1
197 #define	IBD_REDO_CQ_POLLING		0x2
198 
199 /*
200  * Flag bits for resources to reap
201  */
202 #define	IBD_RSRC_SWQE			0x1
203 #define	IBD_RSRC_LSOBUF			0x2
204 
205 /*
206  * Async operation types
207  */
208 #define	IBD_ASYNC_GETAH			1
209 #define	IBD_ASYNC_JOIN			2
210 #define	IBD_ASYNC_LEAVE			3
211 #define	IBD_ASYNC_PROMON		4
212 #define	IBD_ASYNC_PROMOFF		5
213 #define	IBD_ASYNC_REAP			6
214 #define	IBD_ASYNC_TRAP			7
215 #define	IBD_ASYNC_SCHED			8
216 #define	IBD_ASYNC_LINK			9
217 #define	IBD_ASYNC_EXIT			10
218 
219 /*
220  * Async operation states
221  */
222 #define	IBD_OP_NOTSTARTED		0
223 #define	IBD_OP_ONGOING			1
224 #define	IBD_OP_COMPLETED		2
225 #define	IBD_OP_ERRORED			3
226 #define	IBD_OP_ROUTERED			4
227 
228 /*
229  * State of IBD driver initialization during attach/m_start
230  */
231 #define	IBD_DRV_STATE_INITIALIZED	0x00001
232 #define	IBD_DRV_RXINTR_ADDED		0x00002
233 #define	IBD_DRV_TXINTR_ADDED		0x00004
234 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
235 #define	IBD_DRV_HCA_OPENED		0x00010
236 #define	IBD_DRV_PD_ALLOCD		0x00020
237 #define	IBD_DRV_MAC_REGISTERED		0x00040
238 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
239 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
240 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
241 #define	IBD_DRV_CQS_ALLOCD		0x00400
242 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
243 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
244 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
245 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
246 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
247 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
248 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
249 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
250 #define	IBD_DRV_STARTED			0x80000
251 
252 /*
253  * Start/stop in-progress flags; note that restart must always remain
254  * the OR of start and stop flag values.
255  */
256 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
257 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
258 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
259 
260 /*
261  * Miscellaneous constants
262  */
263 #define	IBD_SEND			0
264 #define	IBD_RECV			1
265 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
266 #define	IBD_DEF_MAX_SDU			2044
267 #define	IBD_DEFAULT_QKEY		0xB1B
268 #ifdef IBD_LOGGING
269 #define	IBD_DMAX_LINE			100
270 #endif
271 
272 /*
273  * Enumerations for link states
274  */
275 typedef enum {
276 	IBD_LINK_DOWN,
277 	IBD_LINK_UP,
278 	IBD_LINK_UP_ABSENT
279 } ibd_link_op_t;
280 
281 /*
282  * Driver State Pointer
283  */
284 void *ibd_list;
285 
286 /*
287  * Logging
288  */
289 #ifdef IBD_LOGGING
290 kmutex_t ibd_lbuf_lock;
291 uint8_t *ibd_lbuf;
292 uint32_t ibd_lbuf_ndx;
293 #endif
294 
295 /*
296  * Required system entry points
297  */
298 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
299 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
300 
301 /*
302  * Required driver entry points for GLDv3
303  */
304 static int ibd_m_stat(void *, uint_t, uint64_t *);
305 static int ibd_m_start(void *);
306 static void ibd_m_stop(void *);
307 static int ibd_m_promisc(void *, boolean_t);
308 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
309 static int ibd_m_unicst(void *, const uint8_t *);
310 static mblk_t *ibd_m_tx(void *, mblk_t *);
311 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
312 
313 /*
314  * Private driver entry points for GLDv3
315  */
316 
317 /*
318  * Initialization
319  */
320 static int ibd_state_init(ibd_state_t *, dev_info_t *);
321 static int ibd_init_txlist(ibd_state_t *);
322 static int ibd_init_rxlist(ibd_state_t *);
323 static int ibd_acache_init(ibd_state_t *);
324 #ifdef IBD_LOGGING
325 static void ibd_log_init(void);
326 #endif
327 
328 /*
329  * Termination/cleanup
330  */
331 static void ibd_state_fini(ibd_state_t *);
332 static void ibd_fini_txlist(ibd_state_t *);
333 static void ibd_fini_rxlist(ibd_state_t *);
334 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
335 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
336 static void ibd_acache_fini(ibd_state_t *);
337 #ifdef IBD_LOGGING
338 static void ibd_log_fini(void);
339 #endif
340 
341 /*
342  * Allocation/acquire/map routines
343  */
344 static int ibd_alloc_tx_copybufs(ibd_state_t *);
345 static int ibd_alloc_rx_copybufs(ibd_state_t *);
346 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
347 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
348 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
349     uint32_t *);
350 
351 /*
352  * Free/release/unmap routines
353  */
354 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
355 static void ibd_free_tx_copybufs(ibd_state_t *);
356 static void ibd_free_rx_copybufs(ibd_state_t *);
357 static void ibd_free_rx_rsrcs(ibd_state_t *);
358 static void ibd_free_tx_lsobufs(ibd_state_t *);
359 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
360 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
361 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
362 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
363 
364 /*
365  * Handlers/callback routines
366  */
367 static uint_t ibd_intr(caddr_t);
368 static uint_t ibd_tx_recycle(caddr_t);
369 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
370 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
371 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
372 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
373 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
374 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
375 static void ibd_freemsg_cb(char *);
376 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
377     ibt_async_event_t *);
378 static void ibd_snet_notices_handler(void *, ib_gid_t,
379     ibt_subnet_event_code_t, ibt_subnet_event_t *);
380 
381 /*
382  * Send/receive routines
383  */
384 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
385 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
386 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
387 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
388 
389 /*
390  * Threads
391  */
392 static void ibd_async_work(ibd_state_t *);
393 
394 /*
395  * Async tasks
396  */
397 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
398 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
399 static void ibd_async_setprom(ibd_state_t *);
400 static void ibd_async_unsetprom(ibd_state_t *);
401 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
402 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
403 static void ibd_async_txsched(ibd_state_t *);
404 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
405 
406 /*
407  * Async task helpers
408  */
409 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
410 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
411 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
412 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
413     ipoib_mac_t *, ipoib_mac_t *);
414 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
415 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
416 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
417 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
418 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
419 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
420 static uint64_t ibd_get_portspeed(ibd_state_t *);
421 static boolean_t ibd_async_safe(ibd_state_t *);
422 static void ibd_async_done(ibd_state_t *);
423 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
424 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
425 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
426 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
427 static void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
428 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
429 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
430 
431 /*
432  * Helpers for attach/start routines
433  */
434 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
435 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
436 static int ibd_unattach(ibd_state_t *, dev_info_t *);
437 static int ibd_get_port_details(ibd_state_t *);
438 static int ibd_alloc_cqs(ibd_state_t *);
439 static int ibd_setup_ud_channel(ibd_state_t *);
440 static int ibd_start(ibd_state_t *);
441 static int ibd_undo_start(ibd_state_t *, link_state_t);
442 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
443 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
444 
445 
446 /*
447  * Miscellaneous helpers
448  */
449 static int ibd_sched_poll(ibd_state_t *, int, int);
450 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
451 static void ibd_resume_transmission(ibd_state_t *);
452 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
453 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
454 static void *list_get_head(list_t *);
455 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
456 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
457 static void ibd_print_warn(ibd_state_t *, char *, ...);
458 #ifdef IBD_LOGGING
459 static void ibd_log(const char *, ...);
460 #endif
461 
462 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
463     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
464 
465 /* Module Driver Info */
466 static struct modldrv ibd_modldrv = {
467 	&mod_driverops,			/* This one is a driver */
468 	"InfiniBand GLDv3 Driver",	/* short description */
469 	&ibd_dev_ops			/* driver specific ops */
470 };
471 
472 /* Module Linkage */
473 static struct modlinkage ibd_modlinkage = {
474 	MODREV_1, (void *)&ibd_modldrv, NULL
475 };
476 
477 /*
478  * Module (static) info passed to IBTL during ibt_attach
479  */
480 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
481 	IBTI_V_CURR,
482 	IBT_NETWORK,
483 	ibd_async_handler,
484 	NULL,
485 	"IPIB"
486 };
487 
488 /*
489  * GLDv3 entry points
490  */
491 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
492 static mac_callbacks_t ibd_m_callbacks = {
493 	IBD_M_CALLBACK_FLAGS,
494 	ibd_m_stat,
495 	ibd_m_start,
496 	ibd_m_stop,
497 	ibd_m_promisc,
498 	ibd_m_multicst,
499 	ibd_m_unicst,
500 	ibd_m_tx,
501 	NULL,
502 	ibd_m_getcapab
503 };
504 
505 /*
506  * Fill/clear <scope> and <p_key> in multicast/broadcast address
507  */
508 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
509 {							\
510 	*(uint32_t *)((char *)(maddr) + 4) |=		\
511 	    htonl((uint32_t)(scope) << 16);		\
512 	*(uint32_t *)((char *)(maddr) + 8) |=		\
513 	    htonl((uint32_t)(pkey) << 16);		\
514 }
515 
516 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
517 {							\
518 	*(uint32_t *)((char *)(maddr) + 4) &=		\
519 	    htonl(~((uint32_t)0xF << 16));		\
520 	*(uint32_t *)((char *)(maddr) + 8) &=		\
521 	    htonl(~((uint32_t)0xFFFF << 16));		\
522 }
523 
524 /*
525  * Rudimentary debugging support
526  */
527 #ifdef DEBUG
528 int ibd_debuglevel = 100;
529 static void
530 debug_print(int l, char *fmt, ...)
531 {
532 	va_list ap;
533 
534 	if (l < ibd_debuglevel)
535 		return;
536 	va_start(ap, fmt);
537 	vcmn_err(CE_CONT, fmt, ap);
538 	va_end(ap);
539 }
540 #define	DPRINT		debug_print
541 #else
542 #define	DPRINT		0 &&
543 #endif
544 
545 /*
546  * Common routine to print warning messages; adds in hca guid, port number
547  * and pkey to be able to identify the IBA interface.
548  */
549 static void
550 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
551 {
552 	ib_guid_t hca_guid;
553 	char ibd_print_buf[256];
554 	int len;
555 	va_list ap;
556 
557 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
558 	    0, "hca-guid", 0);
559 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
560 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
561 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
562 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
563 	va_start(ap, fmt);
564 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
565 	    fmt, ap);
566 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
567 	va_end(ap);
568 }
569 
570 /*
571  * Warlock directives
572  */
573 
574 /*
575  * id_lso_lock
576  *
577  * state->id_lso->bkt_nfree may be accessed without a lock to
578  * determine the threshold at which we have to ask the nw layer
579  * to resume transmission (see ibd_resume_transmission()).
580  */
581 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
582     ibd_state_t::id_lso))
583 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
584 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
585 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
586 
587 /*
588  * id_scq_poll_lock
589  */
590 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
591     ibd_state_t::id_scq_poll_busy))
592 
593 /*
594  * id_txpost_lock
595  */
596 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
597     ibd_state_t::id_tx_head))
598 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
599     ibd_state_t::id_tx_busy))
600 
601 /*
602  * id_acache_req_lock
603  */
604 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
605     ibd_state_t::id_acache_req_cv))
606 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
607     ibd_state_t::id_req_list))
608 _NOTE(SCHEME_PROTECTS_DATA("atomic",
609     ibd_acache_s::ac_ref))
610 
611 /*
612  * id_ac_mutex
613  *
614  * This mutex is actually supposed to protect id_ah_op as well,
615  * but this path of the code isn't clean (see update of id_ah_op
616  * in ibd_async_acache(), immediately after the call to
617  * ibd_async_mcache()). For now, we'll skip this check by
618  * declaring that id_ah_op is protected by some internal scheme
619  * that warlock isn't aware of.
620  */
621 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
622     ibd_state_t::id_ah_active))
623 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
624     ibd_state_t::id_ah_free))
625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
626     ibd_state_t::id_ah_addr))
627 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
628     ibd_state_t::id_ah_op))
629 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
630     ibd_state_t::id_ah_error))
631 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
632     ibd_state_t::id_ac_hot_ace))
633 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
634 
635 /*
636  * id_mc_mutex
637  */
638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
639     ibd_state_t::id_mc_full))
640 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
641     ibd_state_t::id_mc_non))
642 
643 /*
644  * id_trap_lock
645  */
646 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
647     ibd_state_t::id_trap_cv))
648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
649     ibd_state_t::id_trap_stop))
650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
651     ibd_state_t::id_trap_inprog))
652 
653 /*
654  * id_prom_op
655  */
656 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
657     ibd_state_t::id_prom_op))
658 
659 /*
660  * id_sched_lock
661  */
662 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
663     ibd_state_t::id_sched_needed))
664 
665 /*
666  * id_link_mutex
667  */
668 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
669     ibd_state_t::id_link_state))
670 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
671 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
672     ibd_state_t::id_link_speed))
673 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
674 
675 /*
676  * id_tx_list.dl_mutex
677  */
678 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
679     ibd_state_t::id_tx_list.dl_head))
680 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
681     ibd_state_t::id_tx_list.dl_pending_sends))
682 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
683     ibd_state_t::id_tx_list.dl_cnt))
684 
685 /*
686  * id_rx_list.dl_mutex
687  */
688 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
689     ibd_state_t::id_rx_list.dl_bufs_outstanding))
690 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
691     ibd_state_t::id_rx_list.dl_cnt))
692 
693 
694 /*
695  * Items protected by atomic updates
696  */
697 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
698     ibd_state_s::id_brd_rcv
699     ibd_state_s::id_brd_xmt
700     ibd_state_s::id_multi_rcv
701     ibd_state_s::id_multi_xmt
702     ibd_state_s::id_num_intrs
703     ibd_state_s::id_rcv_bytes
704     ibd_state_s::id_rcv_pkt
705     ibd_state_s::id_rx_post_queue_index
706     ibd_state_s::id_tx_short
707     ibd_state_s::id_xmt_bytes
708     ibd_state_s::id_xmt_pkt))
709 
710 /*
711  * Non-mutex protection schemes for data elements. Almost all of
712  * these are non-shared items.
713  */
714 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
715     callb_cpr
716     ib_gid_s
717     ib_header_info
718     ibd_acache_rq
719     ibd_acache_s::ac_mce
720     ibd_mcache::mc_fullreap
721     ibd_mcache::mc_jstate
722     ibd_mcache::mc_req
723     ibd_rwqe_s
724     ibd_swqe_s
725     ibd_wqe_s
726     ibt_wr_ds_s::ds_va
727     ibt_wr_lso_s
728     ipoib_mac::ipoib_qpn
729     mac_capab_lso_s
730     msgb::b_next
731     msgb::b_rptr
732     msgb::b_wptr
733     ibd_state_s::id_bgroup_created
734     ibd_state_s::id_mac_state
735     ibd_state_s::id_mtu
736     ibd_state_s::id_num_rwqe
737     ibd_state_s::id_num_swqe
738     ibd_state_s::id_qpnum
739     ibd_state_s::id_rcq_hdl
740     ibd_state_s::id_rx_buf_sz
741     ibd_state_s::id_rx_bufs
742     ibd_state_s::id_rx_mr_hdl
743     ibd_state_s::id_rx_wqes
744     ibd_state_s::id_rxwcs
745     ibd_state_s::id_rxwcs_size
746     ibd_state_s::id_rx_nqueues
747     ibd_state_s::id_rx_queues
748     ibd_state_s::id_scope
749     ibd_state_s::id_scq_hdl
750     ibd_state_s::id_tx_buf_sz
751     ibd_state_s::id_tx_bufs
752     ibd_state_s::id_tx_mr_hdl
753     ibd_state_s::id_tx_rel_list.dl_cnt
754     ibd_state_s::id_tx_wqes
755     ibd_state_s::id_txwcs
756     ibd_state_s::id_txwcs_size))
757 
758 int
759 _init()
760 {
761 	int status;
762 
763 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
764 	    PAGESIZE), 0);
765 	if (status != 0) {
766 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
767 		return (status);
768 	}
769 
770 	mac_init_ops(&ibd_dev_ops, "ibd");
771 	status = mod_install(&ibd_modlinkage);
772 	if (status != 0) {
773 		DPRINT(10, "_init:failed in mod_install()");
774 		ddi_soft_state_fini(&ibd_list);
775 		mac_fini_ops(&ibd_dev_ops);
776 		return (status);
777 	}
778 
779 #ifdef IBD_LOGGING
780 	ibd_log_init();
781 #endif
782 	return (0);
783 }
784 
785 int
786 _info(struct modinfo *modinfop)
787 {
788 	return (mod_info(&ibd_modlinkage, modinfop));
789 }
790 
791 int
792 _fini()
793 {
794 	int status;
795 
796 	status = mod_remove(&ibd_modlinkage);
797 	if (status != 0)
798 		return (status);
799 
800 	mac_fini_ops(&ibd_dev_ops);
801 	ddi_soft_state_fini(&ibd_list);
802 #ifdef IBD_LOGGING
803 	ibd_log_fini();
804 #endif
805 	return (0);
806 }
807 
808 /*
809  * Convert the GID part of the mac address from network byte order
810  * to host order.
811  */
812 static void
813 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
814 {
815 	ib_sn_prefix_t nbopref;
816 	ib_guid_t nboguid;
817 
818 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
819 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
820 	dgid->gid_prefix = b2h64(nbopref);
821 	dgid->gid_guid = b2h64(nboguid);
822 }
823 
824 /*
825  * Create the IPoIB address in network byte order from host order inputs.
826  */
827 static void
828 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
829     ib_guid_t guid)
830 {
831 	ib_sn_prefix_t nbopref;
832 	ib_guid_t nboguid;
833 
834 	mac->ipoib_qpn = htonl(qpn);
835 	nbopref = h2b64(prefix);
836 	nboguid = h2b64(guid);
837 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
838 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
839 }
840 
841 /*
842  * Send to the appropriate all-routers group when the IBA multicast group
843  * does not exist, based on whether the target group is v4 or v6.
844  */
845 static boolean_t
846 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
847     ipoib_mac_t *rmac)
848 {
849 	boolean_t retval = B_TRUE;
850 	uint32_t adjscope = state->id_scope << 16;
851 	uint32_t topword;
852 
853 	/*
854 	 * Copy the first 4 bytes in without assuming any alignment of
855 	 * input mac address; this will have IPoIB signature, flags and
856 	 * scope bits.
857 	 */
858 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
859 	topword = ntohl(topword);
860 
861 	/*
862 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
863 	 */
864 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
865 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
866 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
867 		    ((uint32_t)(state->id_pkey << 16))),
868 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
869 	else
870 		/*
871 		 * Does not have proper bits in the mgid address.
872 		 */
873 		retval = B_FALSE;
874 
875 	return (retval);
876 }
877 
878 /*
879  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
880  * front of optional src/tgt link layer address. Right now Solaris inserts
881  * padding by default at the end. The routine which is doing is nce_xmit()
882  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
883  * the packet comes down from IP layer to the IBD driver, it is in the
884  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
885  * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
886  * machdr is not 4 byte aligned and had 2 bytes of padding at the end.
887  *
888  * The send routine at IBD driver changes this packet as follows:
889  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
890  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
891  * aligned.
892  *
893  * At the receiving side again ibd_process_rx takes the above packet and
894  * removes the two bytes of front padding and inserts it at the end. This
895  * is since the IP layer does not understand padding at the front.
896  */
897 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
898 	uchar_t 	*nd_lla_ptr;					\
899 	icmp6_t 	*icmp6;						\
900 	nd_opt_hdr_t	*opt;						\
901 	int 		i;						\
902 									\
903 	icmp6 = (icmp6_t *)&ip6h[1];					\
904 	len -= sizeof (nd_neighbor_advert_t);				\
905 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
906 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
907 	    (len != 0)) {						\
908 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
909 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
910 		ASSERT(opt != NULL);					\
911 		nd_lla_ptr = (uchar_t *)&opt[1];			\
912 		if (type == IBD_SEND) {					\
913 			for (i = IPOIB_ADDRL; i > 0; i--)		\
914 				*(nd_lla_ptr + i + 1) =			\
915 				    *(nd_lla_ptr + i - 1);		\
916 		} else {						\
917 			for (i = 0; i < IPOIB_ADDRL; i++)		\
918 				*(nd_lla_ptr + i) =			\
919 				    *(nd_lla_ptr + i + 2);		\
920 		}							\
921 		*(nd_lla_ptr + i) = 0;					\
922 		*(nd_lla_ptr + i + 1) = 0;				\
923 	}								\
924 }
925 
926 /*
927  * Address handle entries maintained by the driver are kept in the
928  * free and active lists. Each entry starts out in the free list;
929  * it migrates to the active list when primed using ibt_get_paths()
930  * and ibt_modify_ud_dest() for transmission to a specific destination.
931  * In the active list, the entry has a reference count indicating the
932  * number of ongoing/uncompleted transmits that reference it. The
933  * entry is left in the active list even after the reference count
934  * goes to 0, since successive transmits can find it there and do
935  * not need to set up another entry (ie the path information is
936  * cached using the active list). Entries on the active list are
937  * also hashed using the destination link address as a key for faster
938  * lookups during transmits.
939  *
940  * For any destination address (unicast or multicast, whatever the
941  * join states), there will be at most one entry in the active list.
942  * Entries with a 0 reference count on the active list can be reused
943  * for a transmit to a new destination, if the free list is empty.
944  *
945  * The AH free list insertion/deletion is protected with the id_ac_mutex,
946  * since the async thread and Tx callback handlers insert/delete. The
947  * active list does not need a lock (all operations are done by the
948  * async thread) but updates to the reference count are atomically
949  * done (increments done by Tx path, decrements by the Tx callback handler).
950  */
951 #define	IBD_ACACHE_INSERT_FREE(state, ce) \
952 	list_insert_head(&state->id_ah_free, ce)
953 #define	IBD_ACACHE_GET_FREE(state) \
954 	list_get_head(&state->id_ah_free)
955 #define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
956 	int _ret_;						\
957 	list_insert_head(&state->id_ah_active, ce);		\
958 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
959 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
960 	ASSERT(_ret_ == 0);					\
961 	state->id_ac_hot_ace = ce;				\
962 }
963 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
964 	list_remove(&state->id_ah_active, ce);			\
965 	if (state->id_ac_hot_ace == ce)				\
966 		state->id_ac_hot_ace = NULL;			\
967 	(void) mod_hash_remove(state->id_ah_active_hash,	\
968 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
969 }
970 #define	IBD_ACACHE_GET_ACTIVE(state) \
971 	list_get_head(&state->id_ah_active)
972 
973 /*
974  * Membership states for different mcg's are tracked by two lists:
975  * the "non" list is used for promiscuous mode, when all mcg traffic
976  * needs to be inspected. This type of membership is never used for
977  * transmission, so there can not be an AH in the active list
978  * corresponding to a member in this list. This list does not need
979  * any protection, since all operations are performed by the async
980  * thread.
981  *
982  * "Full" and "SendOnly" membership is tracked using a single list,
983  * the "full" list. This is because this single list can then be
984  * searched during transmit to a multicast group (if an AH for the
985  * mcg is not found in the active list), since at least one type
986  * of membership must be present before initiating the transmit.
987  * This list is also emptied during driver detach, since sendonly
988  * membership acquired during transmit is dropped at detach time
989  * along with ipv4 broadcast full membership. Insert/deletes to
990  * this list are done only by the async thread, but it is also
991  * searched in program context (see multicast disable case), thus
992  * the id_mc_mutex protects the list. The driver detach path also
993  * deconstructs the "full" list, but it ensures that the async
994  * thread will not be accessing the list (by blocking out mcg
995  * trap handling and making sure no more Tx reaping will happen).
996  *
997  * Currently, an IBA attach is done in the SendOnly case too,
998  * although this is not required.
999  */
1000 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1001 	list_insert_head(&state->id_mc_full, mce)
1002 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1003 	list_insert_head(&state->id_mc_non, mce)
1004 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1005 	ibd_mcache_find(mgid, &state->id_mc_full)
1006 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1007 	ibd_mcache_find(mgid, &state->id_mc_non)
1008 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1009 	list_remove(&state->id_mc_full, mce)
1010 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1011 	list_remove(&state->id_mc_non, mce)
1012 
1013 /*
1014  * AH and MCE active list manipulation:
1015  *
1016  * Multicast disable requests and MCG delete traps are two cases
1017  * where the active AH entry for the mcg (if any unreferenced one exists)
1018  * will be moved to the free list (to force the next Tx to the mcg to
1019  * join the MCG in SendOnly mode). Port up handling will also move AHs
1020  * from active to free list.
1021  *
1022  * In the case when some transmits are still pending on an entry
1023  * for an mcg, but a multicast disable has already been issued on the
1024  * mcg, there are some options to consider to preserve the join state
1025  * to ensure the emitted packet is properly routed on the IBA fabric.
1026  * For the AH, we can
1027  * 1. take out of active list at multicast disable time.
1028  * 2. take out of active list only when last pending Tx completes.
1029  * For the MCE, we can
1030  * 3. take out of active list at multicast disable time.
1031  * 4. take out of active list only when last pending Tx completes.
1032  * 5. move from active list to stale list at multicast disable time.
1033  * We choose to use 2,4. We use option 4 so that if a multicast enable
1034  * is tried before the pending Tx completes, the enable code finds the
1035  * mce in the active list and just has to make sure it will not be reaped
1036  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1037  * a stale list (#5) that would be checked in the enable code would need
1038  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1039  * after the multicast disable would try to put an AH in the active list,
1040  * and associate the mce it finds in the active list to this new AH,
1041  * whereas the mce is already associated with the previous AH (taken off
1042  * the active list), and will be removed once the pending Tx's complete
1043  * (unless a reference count on mce's is implemented). One implication of
1044  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1045  * grab new references on the AH, further delaying the leave.
1046  *
1047  * In the case of mcg delete (or create) trap when the port is sendonly
1048  * joined, the AH and MCE handling is different: the AH and MCE has to be
1049  * immediately taken off the active lists (forcing a join and path lookup
1050  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1051  * to an mcg as it is repeatedly created and deleted and goes thru
1052  * reincarnations).
1053  *
1054  * When a port is already sendonly joined, and a multicast enable is
1055  * attempted, the same mce structure is promoted; this ensures only a
1056  * single mce on the active list tracks the most powerful join state.
1057  *
1058  * In the case of port up event handling, the MCE for sendonly membership
1059  * is freed up, and the ACE is put into the free list as soon as possible
1060  * (depending on whether posted Tx's have completed). For fullmembership
1061  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1062  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1063  * done; else the mce is deconstructed (mc_fullreap case).
1064  *
1065  * MCG creation and deletion trap handling:
1066  *
1067  * These traps are unreliable (meaning sometimes the trap might never
1068  * be delivered to the subscribed nodes) and may arrive out-of-order
1069  * since they use UD transport. An alternative to relying on these
1070  * unreliable traps is to poll for mcg presence every so often, but
1071  * instead of doing that, we try to be as conservative as possible
1072  * while handling the traps, and hope that the traps do arrive at
1073  * the subscribed nodes soon. Note that if a node is fullmember
1074  * joined to an mcg, it can not possibly receive a mcg create/delete
1075  * trap for that mcg (by fullmember definition); if it does, it is
1076  * an old trap from a previous incarnation of the mcg.
1077  *
1078  * Whenever a trap is received, the driver cleans up its sendonly
1079  * membership to the group; we choose to do a sendonly leave even
1080  * on a creation trap to handle the case of a prior deletion of the mcg
1081  * having gone unnoticed. Consider an example scenario:
1082  * T1: MCG M is deleted, and fires off deletion trap D1.
1083  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1084  * T3: Node N tries to transmit to M, joining in sendonly mode.
1085  * T4: MCG M is deleted, and fires off deletion trap D2.
1086  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1087  *     If the trap is D2, then a LEAVE is not required, since the mcg
1088  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1089  *     approach is to always LEAVE, but the SM may be confused if it
1090  *     receives a LEAVE without a prior JOIN.
1091  *
1092  * Management of the non-membership to an mcg is similar to the above,
1093  * except that if the interface is in promiscuous mode, it is required
1094  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1095  * if the re-join attempt fails (in which case a warning message needs
1096  * to be printed), it is not clear whether it failed due to the mcg not
1097  * existing, or some fabric/hca issues, due to the delayed nature of
1098  * trap delivery. Querying the SA to establish presence/absence of the
1099  * mcg is also racy at best. Thus, the driver just prints a warning
1100  * message when it can not rejoin after receiving a create trap, although
1101  * this might be (on rare occasions) a mis-warning if the create trap is
1102  * received after the mcg was deleted.
1103  */
1104 
1105 /*
1106  * Implementation of atomic "recycle" bits and reference count
1107  * on address handles. This utilizes the fact that max reference
1108  * count on any handle is limited by number of send wqes, thus
1109  * high bits in the ac_ref field can be used as the recycle bits,
1110  * and only the low bits hold the number of pending Tx requests.
1111  * This atomic AH reference counting allows the Tx completion
1112  * handler not to acquire the id_ac_mutex to process every completion,
1113  * thus reducing lock contention problems between completion and
1114  * the Tx path.
1115  */
1116 #define	CYCLEVAL		0x80000
1117 #define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
1118 #define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
1119 #define	GET_REF(ace)		((ace)->ac_ref)
1120 #define	GET_REF_CYCLE(ace) (				\
1121 	/*						\
1122 	 * Make sure "cycle" bit is set.		\
1123 	 */						\
1124 	ASSERT(CYCLE_SET(ace)),				\
1125 	((ace)->ac_ref & ~(CYCLEVAL))			\
1126 )
1127 #define	INC_REF(ace, num) {				\
1128 	atomic_add_32(&(ace)->ac_ref, num);		\
1129 }
1130 #define	SET_CYCLE_IF_REF(ace) (				\
1131 	CYCLE_SET(ace) ? B_TRUE :			\
1132 	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
1133 		CYCLEVAL ?				\
1134 		/*					\
1135 		 * Clear the "cycle" bit we just set;	\
1136 		 * ref count known to be 0 from above.	\
1137 		 */					\
1138 		CLEAR_REFCYCLE(ace), B_FALSE :		\
1139 		/*					\
1140 		 * We set "cycle" bit; let caller know.	\
1141 		 */					\
1142 		B_TRUE					\
1143 )
1144 #define	DEC_REF_DO_CYCLE(ace) (				\
1145 	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
1146 		/*					\
1147 		 * Ref count known to be 0 from above.	\
1148 		 */					\
1149 		B_TRUE :				\
1150 		B_FALSE					\
1151 )
1152 
1153 static void *
1154 list_get_head(list_t *list)
1155 {
1156 	list_node_t *lhead = list_head(list);
1157 
1158 	if (lhead != NULL)
1159 		list_remove(list, lhead);
1160 	return (lhead);
1161 }
1162 
1163 /*
1164  * This is always guaranteed to be able to queue the work.
1165  */
1166 static void
1167 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1168 {
1169 	/* Initialize request */
1170 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1171 	ptr->rq_op = op;
1172 
1173 	/*
1174 	 * Queue provided slot onto request pool.
1175 	 */
1176 	mutex_enter(&state->id_acache_req_lock);
1177 	list_insert_tail(&state->id_req_list, ptr);
1178 
1179 	/* Go, fetch, async thread */
1180 	cv_signal(&state->id_acache_req_cv);
1181 	mutex_exit(&state->id_acache_req_lock);
1182 }
1183 
1184 /*
1185  * Main body of the per interface async thread.
1186  */
1187 static void
1188 ibd_async_work(ibd_state_t *state)
1189 {
1190 	ibd_req_t *ptr;
1191 	callb_cpr_t cprinfo;
1192 
1193 	mutex_enter(&state->id_acache_req_lock);
1194 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1195 	    callb_generic_cpr, "ibd_async_work");
1196 
1197 	for (;;) {
1198 		ptr = list_get_head(&state->id_req_list);
1199 		if (ptr != NULL) {
1200 			mutex_exit(&state->id_acache_req_lock);
1201 
1202 			/*
1203 			 * Once we have done the operation, there is no
1204 			 * guarantee the request slot is going to be valid,
1205 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1206 			 * TRAP).
1207 			 *
1208 			 * Perform the request.
1209 			 */
1210 			switch (ptr->rq_op) {
1211 				case IBD_ASYNC_GETAH:
1212 					ibd_async_acache(state, &ptr->rq_mac);
1213 					break;
1214 				case IBD_ASYNC_JOIN:
1215 				case IBD_ASYNC_LEAVE:
1216 					ibd_async_multicast(state,
1217 					    ptr->rq_gid, ptr->rq_op);
1218 					break;
1219 				case IBD_ASYNC_PROMON:
1220 					ibd_async_setprom(state);
1221 					break;
1222 				case IBD_ASYNC_PROMOFF:
1223 					ibd_async_unsetprom(state);
1224 					break;
1225 				case IBD_ASYNC_REAP:
1226 					ibd_async_reap_group(state,
1227 					    ptr->rq_ptr, ptr->rq_gid,
1228 					    IB_MC_JSTATE_FULL);
1229 					/*
1230 					 * the req buf contains in mce
1231 					 * structure, so we do not need
1232 					 * to free it here.
1233 					 */
1234 					ptr = NULL;
1235 					break;
1236 				case IBD_ASYNC_TRAP:
1237 					ibd_async_trap(state, ptr);
1238 					break;
1239 				case IBD_ASYNC_SCHED:
1240 					ibd_async_txsched(state);
1241 					break;
1242 				case IBD_ASYNC_LINK:
1243 					ibd_async_link(state, ptr);
1244 					break;
1245 				case IBD_ASYNC_EXIT:
1246 					mutex_enter(&state->id_acache_req_lock);
1247 #ifndef __lock_lint
1248 					CALLB_CPR_EXIT(&cprinfo);
1249 #else
1250 					mutex_exit(&state->id_acache_req_lock);
1251 #endif
1252 					return;
1253 			}
1254 			if (ptr != NULL)
1255 				kmem_cache_free(state->id_req_kmc, ptr);
1256 
1257 			mutex_enter(&state->id_acache_req_lock);
1258 		} else {
1259 #ifndef __lock_lint
1260 			/*
1261 			 * Nothing to do: wait till new request arrives.
1262 			 */
1263 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1264 			cv_wait(&state->id_acache_req_cv,
1265 			    &state->id_acache_req_lock);
1266 			CALLB_CPR_SAFE_END(&cprinfo,
1267 			    &state->id_acache_req_lock);
1268 #endif
1269 		}
1270 	}
1271 
1272 	/*NOTREACHED*/
1273 	_NOTE(NOT_REACHED)
1274 }
1275 
1276 /*
1277  * Return when it is safe to queue requests to the async daemon; primarily
1278  * for subnet trap and async event handling. Disallow requests before the
1279  * daemon is created, and when interface deinitilization starts.
1280  */
1281 static boolean_t
1282 ibd_async_safe(ibd_state_t *state)
1283 {
1284 	mutex_enter(&state->id_trap_lock);
1285 	if (state->id_trap_stop) {
1286 		mutex_exit(&state->id_trap_lock);
1287 		return (B_FALSE);
1288 	}
1289 	state->id_trap_inprog++;
1290 	mutex_exit(&state->id_trap_lock);
1291 	return (B_TRUE);
1292 }
1293 
1294 /*
1295  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1296  * trap or event handling to complete to kill the async thread and deconstruct
1297  * the mcg/ace list.
1298  */
1299 static void
1300 ibd_async_done(ibd_state_t *state)
1301 {
1302 	mutex_enter(&state->id_trap_lock);
1303 	if (--state->id_trap_inprog == 0)
1304 		cv_signal(&state->id_trap_cv);
1305 	mutex_exit(&state->id_trap_lock);
1306 }
1307 
1308 /*
1309  * Hash functions:
1310  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1311  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1312  * These operate on mac addresses input into ibd_send, but there is no
1313  * guarantee on the alignment of the ipoib_mac_t structure.
1314  */
1315 /*ARGSUSED*/
1316 static uint_t
1317 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1318 {
1319 	ulong_t ptraddr = (ulong_t)key;
1320 	uint_t hval;
1321 
1322 	/*
1323 	 * If the input address is 4 byte aligned, we can just dereference
1324 	 * it. This is most common, since IP will send in a 4 byte aligned
1325 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1326 	 * 4 byte aligned too.
1327 	 */
1328 	if ((ptraddr & 3) == 0)
1329 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1330 
1331 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1332 	return (hval);
1333 }
1334 
1335 static int
1336 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1337 {
1338 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1339 		return (0);
1340 	else
1341 		return (1);
1342 }
1343 
1344 /*
1345  * Initialize all the per interface caches and lists; AH cache,
1346  * MCG list etc.
1347  */
1348 static int
1349 ibd_acache_init(ibd_state_t *state)
1350 {
1351 	ibd_ace_t *ce;
1352 	int i;
1353 
1354 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1355 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1356 
1357 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1358 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1359 	mutex_enter(&state->id_ac_mutex);
1360 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1361 	    offsetof(ibd_ace_t, ac_list));
1362 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1363 	    offsetof(ibd_ace_t, ac_list));
1364 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1365 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1366 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1367 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1368 	    offsetof(ibd_mce_t, mc_list));
1369 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1370 	    offsetof(ibd_mce_t, mc_list));
1371 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1372 	    offsetof(ibd_req_t, rq_list));
1373 	state->id_ac_hot_ace = NULL;
1374 
1375 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1376 	    IBD_NUM_AH, KM_SLEEP);
1377 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1378 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1379 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1380 			mutex_exit(&state->id_ac_mutex);
1381 			ibd_acache_fini(state);
1382 			return (DDI_FAILURE);
1383 		} else {
1384 			CLEAR_REFCYCLE(ce);
1385 			ce->ac_mce = NULL;
1386 			IBD_ACACHE_INSERT_FREE(state, ce);
1387 		}
1388 	}
1389 	mutex_exit(&state->id_ac_mutex);
1390 	return (DDI_SUCCESS);
1391 }
1392 
1393 static void
1394 ibd_acache_fini(ibd_state_t *state)
1395 {
1396 	ibd_ace_t *ptr;
1397 
1398 	mutex_enter(&state->id_ac_mutex);
1399 
1400 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1401 		ASSERT(GET_REF(ptr) == 0);
1402 		(void) ibt_free_ud_dest(ptr->ac_dest);
1403 	}
1404 
1405 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1406 		ASSERT(GET_REF(ptr) == 0);
1407 		(void) ibt_free_ud_dest(ptr->ac_dest);
1408 	}
1409 
1410 	list_destroy(&state->id_ah_free);
1411 	list_destroy(&state->id_ah_active);
1412 	list_destroy(&state->id_mc_full);
1413 	list_destroy(&state->id_mc_non);
1414 	list_destroy(&state->id_req_list);
1415 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1416 	mutex_exit(&state->id_ac_mutex);
1417 	mutex_destroy(&state->id_ac_mutex);
1418 	mutex_destroy(&state->id_mc_mutex);
1419 	mutex_destroy(&state->id_acache_req_lock);
1420 	cv_destroy(&state->id_acache_req_cv);
1421 }
1422 
1423 /*
1424  * Search AH active hash list for a cached path to input destination.
1425  * If we are "just looking", hold == F. When we are in the Tx path,
1426  * we set hold == T to grab a reference on the AH so that it can not
1427  * be recycled to a new destination while the Tx request is posted.
1428  */
1429 static ibd_ace_t *
1430 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1431 {
1432 	ibd_ace_t *ptr;
1433 
1434 	ASSERT(mutex_owned(&state->id_ac_mutex));
1435 
1436 	/*
1437 	 * Do hash search.
1438 	 */
1439 	if (mod_hash_find(state->id_ah_active_hash,
1440 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1441 		if (hold)
1442 			INC_REF(ptr, num);
1443 		return (ptr);
1444 	}
1445 	return (NULL);
1446 }
1447 
1448 /*
1449  * This is called by the tx side; if an initialized AH is found in
1450  * the active list, it is locked down and can be used; if no entry
1451  * is found, an async request is queued to do path resolution.
1452  */
1453 static ibd_ace_t *
1454 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1455 {
1456 	ibd_ace_t *ptr;
1457 	ibd_req_t *req;
1458 
1459 	/*
1460 	 * Only attempt to print when we can; in the mdt pattr case, the
1461 	 * address is not aligned properly.
1462 	 */
1463 	if (((ulong_t)mac & 3) == 0) {
1464 		DPRINT(4,
1465 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1466 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1467 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1468 		    htonl(mac->ipoib_gidsuff[1]));
1469 	}
1470 
1471 	mutex_enter(&state->id_ac_mutex);
1472 
1473 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1474 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1475 		INC_REF(ptr, numwqe);
1476 		mutex_exit(&state->id_ac_mutex);
1477 		return (ptr);
1478 	}
1479 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1480 		state->id_ac_hot_ace = ptr;
1481 		mutex_exit(&state->id_ac_mutex);
1482 		return (ptr);
1483 	}
1484 
1485 	/*
1486 	 * Implementation of a single outstanding async request; if
1487 	 * the operation is not started yet, queue a request and move
1488 	 * to ongoing state. Remember in id_ah_addr for which address
1489 	 * we are queueing the request, in case we need to flag an error;
1490 	 * Any further requests, for the same or different address, until
1491 	 * the operation completes, is sent back to GLDv3 to be retried.
1492 	 * The async thread will update id_ah_op with an error indication
1493 	 * or will set it to indicate the next look up can start; either
1494 	 * way, it will mac_tx_update() so that all blocked requests come
1495 	 * back here.
1496 	 */
1497 	*err = EAGAIN;
1498 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1499 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1500 		if (req != NULL) {
1501 			/*
1502 			 * We did not even find the entry; queue a request
1503 			 * for it.
1504 			 */
1505 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1506 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1507 			state->id_ah_op = IBD_OP_ONGOING;
1508 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1509 		}
1510 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1511 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1512 		/*
1513 		 * Check the status of the pathrecord lookup request
1514 		 * we had queued before.
1515 		 */
1516 		if (state->id_ah_op == IBD_OP_ERRORED) {
1517 			*err = EFAULT;
1518 			state->id_ah_error++;
1519 		} else {
1520 			/*
1521 			 * IBD_OP_ROUTERED case: We need to send to the
1522 			 * all-router MCG. If we can find the AH for
1523 			 * the mcg, the Tx will be attempted. If we
1524 			 * do not find the AH, we return NORESOURCES
1525 			 * to retry.
1526 			 */
1527 			ipoib_mac_t routermac;
1528 
1529 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1530 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1531 			    numwqe);
1532 		}
1533 		state->id_ah_op = IBD_OP_NOTSTARTED;
1534 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1535 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1536 		/*
1537 		 * This case can happen when we get a higher band
1538 		 * packet. The easiest way is to reset the state machine
1539 		 * to accommodate the higher priority packet.
1540 		 */
1541 		state->id_ah_op = IBD_OP_NOTSTARTED;
1542 	}
1543 	mutex_exit(&state->id_ac_mutex);
1544 
1545 	return (ptr);
1546 }
1547 
1548 /*
1549  * Grab a not-currently-in-use AH/PathRecord from the active
1550  * list to recycle to a new destination. Only the async thread
1551  * executes this code.
1552  */
1553 static ibd_ace_t *
1554 ibd_acache_get_unref(ibd_state_t *state)
1555 {
1556 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1557 
1558 	ASSERT(mutex_owned(&state->id_ac_mutex));
1559 
1560 	/*
1561 	 * Do plain linear search.
1562 	 */
1563 	while (ptr != NULL) {
1564 		/*
1565 		 * Note that it is possible that the "cycle" bit
1566 		 * is set on the AH w/o any reference count. The
1567 		 * mcg must have been deleted, and the tx cleanup
1568 		 * just decremented the reference count to 0, but
1569 		 * hasn't gotten around to grabbing the id_ac_mutex
1570 		 * to move the AH into the free list.
1571 		 */
1572 		if (GET_REF(ptr) == 0) {
1573 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1574 			break;
1575 		}
1576 		ptr = list_next(&state->id_ah_active, ptr);
1577 	}
1578 	return (ptr);
1579 }
1580 
1581 /*
1582  * Invoked to clean up AH from active list in case of multicast
1583  * disable and to handle sendonly memberships during mcg traps.
1584  * And for port up processing for multicast and unicast AHs.
1585  * Normally, the AH is taken off the active list, and put into
1586  * the free list to be recycled for a new destination. In case
1587  * Tx requests on the AH have not completed yet, the AH is marked
1588  * for reaping (which will put the AH on the free list) once the Tx's
1589  * complete; in this case, depending on the "force" input, we take
1590  * out the AH from the active list right now, or leave it also for
1591  * the reap operation. Returns TRUE if the AH is taken off the active
1592  * list (and either put into the free list right now, or arranged for
1593  * later), FALSE otherwise.
1594  */
1595 static boolean_t
1596 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1597 {
1598 	ibd_ace_t *acactive;
1599 	boolean_t ret = B_TRUE;
1600 
1601 	ASSERT(mutex_owned(&state->id_ac_mutex));
1602 
1603 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1604 
1605 		/*
1606 		 * Note that the AH might already have the cycle bit set
1607 		 * on it; this might happen if sequences of multicast
1608 		 * enables and disables are coming so fast, that posted
1609 		 * Tx's to the mcg have not completed yet, and the cycle
1610 		 * bit is set successively by each multicast disable.
1611 		 */
1612 		if (SET_CYCLE_IF_REF(acactive)) {
1613 			if (!force) {
1614 				/*
1615 				 * The ace is kept on the active list, further
1616 				 * Tx's can still grab a reference on it; the
1617 				 * ace is reaped when all pending Tx's
1618 				 * referencing the AH complete.
1619 				 */
1620 				ret = B_FALSE;
1621 			} else {
1622 				/*
1623 				 * In the mcg trap case, we always pull the
1624 				 * AH from the active list. And also the port
1625 				 * up multi/unicast case.
1626 				 */
1627 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1628 				acactive->ac_mce = NULL;
1629 			}
1630 		} else {
1631 			/*
1632 			 * Determined the ref count is 0, thus reclaim
1633 			 * immediately after pulling out the ace from
1634 			 * the active list.
1635 			 */
1636 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1637 			acactive->ac_mce = NULL;
1638 			IBD_ACACHE_INSERT_FREE(state, acactive);
1639 		}
1640 
1641 	}
1642 	return (ret);
1643 }
1644 
1645 /*
1646  * Helper function for async path record lookup. If we are trying to
1647  * Tx to a MCG, check our membership, possibly trying to join the
1648  * group if required. If that fails, try to send the packet to the
1649  * all router group (indicated by the redirect output), pointing
1650  * the input mac address to the router mcg address.
1651  */
1652 static ibd_mce_t *
1653 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1654 {
1655 	ib_gid_t mgid;
1656 	ibd_mce_t *mce;
1657 	ipoib_mac_t routermac;
1658 
1659 	*redirect = B_FALSE;
1660 	ibd_n2h_gid(mac, &mgid);
1661 
1662 	/*
1663 	 * Check the FullMember+SendOnlyNonMember list.
1664 	 * Since we are the only one who manipulates the
1665 	 * id_mc_full list, no locks are needed.
1666 	 */
1667 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1668 	if (mce != NULL) {
1669 		DPRINT(4, "ibd_async_mcache : already joined to group");
1670 		return (mce);
1671 	}
1672 
1673 	/*
1674 	 * Not found; try to join(SendOnlyNonMember) and attach.
1675 	 */
1676 	DPRINT(4, "ibd_async_mcache : not joined to group");
1677 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1678 	    NULL) {
1679 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1680 		return (mce);
1681 	}
1682 
1683 	/*
1684 	 * MCGroup not present; try to join the all-router group. If
1685 	 * any of the following steps succeed, we will be redirecting
1686 	 * to the all router group.
1687 	 */
1688 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1689 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1690 		return (NULL);
1691 	*redirect = B_TRUE;
1692 	ibd_n2h_gid(&routermac, &mgid);
1693 	bcopy(&routermac, mac, IPOIB_ADDRL);
1694 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1695 	    mgid.gid_prefix, mgid.gid_guid);
1696 
1697 	/*
1698 	 * Are we already joined to the router group?
1699 	 */
1700 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1701 		DPRINT(4, "ibd_async_mcache : using already joined router"
1702 		    "group\n");
1703 		return (mce);
1704 	}
1705 
1706 	/*
1707 	 * Can we join(SendOnlyNonMember) the router group?
1708 	 */
1709 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1710 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1711 	    NULL) {
1712 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1713 		return (mce);
1714 	}
1715 
1716 	return (NULL);
1717 }
1718 
1719 /*
1720  * Async path record lookup code.
1721  */
1722 static void
1723 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1724 {
1725 	ibd_ace_t *ce;
1726 	ibd_mce_t *mce = NULL;
1727 	ibt_path_attr_t path_attr;
1728 	ibt_path_info_t path_info;
1729 	ib_gid_t destgid;
1730 	char ret = IBD_OP_NOTSTARTED;
1731 
1732 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1733 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1734 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1735 	    htonl(mac->ipoib_gidsuff[1]));
1736 
1737 	/*
1738 	 * Check whether we are trying to transmit to a MCG.
1739 	 * In that case, we need to make sure we are a member of
1740 	 * the MCG.
1741 	 */
1742 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1743 		boolean_t redirected;
1744 
1745 		/*
1746 		 * If we can not find or join the group or even
1747 		 * redirect, error out.
1748 		 */
1749 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1750 		    NULL) {
1751 			state->id_ah_op = IBD_OP_ERRORED;
1752 			return;
1753 		}
1754 
1755 		/*
1756 		 * If we got redirected, we need to determine whether
1757 		 * the AH for the new mcg is in the cache already, and
1758 		 * not pull it in then; otherwise proceed to get the
1759 		 * path for the new mcg. There is no guarantee that
1760 		 * if the AH is currently in the cache, it will still be
1761 		 * there when we look in ibd_acache_lookup(), but that's
1762 		 * okay, we will come back here.
1763 		 */
1764 		if (redirected) {
1765 			ret = IBD_OP_ROUTERED;
1766 			DPRINT(4, "ibd_async_acache :  redirected to "
1767 			    "%08X:%08X:%08X:%08X:%08X",
1768 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1769 			    htonl(mac->ipoib_gidpref[1]),
1770 			    htonl(mac->ipoib_gidsuff[0]),
1771 			    htonl(mac->ipoib_gidsuff[1]));
1772 
1773 			mutex_enter(&state->id_ac_mutex);
1774 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1775 				state->id_ah_op = IBD_OP_ROUTERED;
1776 				mutex_exit(&state->id_ac_mutex);
1777 				DPRINT(4, "ibd_async_acache : router AH found");
1778 				return;
1779 			}
1780 			mutex_exit(&state->id_ac_mutex);
1781 		}
1782 	}
1783 
1784 	/*
1785 	 * Get an AH from the free list.
1786 	 */
1787 	mutex_enter(&state->id_ac_mutex);
1788 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1789 		/*
1790 		 * No free ones; try to grab an unreferenced active
1791 		 * one. Maybe we need to make the active list LRU,
1792 		 * but that will create more work for Tx callbacks.
1793 		 * Is there a way of not having to pull out the
1794 		 * entry from the active list, but just indicate it
1795 		 * is being recycled? Yes, but that creates one more
1796 		 * check in the fast lookup path.
1797 		 */
1798 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1799 			/*
1800 			 * Pretty serious shortage now.
1801 			 */
1802 			state->id_ah_op = IBD_OP_NOTSTARTED;
1803 			mutex_exit(&state->id_ac_mutex);
1804 			DPRINT(10, "ibd_async_acache : failed to find AH "
1805 			    "slot\n");
1806 			return;
1807 		}
1808 		/*
1809 		 * We could check whether ac_mce points to a SendOnly
1810 		 * member and drop that membership now. Or do it lazily
1811 		 * at detach time.
1812 		 */
1813 		ce->ac_mce = NULL;
1814 	}
1815 	mutex_exit(&state->id_ac_mutex);
1816 	ASSERT(ce->ac_mce == NULL);
1817 
1818 	/*
1819 	 * Update the entry.
1820 	 */
1821 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1822 
1823 	bzero(&path_info, sizeof (path_info));
1824 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1825 	path_attr.pa_sgid = state->id_sgid;
1826 	path_attr.pa_num_dgids = 1;
1827 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1828 	path_attr.pa_dgids = &destgid;
1829 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1830 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1831 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1832 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1833 		goto error;
1834 	}
1835 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1836 	    ntohl(ce->ac_mac.ipoib_qpn),
1837 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1838 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1839 		goto error;
1840 	}
1841 
1842 	/*
1843 	 * mce is set whenever an AH is being associated with a
1844 	 * MCG; this will come in handy when we leave the MCG. The
1845 	 * lock protects Tx fastpath from scanning the active list.
1846 	 */
1847 	if (mce != NULL)
1848 		ce->ac_mce = mce;
1849 	mutex_enter(&state->id_ac_mutex);
1850 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1851 	state->id_ah_op = ret;
1852 	mutex_exit(&state->id_ac_mutex);
1853 	return;
1854 error:
1855 	/*
1856 	 * We might want to drop SendOnly membership here if we
1857 	 * joined above. The lock protects Tx callbacks inserting
1858 	 * into the free list.
1859 	 */
1860 	mutex_enter(&state->id_ac_mutex);
1861 	state->id_ah_op = IBD_OP_ERRORED;
1862 	IBD_ACACHE_INSERT_FREE(state, ce);
1863 	mutex_exit(&state->id_ac_mutex);
1864 }
1865 
1866 /*
1867  * While restoring port's presence on the subnet on a port up, it is possible
1868  * that the port goes down again.
1869  */
1870 static void
1871 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1872 {
1873 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1874 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1875 	    LINK_STATE_UP;
1876 	ibd_mce_t *mce, *pmce;
1877 	ibd_ace_t *ace, *pace;
1878 
1879 	DPRINT(10, "ibd_async_link(): %d", opcode);
1880 
1881 	/*
1882 	 * On a link up, revalidate the link speed/width. No point doing
1883 	 * this on a link down, since we will be unable to do SA operations,
1884 	 * defaulting to the lowest speed. Also notice that we update our
1885 	 * notion of speed before calling mac_link_update(), which will do
1886 	 * necessary higher level notifications for speed changes.
1887 	 */
1888 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1889 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1890 		state->id_link_speed = ibd_get_portspeed(state);
1891 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1892 	}
1893 
1894 	/*
1895 	 * Do all the work required to establish our presence on
1896 	 * the subnet.
1897 	 */
1898 	if (opcode == IBD_LINK_UP_ABSENT) {
1899 		/*
1900 		 * If in promiscuous mode ...
1901 		 */
1902 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1903 			/*
1904 			 * Drop all nonmembership.
1905 			 */
1906 			ibd_async_unsetprom(state);
1907 
1908 			/*
1909 			 * Then, try to regain nonmembership to all mcg's.
1910 			 */
1911 			ibd_async_setprom(state);
1912 
1913 		}
1914 
1915 		/*
1916 		 * Drop all sendonly membership (which also gets rid of the
1917 		 * AHs); try to reacquire all full membership.
1918 		 */
1919 		mce = list_head(&state->id_mc_full);
1920 		while ((pmce = mce) != NULL) {
1921 			mce = list_next(&state->id_mc_full, mce);
1922 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1923 				ibd_leave_group(state,
1924 				    pmce->mc_info.mc_adds_vect.av_dgid,
1925 				    IB_MC_JSTATE_SEND_ONLY_NON);
1926 			else
1927 				ibd_reacquire_group(state, pmce);
1928 		}
1929 
1930 		/*
1931 		 * Recycle all active AHs to free list (and if there are
1932 		 * pending posts, make sure they will go into the free list
1933 		 * once the Tx's complete). Grab the lock to prevent
1934 		 * concurrent Tx's as well as Tx cleanups.
1935 		 */
1936 		mutex_enter(&state->id_ac_mutex);
1937 		ace = list_head(&state->id_ah_active);
1938 		while ((pace = ace) != NULL) {
1939 			boolean_t cycled;
1940 
1941 			ace = list_next(&state->id_ah_active, ace);
1942 			mce = pace->ac_mce;
1943 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
1944 			    B_TRUE);
1945 			/*
1946 			 * If this is for an mcg, it must be for a fullmember,
1947 			 * since we got rid of send-only members above when
1948 			 * processing the mce list.
1949 			 */
1950 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1951 			    IB_MC_JSTATE_FULL)));
1952 
1953 			/*
1954 			 * Check if the fullmember mce needs to be torn down,
1955 			 * ie whether the DLPI disable has already been done.
1956 			 * If so, do some of the work of tx_cleanup, namely
1957 			 * causing leave (which will fail), detach and
1958 			 * mce-freeing. tx_cleanup will put the AH into free
1959 			 * list. The reason to duplicate some of this
1960 			 * tx_cleanup work is because we want to delete the
1961 			 * AH right now instead of waiting for tx_cleanup, to
1962 			 * force subsequent Tx's to reacquire an AH.
1963 			 */
1964 			if ((mce != NULL) && (mce->mc_fullreap))
1965 				ibd_async_reap_group(state, mce,
1966 				    mce->mc_info.mc_adds_vect.av_dgid,
1967 				    mce->mc_jstate);
1968 		}
1969 		mutex_exit(&state->id_ac_mutex);
1970 	}
1971 
1972 	/*
1973 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1974 	 * (which stops further events from being delivered) before
1975 	 * mac_unregister(). At this point, it is guaranteed that mac_register
1976 	 * has already been done.
1977 	 */
1978 	mutex_enter(&state->id_link_mutex);
1979 	state->id_link_state = lstate;
1980 	mac_link_update(state->id_mh, lstate);
1981 	mutex_exit(&state->id_link_mutex);
1982 
1983 	ibd_async_done(state);
1984 }
1985 
1986 /*
1987  * Check the pkey table to see if we can find the pkey we're looking for.
1988  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1989  * failure.
1990  */
1991 static int
1992 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1993     uint16_t *pkix)
1994 {
1995 	uint16_t ndx;
1996 
1997 	ASSERT(pkix != NULL);
1998 
1999 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2000 		if (pkey_tbl[ndx] == pkey) {
2001 			*pkix = ndx;
2002 			return (0);
2003 		}
2004 	}
2005 	return (-1);
2006 }
2007 
2008 /*
2009  * When the link is notified up, we need to do a few things, based
2010  * on the port's current p_init_type_reply claiming a reinit has been
2011  * done or not. The reinit steps are:
2012  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2013  *    the old Pkey and GID0 are correct.
2014  * 2. Register for mcg traps (already done by ibmf).
2015  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2016  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2017  * 4. Give up all sendonly memberships.
2018  * 5. Acquire all full memberships.
2019  * 6. In promiscuous mode, acquire all non memberships.
2020  * 7. Recycle all AHs to free list.
2021  */
2022 static void
2023 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2024 {
2025 	ibt_hca_portinfo_t *port_infop = NULL;
2026 	ibt_status_t ibt_status;
2027 	uint_t psize, port_infosz;
2028 	ibd_link_op_t opcode;
2029 	ibd_req_t *req;
2030 	link_state_t new_link_state = LINK_STATE_UP;
2031 	uint8_t itreply;
2032 	uint16_t pkix;
2033 	int ret;
2034 
2035 	/*
2036 	 * Let's not race with a plumb or an unplumb; if we detect a
2037 	 * pkey relocation event later on here, we may have to restart.
2038 	 */
2039 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2040 
2041 	mutex_enter(&state->id_link_mutex);
2042 
2043 	/*
2044 	 * If the init code in ibd_m_start hasn't yet set up the
2045 	 * pkey/gid, nothing to do; that code will set the link state.
2046 	 */
2047 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2048 		mutex_exit(&state->id_link_mutex);
2049 		goto link_mod_return;
2050 	}
2051 
2052 	/*
2053 	 * If this routine was called in response to a port down event,
2054 	 * we just need to see if this should be informed.
2055 	 */
2056 	if (code == IBT_ERROR_PORT_DOWN) {
2057 		new_link_state = LINK_STATE_DOWN;
2058 		goto update_link_state;
2059 	}
2060 
2061 	/*
2062 	 * If it's not a port down event we've received, try to get the port
2063 	 * attributes first. If we fail here, the port is as good as down.
2064 	 * Otherwise, if the link went down by the time the handler gets
2065 	 * here, give up - we cannot even validate the pkey/gid since those
2066 	 * are not valid and this is as bad as a port down anyway.
2067 	 */
2068 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2069 	    &port_infop, &psize, &port_infosz);
2070 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2071 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2072 		new_link_state = LINK_STATE_DOWN;
2073 		goto update_link_state;
2074 	}
2075 
2076 	/*
2077 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2078 	 * PreserveContentReply are 0, we don't know anything about the
2079 	 * data loaded into the port attributes, so we need to verify
2080 	 * if gid0 and pkey are still valid.
2081 	 */
2082 	itreply = port_infop->p_init_type_reply;
2083 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2084 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2085 		/*
2086 		 * Check to see if the subnet part of GID0 has changed. If
2087 		 * not, check the simple case first to see if the pkey
2088 		 * index is the same as before; finally check to see if the
2089 		 * pkey has been relocated to a different index in the table.
2090 		 */
2091 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2092 		if (bcmp(port_infop->p_sgid_tbl,
2093 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2094 
2095 			new_link_state = LINK_STATE_DOWN;
2096 
2097 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2098 		    state->id_pkey) {
2099 
2100 			new_link_state = LINK_STATE_UP;
2101 
2102 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2103 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2104 
2105 			ibt_free_portinfo(port_infop, port_infosz);
2106 			mutex_exit(&state->id_link_mutex);
2107 
2108 			/*
2109 			 * Currently a restart is required if our pkey has moved
2110 			 * in the pkey table. If we get the ibt_recycle_ud() to
2111 			 * work as documented (expected), we may be able to
2112 			 * avoid a complete restart.  Note that we've already
2113 			 * marked both the start and stop 'in-progress' flags,
2114 			 * so it is ok to go ahead and do this restart.
2115 			 */
2116 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2117 			if ((ret = ibd_start(state)) != 0) {
2118 				DPRINT(10, "ibd_restart: cannot restart, "
2119 				    "ret=%d", ret);
2120 			}
2121 
2122 			goto link_mod_return;
2123 		} else {
2124 			new_link_state = LINK_STATE_DOWN;
2125 		}
2126 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2127 	}
2128 
2129 update_link_state:
2130 	if (port_infop) {
2131 		ibt_free_portinfo(port_infop, port_infosz);
2132 	}
2133 
2134 	/*
2135 	 * If the old state is the same as the new state, nothing to do
2136 	 */
2137 	if (state->id_link_state == new_link_state) {
2138 		mutex_exit(&state->id_link_mutex);
2139 		goto link_mod_return;
2140 	}
2141 
2142 	/*
2143 	 * Ok, so there was a link state change; see if it's safe to ask
2144 	 * the async thread to do the work
2145 	 */
2146 	if (!ibd_async_safe(state)) {
2147 		state->id_link_state = new_link_state;
2148 		mutex_exit(&state->id_link_mutex);
2149 		goto link_mod_return;
2150 	}
2151 
2152 	mutex_exit(&state->id_link_mutex);
2153 
2154 	/*
2155 	 * If we're reporting a link up, check InitTypeReply to see if
2156 	 * the SM has ensured that the port's presence in mcg, traps,
2157 	 * etc. is intact.
2158 	 */
2159 	if (new_link_state == LINK_STATE_DOWN) {
2160 		opcode = IBD_LINK_DOWN;
2161 	} else {
2162 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2163 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2164 			opcode = IBD_LINK_UP;
2165 		} else {
2166 			opcode = IBD_LINK_UP_ABSENT;
2167 		}
2168 	}
2169 
2170 	/*
2171 	 * Queue up a request for ibd_async_link() to handle this link
2172 	 * state change event
2173 	 */
2174 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2175 	req->rq_ptr = (void *)opcode;
2176 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2177 
2178 link_mod_return:
2179 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2180 }
2181 
2182 /*
2183  * For the port up/down events, IBTL guarantees there will not be concurrent
2184  * invocations of the handler. IBTL might coalesce link transition events,
2185  * and not invoke the handler for _each_ up/down transition, but it will
2186  * invoke the handler with last known state
2187  */
2188 static void
2189 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2190     ibt_async_code_t code, ibt_async_event_t *event)
2191 {
2192 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2193 
2194 	switch (code) {
2195 	case IBT_ERROR_CATASTROPHIC_CHAN:
2196 		ibd_print_warn(state, "catastrophic channel error");
2197 		break;
2198 	case IBT_ERROR_CQ:
2199 		ibd_print_warn(state, "completion queue error");
2200 		break;
2201 	case IBT_PORT_CHANGE_EVENT:
2202 		/*
2203 		 * Events will be delivered to all instances that have
2204 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2205 		 * Only need to do work for our port; IBTF will deliver
2206 		 * events for other ports on the hca we have ibt_open_hca'ed
2207 		 * too. Note that id_port is initialized in ibd_attach()
2208 		 * before we do an ibt_open_hca() in ibd_attach().
2209 		 */
2210 		ASSERT(state->id_hca_hdl == hca_hdl);
2211 		if (state->id_port != event->ev_port)
2212 			break;
2213 
2214 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2215 		    IBT_PORT_CHANGE_PKEY) {
2216 			ibd_link_mod(state, code);
2217 		}
2218 		break;
2219 	case IBT_ERROR_PORT_DOWN:
2220 	case IBT_CLNT_REREG_EVENT:
2221 	case IBT_EVENT_PORT_UP:
2222 		/*
2223 		 * Events will be delivered to all instances that have
2224 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2225 		 * Only need to do work for our port; IBTF will deliver
2226 		 * events for other ports on the hca we have ibt_open_hca'ed
2227 		 * too. Note that id_port is initialized in ibd_attach()
2228 		 * before we do an ibt_open_hca() in ibd_attach().
2229 		 */
2230 		ASSERT(state->id_hca_hdl == hca_hdl);
2231 		if (state->id_port != event->ev_port)
2232 			break;
2233 
2234 		ibd_link_mod(state, code);
2235 		break;
2236 
2237 	case IBT_HCA_ATTACH_EVENT:
2238 	case IBT_HCA_DETACH_EVENT:
2239 		/*
2240 		 * When a new card is plugged to the system, attach_event is
2241 		 * invoked. Additionally, a cfgadm needs to be run to make the
2242 		 * card known to the system, and an ifconfig needs to be run to
2243 		 * plumb up any ibd interfaces on the card. In the case of card
2244 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2245 		 * unplumb the ibd interfaces on the card; when the card is
2246 		 * actually unplugged, the detach_event is invoked;
2247 		 * additionally, if any ibd instances are still active on the
2248 		 * card (eg there were no associated RCM scripts), driver's
2249 		 * detach routine is invoked.
2250 		 */
2251 		break;
2252 	default:
2253 		break;
2254 	}
2255 }
2256 
2257 static int
2258 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2259 {
2260 	mac_register_t *macp;
2261 	int ret;
2262 
2263 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2264 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2265 		return (DDI_FAILURE);
2266 	}
2267 
2268 	/*
2269 	 * Note that when we register with mac during attach, we don't
2270 	 * have the id_macaddr yet, so we'll simply be registering a
2271 	 * zero macaddr that we'll overwrite later during plumb (in
2272 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2273 	 * update the mac layer with the correct mtu during plumb.
2274 	 */
2275 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2276 	macp->m_driver = state;
2277 	macp->m_dip = dip;
2278 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2279 	macp->m_callbacks = &ibd_m_callbacks;
2280 	macp->m_min_sdu = 0;
2281 	macp->m_max_sdu = IBD_DEF_MAX_SDU;
2282 
2283 	/*
2284 	 *  Register ourselves with the GLDv3 interface
2285 	 */
2286 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2287 		mac_free(macp);
2288 		DPRINT(10,
2289 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2290 		return (DDI_FAILURE);
2291 	}
2292 
2293 	mac_free(macp);
2294 	return (DDI_SUCCESS);
2295 }
2296 
2297 static int
2298 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2299 {
2300 	ibt_hca_attr_t hca_attrs;
2301 	ibt_status_t ibt_status;
2302 
2303 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2304 
2305 	/*
2306 	 * Query the HCA and fetch its attributes
2307 	 */
2308 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2309 	ASSERT(ibt_status == IBT_SUCCESS);
2310 
2311 	/*
2312 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2313 	 *    full checksum offload.
2314 	 */
2315 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
2316 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2317 	}
2318 
2319 	/*
2320 	 * 2. Set LSO policy, capability and maximum length
2321 	 */
2322 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
2323 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2324 		state->id_lso_policy = B_TRUE;
2325 	} else {
2326 		state->id_lso_policy = B_FALSE;
2327 	}
2328 
2329 	if (hca_attrs.hca_max_lso_size > 0) {
2330 		state->id_lso_capable = B_TRUE;
2331 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2332 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
2333 		else
2334 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
2335 	} else {
2336 		state->id_lso_capable = B_FALSE;
2337 		state->id_lso_maxlen = 0;
2338 	}
2339 
2340 	/*
2341 	 * 3. Set Reserved L_Key capability
2342 	 */
2343 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2344 		state->id_hca_res_lkey_capab = 1;
2345 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2346 	}
2347 
2348 	/*
2349 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2350 	 *    size information is provided by the hca
2351 	 */
2352 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2353 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2354 	} else {
2355 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2356 	}
2357 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2358 		state->id_max_sqseg = IBD_MAX_SQSEG;
2359 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2360 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2361 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2362 	}
2363 
2364 	/*
2365 	 * Translating the virtual address regions into physical regions
2366 	 * for using the Reserved LKey feature results in a wr sgl that
2367 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2368 	 * we'll fix a high-water mark (65%) for when we should stop.
2369 	 */
2370 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2371 
2372 	/*
2373 	 * 5. Set number of recv and send wqes after checking hca maximum
2374 	 *    channel size
2375 	 */
2376 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2377 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2378 	} else {
2379 		state->id_num_rwqe = IBD_NUM_RWQE;
2380 	}
2381 	state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN;
2382 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2383 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2384 	} else {
2385 		state->id_num_swqe = IBD_NUM_SWQE;
2386 	}
2387 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2388 
2389 	return (DDI_SUCCESS);
2390 }
2391 
2392 static int
2393 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
2394 {
2395 	int instance;
2396 	uint32_t progress = state->id_mac_state;
2397 	ibt_status_t ret;
2398 
2399 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2400 		cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n");
2401 		return (DDI_FAILURE);
2402 	}
2403 
2404 	/* make sure rx resources are freed */
2405 	ibd_free_rx_rsrcs(state);
2406 
2407 	if (progress & IBD_DRV_MAC_REGISTERED) {
2408 		(void) mac_unregister(state->id_mh);
2409 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2410 	}
2411 
2412 	if (progress & IBD_DRV_PD_ALLOCD) {
2413 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2414 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2415 			ibd_print_warn(state, "failed to free "
2416 			    "protection domain, ret=%d", ret);
2417 		}
2418 		state->id_pd_hdl = NULL;
2419 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2420 	}
2421 
2422 	if (progress & IBD_DRV_HCA_OPENED) {
2423 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2424 		    IBT_SUCCESS) {
2425 			ibd_print_warn(state, "failed to close "
2426 			    "HCA device, ret=%d", ret);
2427 		}
2428 		state->id_hca_hdl = NULL;
2429 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2430 	}
2431 
2432 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2433 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
2434 			ibd_print_warn(state,
2435 			    "ibt_detach() failed, ret=%d", ret);
2436 		}
2437 		state->id_ibt_hdl = NULL;
2438 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2439 	}
2440 
2441 	if (progress & IBD_DRV_TXINTR_ADDED) {
2442 		ddi_remove_softintr(state->id_tx);
2443 		state->id_tx = NULL;
2444 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2445 	}
2446 
2447 	if (progress & IBD_DRV_RXINTR_ADDED) {
2448 		ddi_remove_softintr(state->id_rx);
2449 		state->id_rx = NULL;
2450 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2451 	}
2452 
2453 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2454 		ibd_state_fini(state);
2455 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2456 	}
2457 
2458 	instance = ddi_get_instance(dip);
2459 	ddi_soft_state_free(ibd_list, instance);
2460 
2461 	return (DDI_SUCCESS);
2462 }
2463 
2464 /*
2465  * Attach device to the IO framework.
2466  */
2467 static int
2468 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2469 {
2470 	ibd_state_t *state = NULL;
2471 	ib_guid_t hca_guid;
2472 	int instance;
2473 	ibt_status_t ret;
2474 	int rv;
2475 
2476 	/*
2477 	 * IBD doesn't support suspend/resume
2478 	 */
2479 	if (cmd != DDI_ATTACH)
2480 		return (DDI_FAILURE);
2481 
2482 	/*
2483 	 * Allocate softstate structure
2484 	 */
2485 	instance = ddi_get_instance(dip);
2486 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2487 		return (DDI_FAILURE);
2488 	state = ddi_get_soft_state(ibd_list, instance);
2489 
2490 	/*
2491 	 * Initialize mutexes and condition variables
2492 	 */
2493 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2494 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2495 		goto attach_fail;
2496 	}
2497 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2498 
2499 	/*
2500 	 * Allocate rx,tx softintr
2501 	 */
2502 	if (ibd_rx_softintr == 1) {
2503 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2504 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2505 			DPRINT(10, "ibd_attach: failed in "
2506 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2507 			goto attach_fail;
2508 		}
2509 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2510 	}
2511 	if (ibd_tx_softintr == 1) {
2512 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2513 		    NULL, NULL, ibd_tx_recycle,
2514 		    (caddr_t)state)) != DDI_SUCCESS) {
2515 			DPRINT(10, "ibd_attach: failed in "
2516 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2517 			goto attach_fail;
2518 		}
2519 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2520 	}
2521 
2522 	/*
2523 	 * Obtain IBA P_Key, port number and HCA guid and validate
2524 	 * them (for P_Key, only full members are allowed as per
2525 	 * IPoIB specification; neither port number nor HCA guid
2526 	 * can be zero)
2527 	 */
2528 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2529 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2530 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2531 		    state->id_pkey);
2532 		goto attach_fail;
2533 	}
2534 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2535 	    "port-number", 0)) == 0) {
2536 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2537 		    state->id_port);
2538 		goto attach_fail;
2539 	}
2540 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2541 	    "hca-guid", 0)) == 0) {
2542 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2543 		    hca_guid);
2544 		goto attach_fail;
2545 	}
2546 
2547 	/*
2548 	 * Attach to IBTL
2549 	 */
2550 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2551 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2552 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2553 		goto attach_fail;
2554 	}
2555 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2556 
2557 	/*
2558 	 * Open the HCA
2559 	 */
2560 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2561 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2562 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2563 		goto attach_fail;
2564 	}
2565 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2566 
2567 	/*
2568 	 * Record capabilities
2569 	 */
2570 	(void) ibd_record_capab(state, dip);
2571 
2572 	/*
2573 	 * Allocate a protection domain on the HCA
2574 	 */
2575 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2576 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2577 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2578 		goto attach_fail;
2579 	}
2580 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2581 
2582 
2583 	/*
2584 	 * Register ibd interfaces with the Nemo framework
2585 	 */
2586 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2587 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2588 		goto attach_fail;
2589 	}
2590 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2591 
2592 	/*
2593 	 * We're done with everything we could to make the attach
2594 	 * succeed.  All the buffer allocations and IPoIB broadcast
2595 	 * group joins are deferred to when the interface instance
2596 	 * is actually plumbed to avoid wasting memory.
2597 	 */
2598 	return (DDI_SUCCESS);
2599 
2600 attach_fail:
2601 	(void) ibd_unattach(state, dip);
2602 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2603 	return (DDI_FAILURE);
2604 }
2605 
2606 /*
2607  * Detach device from the IO framework.
2608  */
2609 static int
2610 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2611 {
2612 	ibd_state_t *state;
2613 	int instance;
2614 
2615 	/*
2616 	 * IBD doesn't support suspend/resume
2617 	 */
2618 	if (cmd != DDI_DETACH)
2619 		return (DDI_FAILURE);
2620 
2621 	/*
2622 	 * Get the instance softstate
2623 	 */
2624 	instance = ddi_get_instance(dip);
2625 	state = ddi_get_soft_state(ibd_list, instance);
2626 
2627 	/*
2628 	 * Release all resources we're holding still.  Note that if we'd
2629 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2630 	 * so far, we should find all the flags we need in id_mac_state.
2631 	 */
2632 	return (ibd_unattach(state, dip));
2633 }
2634 
2635 /*
2636  * Pre ibt_attach() driver initialization
2637  */
2638 static int
2639 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2640 {
2641 	char buf[64];
2642 
2643 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2644 	state->id_link_state = LINK_STATE_UNKNOWN;
2645 
2646 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2647 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2648 	state->id_trap_stop = B_TRUE;
2649 	state->id_trap_inprog = 0;
2650 
2651 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2652 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2653 	state->id_dip = dip;
2654 
2655 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2656 
2657 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2658 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2659 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2660 	state->id_tx_busy = 0;
2661 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2662 
2663 	state->id_rx_list.dl_bufs_outstanding = 0;
2664 	state->id_rx_list.dl_cnt = 0;
2665 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2666 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2667 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2668 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2669 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2670 
2671 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
2672 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
2673 
2674 	return (DDI_SUCCESS);
2675 }
2676 
2677 /*
2678  * Post ibt_detach() driver deconstruction
2679  */
2680 static void
2681 ibd_state_fini(ibd_state_t *state)
2682 {
2683 	cv_destroy(&state->id_macst_cv);
2684 	mutex_destroy(&state->id_macst_lock);
2685 
2686 	kmem_cache_destroy(state->id_req_kmc);
2687 
2688 	mutex_destroy(&state->id_rx_list.dl_mutex);
2689 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2690 
2691 	mutex_destroy(&state->id_txpost_lock);
2692 	mutex_destroy(&state->id_tx_list.dl_mutex);
2693 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2694 	mutex_destroy(&state->id_lso_lock);
2695 
2696 	mutex_destroy(&state->id_sched_lock);
2697 	mutex_destroy(&state->id_scq_poll_lock);
2698 	mutex_destroy(&state->id_rcq_poll_lock);
2699 
2700 	cv_destroy(&state->id_trap_cv);
2701 	mutex_destroy(&state->id_trap_lock);
2702 	mutex_destroy(&state->id_link_mutex);
2703 }
2704 
2705 /*
2706  * Fetch link speed from SA for snmp ifspeed reporting.
2707  */
2708 static uint64_t
2709 ibd_get_portspeed(ibd_state_t *state)
2710 {
2711 	int			ret;
2712 	ibt_path_info_t		path;
2713 	ibt_path_attr_t		path_attr;
2714 	uint8_t			num_paths;
2715 	uint64_t		ifspeed;
2716 
2717 	/*
2718 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2719 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2720 	 * 2000000000. Start with that as default.
2721 	 */
2722 	ifspeed = 2000000000;
2723 
2724 	bzero(&path_attr, sizeof (path_attr));
2725 
2726 	/*
2727 	 * Get the port speed from Loopback path information.
2728 	 */
2729 	path_attr.pa_dgids = &state->id_sgid;
2730 	path_attr.pa_num_dgids = 1;
2731 	path_attr.pa_sgid = state->id_sgid;
2732 
2733 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2734 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2735 		goto earlydone;
2736 
2737 	if (num_paths < 1)
2738 		goto earlydone;
2739 
2740 	/*
2741 	 * In case SA does not return an expected value, report the default
2742 	 * speed as 1X.
2743 	 */
2744 	ret = 1;
2745 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2746 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2747 			ret = 1;
2748 			break;
2749 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2750 			ret = 4;
2751 			break;
2752 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2753 			ret = 12;
2754 			break;
2755 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2756 			ret = 2;
2757 			break;
2758 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2759 			ret = 8;
2760 			break;
2761 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2762 			ret = 16;
2763 			break;
2764 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2765 			ret = 24;
2766 			break;
2767 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2768 			ret = 32;
2769 			break;
2770 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2771 			ret = 48;
2772 			break;
2773 	}
2774 
2775 	ifspeed *= ret;
2776 
2777 earlydone:
2778 	return (ifspeed);
2779 }
2780 
2781 /*
2782  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2783  * representing the input mcg mgid.
2784  */
2785 static ibd_mce_t *
2786 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2787 {
2788 	ibd_mce_t *ptr = list_head(mlist);
2789 
2790 	/*
2791 	 * Do plain linear search.
2792 	 */
2793 	while (ptr != NULL) {
2794 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2795 		    sizeof (ib_gid_t)) == 0)
2796 			return (ptr);
2797 		ptr = list_next(mlist, ptr);
2798 	}
2799 	return (NULL);
2800 }
2801 
2802 /*
2803  * Execute IBA JOIN.
2804  */
2805 static ibt_status_t
2806 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2807 {
2808 	ibt_mcg_attr_t mcg_attr;
2809 
2810 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2811 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2812 	mcg_attr.mc_mgid = mgid;
2813 	mcg_attr.mc_join_state = mce->mc_jstate;
2814 	mcg_attr.mc_scope = state->id_scope;
2815 	mcg_attr.mc_pkey = state->id_pkey;
2816 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2817 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2818 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2819 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2820 	    NULL, NULL));
2821 }
2822 
2823 /*
2824  * This code JOINs the port in the proper way (depending on the join
2825  * state) so that IBA fabric will forward mcg packets to/from the port.
2826  * It also attaches the QPN to the mcg so it can receive those mcg
2827  * packets. This code makes sure not to attach the mcg to the QP if
2828  * that has been previously done due to the mcg being joined with a
2829  * different join state, even though this is not required by SWG_0216,
2830  * refid 3610.
2831  */
2832 static ibd_mce_t *
2833 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2834 {
2835 	ibt_status_t ibt_status;
2836 	ibd_mce_t *mce, *tmce, *omce = NULL;
2837 	boolean_t do_attach = B_TRUE;
2838 
2839 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2840 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2841 
2842 	/*
2843 	 * For enable_multicast Full member joins, we need to do some
2844 	 * extra work. If there is already an mce on the list that
2845 	 * indicates full membership, that means the membership has
2846 	 * not yet been dropped (since the disable_multicast was issued)
2847 	 * because there are pending Tx's to the mcg; in that case, just
2848 	 * mark the mce not to be reaped when the Tx completion queues
2849 	 * an async reap operation.
2850 	 *
2851 	 * If there is already an mce on the list indicating sendonly
2852 	 * membership, try to promote to full membership. Be careful
2853 	 * not to deallocate the old mce, since there might be an AH
2854 	 * pointing to it; instead, update the old mce with new data
2855 	 * that tracks the full membership.
2856 	 */
2857 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2858 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2859 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2860 			ASSERT(omce->mc_fullreap);
2861 			omce->mc_fullreap = B_FALSE;
2862 			return (omce);
2863 		} else {
2864 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2865 		}
2866 	}
2867 
2868 	/*
2869 	 * Allocate the ibd_mce_t to track this JOIN.
2870 	 */
2871 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2872 	mce->mc_fullreap = B_FALSE;
2873 	mce->mc_jstate = jstate;
2874 
2875 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2876 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2877 		    ibt_status);
2878 		kmem_free(mce, sizeof (ibd_mce_t));
2879 		return (NULL);
2880 	}
2881 
2882 	/*
2883 	 * Is an IBA attach required? Not if the interface is already joined
2884 	 * to the mcg in a different appropriate join state.
2885 	 */
2886 	if (jstate == IB_MC_JSTATE_NON) {
2887 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2888 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2889 			do_attach = B_FALSE;
2890 	} else if (jstate == IB_MC_JSTATE_FULL) {
2891 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2892 			do_attach = B_FALSE;
2893 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2894 		do_attach = B_FALSE;
2895 	}
2896 
2897 	if (do_attach) {
2898 		/*
2899 		 * Do the IBA attach.
2900 		 */
2901 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2902 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2903 		    &mce->mc_info)) != IBT_SUCCESS) {
2904 			DPRINT(10, "ibd_join_group : failed qp attachment "
2905 			    "%d\n", ibt_status);
2906 			/*
2907 			 * NOTE that we should probably preserve the join info
2908 			 * in the list and later try to leave again at detach
2909 			 * time.
2910 			 */
2911 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2912 			    state->id_sgid, jstate);
2913 			kmem_free(mce, sizeof (ibd_mce_t));
2914 			return (NULL);
2915 		}
2916 	}
2917 
2918 	/*
2919 	 * Insert the ibd_mce_t in the proper list.
2920 	 */
2921 	if (jstate == IB_MC_JSTATE_NON) {
2922 		IBD_MCACHE_INSERT_NON(state, mce);
2923 	} else {
2924 		/*
2925 		 * Set up the mc_req fields used for reaping the
2926 		 * mcg in case of delayed tx completion (see
2927 		 * ibd_tx_cleanup()). Also done for sendonly join in
2928 		 * case we are promoted to fullmembership later and
2929 		 * keep using the same mce.
2930 		 */
2931 		mce->mc_req.rq_gid = mgid;
2932 		mce->mc_req.rq_ptr = mce;
2933 		/*
2934 		 * Check whether this is the case of trying to join
2935 		 * full member, and we were already joined send only.
2936 		 * We try to drop our SendOnly membership, but it is
2937 		 * possible that the mcg does not exist anymore (and
2938 		 * the subnet trap never reached us), so the leave
2939 		 * operation might fail.
2940 		 */
2941 		if (omce != NULL) {
2942 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2943 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2944 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2945 			bcopy(&mce->mc_info, &omce->mc_info,
2946 			    sizeof (ibt_mcg_info_t));
2947 			kmem_free(mce, sizeof (ibd_mce_t));
2948 			return (omce);
2949 		}
2950 		mutex_enter(&state->id_mc_mutex);
2951 		IBD_MCACHE_INSERT_FULL(state, mce);
2952 		mutex_exit(&state->id_mc_mutex);
2953 	}
2954 
2955 	return (mce);
2956 }
2957 
2958 /*
2959  * Called during port up event handling to attempt to reacquire full
2960  * membership to an mcg. Stripped down version of ibd_join_group().
2961  * Note that it is possible that the mcg might have gone away, and
2962  * gets recreated at this point.
2963  */
2964 static void
2965 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2966 {
2967 	ib_gid_t mgid;
2968 
2969 	/*
2970 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2971 	 * reap/leave is going to try to leave the group. We could prevent
2972 	 * that by adding a boolean flag into ibd_mce_t, if required.
2973 	 */
2974 	if (mce->mc_fullreap)
2975 		return;
2976 
2977 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2978 
2979 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2980 	    mgid.gid_guid);
2981 
2982 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2983 		ibd_print_warn(state, "Failure on port up to rejoin "
2984 		    "multicast gid %016llx:%016llx",
2985 		    (u_longlong_t)mgid.gid_prefix,
2986 		    (u_longlong_t)mgid.gid_guid);
2987 }
2988 
2989 /*
2990  * This code handles delayed Tx completion cleanups for mcg's to which
2991  * disable_multicast has been issued, regular mcg related cleanups during
2992  * disable_multicast, disable_promiscuous and mcg traps, as well as
2993  * cleanups during driver detach time. Depending on the join state,
2994  * it deletes the mce from the appropriate list and issues the IBA
2995  * leave/detach; except in the disable_multicast case when the mce
2996  * is left on the active list for a subsequent Tx completion cleanup.
2997  */
2998 static void
2999 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3000     uint8_t jstate)
3001 {
3002 	ibd_mce_t *tmce;
3003 	boolean_t do_detach = B_TRUE;
3004 
3005 	/*
3006 	 * Before detaching, we must check whether the other list
3007 	 * contains the mcg; if we detach blindly, the consumer
3008 	 * who set up the other list will also stop receiving
3009 	 * traffic.
3010 	 */
3011 	if (jstate == IB_MC_JSTATE_FULL) {
3012 		/*
3013 		 * The following check is only relevant while coming
3014 		 * from the Tx completion path in the reap case.
3015 		 */
3016 		if (!mce->mc_fullreap)
3017 			return;
3018 		mutex_enter(&state->id_mc_mutex);
3019 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3020 		mutex_exit(&state->id_mc_mutex);
3021 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3022 			do_detach = B_FALSE;
3023 	} else if (jstate == IB_MC_JSTATE_NON) {
3024 		IBD_MCACHE_PULLOUT_NON(state, mce);
3025 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3026 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3027 			do_detach = B_FALSE;
3028 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3029 		mutex_enter(&state->id_mc_mutex);
3030 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3031 		mutex_exit(&state->id_mc_mutex);
3032 		do_detach = B_FALSE;
3033 	}
3034 
3035 	/*
3036 	 * If we are reacting to a mcg trap and leaving our sendonly or
3037 	 * non membership, the mcg is possibly already gone, so attempting
3038 	 * to leave might fail. On the other hand, we must try to leave
3039 	 * anyway, since this might be a trap from long ago, and we could
3040 	 * have potentially sendonly joined to a recent incarnation of
3041 	 * the mcg and are about to loose track of this information.
3042 	 */
3043 	if (do_detach) {
3044 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3045 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3046 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3047 	}
3048 
3049 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3050 	kmem_free(mce, sizeof (ibd_mce_t));
3051 }
3052 
3053 /*
3054  * Async code executed due to multicast and promiscuous disable requests
3055  * and mcg trap handling; also executed during driver detach. Mostly, a
3056  * leave and detach is done; except for the fullmember case when Tx
3057  * requests are pending, whence arrangements are made for subsequent
3058  * cleanup on Tx completion.
3059  */
3060 static void
3061 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3062 {
3063 	ipoib_mac_t mcmac;
3064 	boolean_t recycled;
3065 	ibd_mce_t *mce;
3066 
3067 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3068 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3069 
3070 	if (jstate == IB_MC_JSTATE_NON) {
3071 		recycled = B_TRUE;
3072 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3073 		/*
3074 		 * In case we are handling a mcg trap, we might not find
3075 		 * the mcg in the non list.
3076 		 */
3077 		if (mce == NULL) {
3078 			return;
3079 		}
3080 	} else {
3081 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3082 
3083 		/*
3084 		 * In case we are handling a mcg trap, make sure the trap
3085 		 * is not arriving late; if we have an mce that indicates
3086 		 * that we are already a fullmember, that would be a clear
3087 		 * indication that the trap arrived late (ie, is for a
3088 		 * previous incarnation of the mcg).
3089 		 */
3090 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3091 			if ((mce == NULL) || (mce->mc_jstate ==
3092 			    IB_MC_JSTATE_FULL)) {
3093 				return;
3094 			}
3095 		} else {
3096 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3097 
3098 			/*
3099 			 * If join group failed, mce will be NULL here.
3100 			 * This is because in GLDv3 driver, set multicast
3101 			 *  will always return success.
3102 			 */
3103 			if (mce == NULL) {
3104 				return;
3105 			}
3106 
3107 			mce->mc_fullreap = B_TRUE;
3108 		}
3109 
3110 		/*
3111 		 * If no pending Tx's remain that reference the AH
3112 		 * for the mcg, recycle it from active to free list.
3113 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3114 		 * so the last completing Tx will cause an async reap
3115 		 * operation to be invoked, at which time we will drop our
3116 		 * membership to the mcg so that the pending Tx's complete
3117 		 * successfully. Refer to comments on "AH and MCE active
3118 		 * list manipulation" at top of this file. The lock protects
3119 		 * against Tx fast path and Tx cleanup code.
3120 		 */
3121 		mutex_enter(&state->id_ac_mutex);
3122 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3123 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3124 		    IB_MC_JSTATE_SEND_ONLY_NON));
3125 		mutex_exit(&state->id_ac_mutex);
3126 	}
3127 
3128 	if (recycled) {
3129 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3130 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3131 		ibd_async_reap_group(state, mce, mgid, jstate);
3132 	}
3133 }
3134 
3135 /*
3136  * Find the broadcast address as defined by IPoIB; implicitly
3137  * determines the IBA scope, mtu, tclass etc of the link the
3138  * interface is going to be a member of.
3139  */
3140 static ibt_status_t
3141 ibd_find_bgroup(ibd_state_t *state)
3142 {
3143 	ibt_mcg_attr_t mcg_attr;
3144 	uint_t numg;
3145 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3146 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3147 	    IB_MC_SCOPE_GLOBAL };
3148 	int i, mcgmtu;
3149 	boolean_t found = B_FALSE;
3150 	int ret;
3151 	ibt_mcg_info_t mcg_info;
3152 
3153 	state->id_bgroup_created = B_FALSE;
3154 
3155 query_bcast_grp:
3156 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3157 	mcg_attr.mc_pkey = state->id_pkey;
3158 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3159 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3160 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3161 
3162 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3163 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3164 
3165 		/*
3166 		 * Look for the IPoIB broadcast group.
3167 		 */
3168 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3169 		state->id_mgid.gid_prefix =
3170 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3171 		    ((uint64_t)state->id_scope << 48) |
3172 		    ((uint32_t)(state->id_pkey << 16)));
3173 		mcg_attr.mc_mgid = state->id_mgid;
3174 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3175 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3176 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3177 			found = B_TRUE;
3178 			break;
3179 		}
3180 	}
3181 
3182 	if (!found) {
3183 		if (ibd_create_broadcast_group) {
3184 			/*
3185 			 * If we created the broadcast group, but failed to
3186 			 * find it, we can't do anything except leave the
3187 			 * one we created and return failure.
3188 			 */
3189 			if (state->id_bgroup_created) {
3190 				ibd_print_warn(state, "IPoIB broadcast group "
3191 				    "absent. Unable to query after create.");
3192 				goto find_bgroup_fail;
3193 			}
3194 
3195 			/*
3196 			 * Create the ipoib broadcast group if it didn't exist
3197 			 */
3198 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3199 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3200 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3201 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3202 			mcg_attr.mc_pkey = state->id_pkey;
3203 			mcg_attr.mc_flow = 0;
3204 			mcg_attr.mc_sl = 0;
3205 			mcg_attr.mc_tclass = 0;
3206 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3207 			state->id_mgid.gid_prefix =
3208 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3209 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3210 			    ((uint32_t)(state->id_pkey << 16)));
3211 			mcg_attr.mc_mgid = state->id_mgid;
3212 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3213 
3214 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3215 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3216 				ibd_print_warn(state, "IPoIB broadcast group "
3217 				    "absent, create failed: ret = %d\n", ret);
3218 				state->id_bgroup_created = B_FALSE;
3219 				return (IBT_FAILURE);
3220 			}
3221 			state->id_bgroup_created = B_TRUE;
3222 			goto query_bcast_grp;
3223 		} else {
3224 			ibd_print_warn(state, "IPoIB broadcast group absent");
3225 			return (IBT_FAILURE);
3226 		}
3227 	}
3228 
3229 	/*
3230 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3231 	 */
3232 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3233 	if (state->id_mtu < mcgmtu) {
3234 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3235 		    "greater than port's maximum MTU %d", mcgmtu,
3236 		    state->id_mtu);
3237 		ibt_free_mcg_info(state->id_mcinfo, 1);
3238 		goto find_bgroup_fail;
3239 	}
3240 	state->id_mtu = mcgmtu;
3241 
3242 	return (IBT_SUCCESS);
3243 
3244 find_bgroup_fail:
3245 	if (state->id_bgroup_created) {
3246 		(void) ibt_leave_mcg(state->id_sgid,
3247 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3248 		    IB_MC_JSTATE_FULL);
3249 	}
3250 
3251 	return (IBT_FAILURE);
3252 }
3253 
3254 static int
3255 ibd_alloc_tx_copybufs(ibd_state_t *state)
3256 {
3257 	ibt_mr_attr_t mem_attr;
3258 
3259 	/*
3260 	 * Allocate one big chunk for all regular tx copy bufs
3261 	 */
3262 	state->id_tx_buf_sz = state->id_mtu;
3263 	if (state->id_lso_policy && state->id_lso_capable &&
3264 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3265 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3266 	}
3267 
3268 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3269 	    state->id_tx_buf_sz, KM_SLEEP);
3270 
3271 	state->id_tx_wqes = kmem_zalloc(state->id_num_swqe *
3272 	    sizeof (ibd_swqe_t), KM_SLEEP);
3273 
3274 	/*
3275 	 * Do one memory registration on the entire txbuf area
3276 	 */
3277 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3278 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3279 	mem_attr.mr_as = NULL;
3280 	mem_attr.mr_flags = IBT_MR_SLEEP;
3281 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3282 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3283 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3284 		kmem_free(state->id_tx_wqes,
3285 		    state->id_num_swqe * sizeof (ibd_swqe_t));
3286 		kmem_free(state->id_tx_bufs,
3287 		    state->id_num_swqe * state->id_tx_buf_sz);
3288 		state->id_tx_bufs = NULL;
3289 		return (DDI_FAILURE);
3290 	}
3291 
3292 	return (DDI_SUCCESS);
3293 }
3294 
3295 static int
3296 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3297 {
3298 	ibt_mr_attr_t mem_attr;
3299 	ibd_lsobuf_t *buflist;
3300 	ibd_lsobuf_t *lbufp;
3301 	ibd_lsobuf_t *tail;
3302 	ibd_lsobkt_t *bktp;
3303 	uint8_t *membase;
3304 	uint8_t *memp;
3305 	uint_t memsz;
3306 	int i;
3307 
3308 	/*
3309 	 * Allocate the lso bucket
3310 	 */
3311 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3312 
3313 	/*
3314 	 * Allocate the entire lso memory and register it
3315 	 */
3316 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3317 	membase = kmem_zalloc(memsz, KM_SLEEP);
3318 
3319 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3320 	mem_attr.mr_len = memsz;
3321 	mem_attr.mr_as = NULL;
3322 	mem_attr.mr_flags = IBT_MR_SLEEP;
3323 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3324 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3325 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3326 		kmem_free(membase, memsz);
3327 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3328 		return (DDI_FAILURE);
3329 	}
3330 
3331 	mutex_enter(&state->id_lso_lock);
3332 
3333 	/*
3334 	 * Now allocate the buflist.  Note that the elements in the buflist and
3335 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3336 	 * can always derive the address of a buflist entry from the address of
3337 	 * an lso buffer.
3338 	 */
3339 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3340 	    KM_SLEEP);
3341 
3342 	/*
3343 	 * Set up the lso buf chain
3344 	 */
3345 	memp = membase;
3346 	lbufp = buflist;
3347 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3348 		lbufp->lb_isfree = 1;
3349 		lbufp->lb_buf = memp;
3350 		lbufp->lb_next = lbufp + 1;
3351 
3352 		tail = lbufp;
3353 
3354 		memp += IBD_LSO_BUFSZ;
3355 		lbufp++;
3356 	}
3357 	tail->lb_next = NULL;
3358 
3359 	/*
3360 	 * Set up the LSO buffer information in ibd state
3361 	 */
3362 	bktp->bkt_bufl = buflist;
3363 	bktp->bkt_free_head = buflist;
3364 	bktp->bkt_mem = membase;
3365 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3366 	bktp->bkt_nfree = bktp->bkt_nelem;
3367 
3368 	state->id_lso = bktp;
3369 	mutex_exit(&state->id_lso_lock);
3370 
3371 	return (DDI_SUCCESS);
3372 }
3373 
3374 /*
3375  * Statically allocate Tx buffer list(s).
3376  */
3377 static int
3378 ibd_init_txlist(ibd_state_t *state)
3379 {
3380 	ibd_swqe_t *swqe;
3381 	ibt_lkey_t lkey;
3382 	int i;
3383 	uint_t len;
3384 	uint8_t *bufaddr;
3385 
3386 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3387 		return (DDI_FAILURE);
3388 
3389 	if (state->id_lso_policy && state->id_lso_capable) {
3390 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3391 			state->id_lso_policy = B_FALSE;
3392 	}
3393 
3394 	mutex_enter(&state->id_tx_list.dl_mutex);
3395 	state->id_tx_list.dl_head = NULL;
3396 	state->id_tx_list.dl_pending_sends = B_FALSE;
3397 	state->id_tx_list.dl_cnt = 0;
3398 	mutex_exit(&state->id_tx_list.dl_mutex);
3399 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3400 	state->id_tx_rel_list.dl_head = NULL;
3401 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3402 	state->id_tx_rel_list.dl_cnt = 0;
3403 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3404 
3405 	/*
3406 	 * Allocate and setup the swqe list
3407 	 */
3408 	lkey = state->id_tx_mr_desc.md_lkey;
3409 	bufaddr = state->id_tx_bufs;
3410 	len = state->id_tx_buf_sz;
3411 	swqe = state->id_tx_wqes;
3412 	mutex_enter(&state->id_tx_list.dl_mutex);
3413 	for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) {
3414 		swqe->swqe_next = NULL;
3415 		swqe->swqe_im_mblk = NULL;
3416 
3417 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3418 		    bufaddr;
3419 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3420 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3421 
3422 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3423 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3424 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3425 
3426 		/* These are set in send */
3427 		swqe->w_swr.wr_nds = 0;
3428 		swqe->w_swr.wr_sgl = NULL;
3429 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3430 
3431 		/* add to list */
3432 		state->id_tx_list.dl_cnt++;
3433 		swqe->swqe_next = state->id_tx_list.dl_head;
3434 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3435 	}
3436 	mutex_exit(&state->id_tx_list.dl_mutex);
3437 
3438 	return (DDI_SUCCESS);
3439 }
3440 
3441 static int
3442 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3443     uint32_t *nds_p)
3444 {
3445 	ibd_lsobkt_t *bktp;
3446 	ibd_lsobuf_t *lbufp;
3447 	ibd_lsobuf_t *nextp;
3448 	ibt_lkey_t lso_lkey;
3449 	uint_t frag_sz;
3450 	uint_t num_needed;
3451 	int i;
3452 
3453 	ASSERT(sgl_p != NULL);
3454 	ASSERT(nds_p != NULL);
3455 	ASSERT(req_sz != 0);
3456 
3457 	/*
3458 	 * Determine how many bufs we'd need for the size requested
3459 	 */
3460 	num_needed = req_sz / IBD_LSO_BUFSZ;
3461 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3462 		num_needed++;
3463 
3464 	mutex_enter(&state->id_lso_lock);
3465 
3466 	/*
3467 	 * If we don't have enough lso bufs, return failure
3468 	 */
3469 	ASSERT(state->id_lso != NULL);
3470 	bktp = state->id_lso;
3471 	if (bktp->bkt_nfree < num_needed) {
3472 		mutex_exit(&state->id_lso_lock);
3473 		return (-1);
3474 	}
3475 
3476 	/*
3477 	 * Pick the first 'num_needed' bufs from the free list
3478 	 */
3479 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3480 	lbufp = bktp->bkt_free_head;
3481 	for (i = 0; i < num_needed; i++) {
3482 		ASSERT(lbufp->lb_isfree != 0);
3483 		ASSERT(lbufp->lb_buf != NULL);
3484 
3485 		nextp = lbufp->lb_next;
3486 
3487 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3488 		sgl_p[i].ds_key = lso_lkey;
3489 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3490 
3491 		lbufp->lb_isfree = 0;
3492 		lbufp->lb_next = NULL;
3493 
3494 		lbufp = nextp;
3495 	}
3496 	bktp->bkt_free_head = lbufp;
3497 
3498 	/*
3499 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3500 	 * to adjust the last sgl entry's length. Since we know we need atleast
3501 	 * one, the i-1 use below is ok.
3502 	 */
3503 	if (frag_sz) {
3504 		sgl_p[i-1].ds_len = frag_sz;
3505 	}
3506 
3507 	/*
3508 	 * Update nfree count and return
3509 	 */
3510 	bktp->bkt_nfree -= num_needed;
3511 
3512 	mutex_exit(&state->id_lso_lock);
3513 
3514 	*nds_p = num_needed;
3515 
3516 	return (0);
3517 }
3518 
3519 static void
3520 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3521 {
3522 	ibd_lsobkt_t *bktp;
3523 	ibd_lsobuf_t *lbufp;
3524 	uint8_t *lso_mem_end;
3525 	uint_t ndx;
3526 	int i;
3527 
3528 	mutex_enter(&state->id_lso_lock);
3529 
3530 	bktp = state->id_lso;
3531 	ASSERT(bktp != NULL);
3532 
3533 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3534 	for (i = 0; i < nds; i++) {
3535 		uint8_t *va;
3536 
3537 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3538 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3539 
3540 		/*
3541 		 * Figure out the buflist element this sgl buffer corresponds
3542 		 * to and put it back at the head
3543 		 */
3544 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3545 		lbufp = bktp->bkt_bufl + ndx;
3546 
3547 		ASSERT(lbufp->lb_isfree == 0);
3548 		ASSERT(lbufp->lb_buf == va);
3549 
3550 		lbufp->lb_isfree = 1;
3551 		lbufp->lb_next = bktp->bkt_free_head;
3552 		bktp->bkt_free_head = lbufp;
3553 	}
3554 	bktp->bkt_nfree += nds;
3555 
3556 	mutex_exit(&state->id_lso_lock);
3557 }
3558 
3559 static void
3560 ibd_free_tx_copybufs(ibd_state_t *state)
3561 {
3562 	/*
3563 	 * Unregister txbuf mr
3564 	 */
3565 	if (ibt_deregister_mr(state->id_hca_hdl,
3566 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3567 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3568 	}
3569 	state->id_tx_mr_hdl = NULL;
3570 
3571 	/*
3572 	 * Free txbuf memory
3573 	 */
3574 	kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t));
3575 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3576 	state->id_tx_wqes = NULL;
3577 	state->id_tx_bufs = NULL;
3578 }
3579 
3580 static void
3581 ibd_free_tx_lsobufs(ibd_state_t *state)
3582 {
3583 	ibd_lsobkt_t *bktp;
3584 
3585 	mutex_enter(&state->id_lso_lock);
3586 
3587 	if ((bktp = state->id_lso) == NULL) {
3588 		mutex_exit(&state->id_lso_lock);
3589 		return;
3590 	}
3591 
3592 	/*
3593 	 * First, free the buflist
3594 	 */
3595 	ASSERT(bktp->bkt_bufl != NULL);
3596 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3597 
3598 	/*
3599 	 * Unregister the LSO memory and free it
3600 	 */
3601 	ASSERT(bktp->bkt_mr_hdl != NULL);
3602 	if (ibt_deregister_mr(state->id_hca_hdl,
3603 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3604 		DPRINT(10,
3605 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3606 	}
3607 	ASSERT(bktp->bkt_mem);
3608 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3609 
3610 	/*
3611 	 * Finally free the bucket
3612 	 */
3613 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3614 	state->id_lso = NULL;
3615 
3616 	mutex_exit(&state->id_lso_lock);
3617 }
3618 
3619 /*
3620  * Free the statically allocated Tx buffer list.
3621  */
3622 static void
3623 ibd_fini_txlist(ibd_state_t *state)
3624 {
3625 	/*
3626 	 * Free the allocated swqes
3627 	 */
3628 	mutex_enter(&state->id_tx_list.dl_mutex);
3629 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3630 	state->id_tx_list.dl_head = NULL;
3631 	state->id_tx_list.dl_pending_sends = B_FALSE;
3632 	state->id_tx_list.dl_cnt = 0;
3633 	state->id_tx_rel_list.dl_head = NULL;
3634 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3635 	state->id_tx_rel_list.dl_cnt = 0;
3636 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3637 	mutex_exit(&state->id_tx_list.dl_mutex);
3638 
3639 	ibd_free_tx_lsobufs(state);
3640 	ibd_free_tx_copybufs(state);
3641 }
3642 
3643 /*
3644  * post a list of rwqes, NULL terminated.
3645  */
3646 static void
3647 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3648 {
3649 	uint_t		i;
3650 	uint_t		num_posted;
3651 	ibt_status_t	ibt_status;
3652 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3653 
3654 	while (rwqe) {
3655 		/* Post up to IBD_RX_POST_CNT receive work requests */
3656 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3657 			wrs[i] = rwqe->w_rwr;
3658 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3659 			if (rwqe == NULL) {
3660 				i++;
3661 				break;
3662 			}
3663 		}
3664 
3665 		/*
3666 		 * If posting fails for some reason, we'll never receive
3667 		 * completion intimation, so we'll need to cleanup. But
3668 		 * we need to make sure we don't clean up nodes whose
3669 		 * wrs have been successfully posted. We assume that the
3670 		 * hca driver returns on the first failure to post and
3671 		 * therefore the first 'num_posted' entries don't need
3672 		 * cleanup here.
3673 		 */
3674 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3675 
3676 		num_posted = 0;
3677 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3678 		    &num_posted);
3679 		if (ibt_status != IBT_SUCCESS) {
3680 			/* This cannot happen unless the device has an error. */
3681 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3682 			    "posting multiple wrs failed: "
3683 			    "requested=%d, done=%d, ret=%d",
3684 			    IBD_RX_POST_CNT, num_posted, ibt_status);
3685 			atomic_add_32(&state->id_rx_list.dl_cnt,
3686 			    num_posted - i);
3687 		}
3688 	}
3689 }
3690 
3691 /*
3692  * Grab a list of rwqes from the array of lists, and post the list.
3693  */
3694 static void
3695 ibd_post_recv_intr(ibd_state_t *state)
3696 {
3697 	ibd_rx_queue_t	*rxp;
3698 	ibd_rwqe_t *list;
3699 
3700 	/* rotate through the rx_queue array, expecting an adequate number */
3701 	state->id_rx_post_queue_index =
3702 	    (state->id_rx_post_queue_index + 1) &
3703 	    (state->id_rx_nqueues - 1);
3704 
3705 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3706 	mutex_enter(&rxp->rx_post_lock);
3707 	list = WQE_TO_RWQE(rxp->rx_head);
3708 	rxp->rx_head = NULL;
3709 	rxp->rx_cnt = 0;
3710 	mutex_exit(&rxp->rx_post_lock);
3711 	ibd_post_recv_list(state, list);
3712 }
3713 
3714 /* macro explained below */
3715 #define	RX_QUEUE_HASH(rwqe) \
3716 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3717 
3718 /*
3719  * Add a rwqe to one of the the Rx lists.  If the list is large enough
3720  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3721  *
3722  * Note: one of 2^N lists is chosen via a hash.  This is done
3723  * because using one list is contentious.  If the first list is busy
3724  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3725  *
3726  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3727  * even distribution of mapping rwqes to the 2^N queues.
3728  */
3729 static void
3730 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3731 {
3732 	ibd_rx_queue_t	*rxp;
3733 
3734 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3735 
3736 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
3737 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
3738 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3739 		mutex_enter(&rxp->rx_post_lock);
3740 	}
3741 	rwqe->rwqe_next = rxp->rx_head;
3742 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
3743 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
3744 
3745 		/* only call ibt_post_recv() every Nth time through here */
3746 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
3747 			rxp->rx_head = NULL;
3748 			rxp->rx_cnt = 0;
3749 			mutex_exit(&rxp->rx_post_lock);
3750 			ibd_post_recv_list(state, rwqe);
3751 			return;
3752 		}
3753 	}
3754 	rxp->rx_head = RWQE_TO_WQE(rwqe);
3755 	mutex_exit(&rxp->rx_post_lock);
3756 }
3757 
3758 static int
3759 ibd_alloc_rx_copybufs(ibd_state_t *state)
3760 {
3761 	ibt_mr_attr_t mem_attr;
3762 	int i;
3763 
3764 	/*
3765 	 * Allocate one big chunk for all regular rx copy bufs
3766 	 */
3767 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
3768 
3769 	state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe *
3770 	    state->id_rx_buf_sz, KM_SLEEP);
3771 
3772 	state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe *
3773 	    sizeof (ibd_rwqe_t), KM_SLEEP);
3774 
3775 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
3776 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
3777 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
3778 	for (i = 0; i < state->id_rx_nqueues; i++) {
3779 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3780 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
3781 	}
3782 
3783 	/*
3784 	 * Do one memory registration on the entire rxbuf area
3785 	 */
3786 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
3787 	mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz;
3788 	mem_attr.mr_as = NULL;
3789 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3790 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3791 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
3792 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
3793 		kmem_free(state->id_rx_wqes,
3794 		    state->id_num_rwqe * sizeof (ibd_rwqe_t));
3795 		kmem_free(state->id_rx_bufs,
3796 		    state->id_num_rwqe * state->id_rx_buf_sz);
3797 		state->id_rx_bufs = NULL;
3798 		state->id_rx_wqes = NULL;
3799 		return (DDI_FAILURE);
3800 	}
3801 
3802 	return (DDI_SUCCESS);
3803 }
3804 
3805 /*
3806  * Allocate the statically allocated Rx buffer list.
3807  */
3808 static int
3809 ibd_init_rxlist(ibd_state_t *state)
3810 {
3811 	ibd_rwqe_t *rwqe, *next;
3812 	ibd_wqe_t *list;
3813 	ibt_lkey_t lkey;
3814 	int i;
3815 	uint_t len;
3816 	uint8_t *bufaddr;
3817 
3818 	mutex_enter(&state->id_rx_free_list.dl_mutex);
3819 	if (state->id_rx_free_list.dl_head != NULL) {
3820 		/* rx rsrcs were never freed.  Just repost them */
3821 		len = state->id_rx_buf_sz;
3822 		list = state->id_rx_free_list.dl_head;
3823 		state->id_rx_free_list.dl_head = NULL;
3824 		state->id_rx_free_list.dl_cnt = 0;
3825 		mutex_exit(&state->id_rx_free_list.dl_mutex);
3826 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3827 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3828 			if ((rwqe->rwqe_im_mblk = desballoc(
3829 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
3830 			    &rwqe->w_freemsg_cb)) == NULL) {
3831 				/* allow freemsg_cb to free the rwqes */
3832 				if (atomic_dec_32_nv(&state->id_running) != 0) {
3833 					cmn_err(CE_WARN, "ibd_init_rxlist: "
3834 					    "id_running was not 1\n");
3835 				}
3836 				DPRINT(10, "ibd_init_rxlist : "
3837 				    "failed in desballoc()");
3838 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3839 				    rwqe = next) {
3840 					next = WQE_TO_RWQE(rwqe->rwqe_next);
3841 					freemsg(rwqe->rwqe_im_mblk);
3842 				}
3843 				atomic_inc_32(&state->id_running);
3844 				return (DDI_FAILURE);
3845 			}
3846 		}
3847 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
3848 		return (DDI_SUCCESS);
3849 	}
3850 	mutex_exit(&state->id_rx_free_list.dl_mutex);
3851 
3852 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
3853 		return (DDI_FAILURE);
3854 
3855 	/*
3856 	 * Allocate and setup the rwqe list
3857 	 */
3858 	len = state->id_rx_buf_sz;
3859 	lkey = state->id_rx_mr_desc.md_lkey;
3860 	rwqe = state->id_rx_wqes;
3861 	bufaddr = state->id_rx_bufs;
3862 	list = NULL;
3863 	for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) {
3864 		rwqe->w_state = state;
3865 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3866 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3867 
3868 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
3869 
3870 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
3871 		    &rwqe->w_freemsg_cb)) == NULL) {
3872 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
3873 			/* allow freemsg_cb to free the rwqes */
3874 			if (atomic_dec_32_nv(&state->id_running) != 0) {
3875 				cmn_err(CE_WARN, "ibd_init_rxlist: "
3876 				    "id_running was not 1\n");
3877 			}
3878 			DPRINT(10, "ibd_init_rxlist : "
3879 			    "failed in desballoc()");
3880 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3881 			    rwqe = next) {
3882 				next = WQE_TO_RWQE(rwqe->rwqe_next);
3883 				freemsg(rwqe->rwqe_im_mblk);
3884 			}
3885 			atomic_inc_32(&state->id_running);
3886 			return (DDI_FAILURE);
3887 		}
3888 
3889 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
3890 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
3891 		    (ib_vaddr_t)(uintptr_t)bufaddr;
3892 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
3893 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3894 		rwqe->w_rwr.wr_nds = 1;
3895 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3896 
3897 		rwqe->rwqe_next = list;
3898 		list = RWQE_TO_WQE(rwqe);
3899 	}
3900 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
3901 
3902 	return (DDI_SUCCESS);
3903 }
3904 
3905 static void
3906 ibd_free_rx_copybufs(ibd_state_t *state)
3907 {
3908 	int i;
3909 
3910 	/*
3911 	 * Unregister rxbuf mr
3912 	 */
3913 	if (ibt_deregister_mr(state->id_hca_hdl,
3914 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
3915 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
3916 	}
3917 	state->id_rx_mr_hdl = NULL;
3918 
3919 	/*
3920 	 * Free rxbuf memory
3921 	 */
3922 	for (i = 0; i < state->id_rx_nqueues; i++) {
3923 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3924 		mutex_destroy(&rxp->rx_post_lock);
3925 	}
3926 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
3927 	    sizeof (ibd_rx_queue_t));
3928 	kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t));
3929 	kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz);
3930 	state->id_rx_queues = NULL;
3931 	state->id_rx_wqes = NULL;
3932 	state->id_rx_bufs = NULL;
3933 }
3934 
3935 static void
3936 ibd_free_rx_rsrcs(ibd_state_t *state)
3937 {
3938 	mutex_enter(&state->id_rx_free_list.dl_mutex);
3939 	if (state->id_rx_free_list.dl_head == NULL) {
3940 		/* already freed */
3941 		mutex_exit(&state->id_rx_free_list.dl_mutex);
3942 		return;
3943 	}
3944 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe);
3945 	ibd_free_rx_copybufs(state);
3946 	state->id_rx_free_list.dl_cnt = 0;
3947 	state->id_rx_free_list.dl_head = NULL;
3948 	mutex_exit(&state->id_rx_free_list.dl_mutex);
3949 }
3950 
3951 /*
3952  * Free the statically allocated Rx buffer list.
3953  *
3954  */
3955 static void
3956 ibd_fini_rxlist(ibd_state_t *state)
3957 {
3958 	ibd_rwqe_t *rwqe;
3959 	int i;
3960 
3961 	/* run through the rx_queue's, calling freemsg() */
3962 	for (i = 0; i < state->id_rx_nqueues; i++) {
3963 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3964 		mutex_enter(&rxp->rx_post_lock);
3965 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
3966 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3967 			freemsg(rwqe->rwqe_im_mblk);
3968 			rxp->rx_cnt--;
3969 		}
3970 		rxp->rx_head = NULL;
3971 		mutex_exit(&rxp->rx_post_lock);
3972 	}
3973 
3974 	/* cannot free rx resources unless gld returned everything */
3975 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
3976 		ibd_free_rx_rsrcs(state);
3977 }
3978 
3979 /*
3980  * Free an allocated recv wqe.
3981  */
3982 /* ARGSUSED */
3983 static void
3984 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3985 {
3986 	/*
3987 	 * desballoc() failed (no memory).
3988 	 *
3989 	 * This rwqe is placed on a free list so that it
3990 	 * can be reinstated when memory is available.
3991 	 *
3992 	 * NOTE: no code currently exists to reinstate
3993 	 * these "lost" rwqes.
3994 	 */
3995 	mutex_enter(&state->id_rx_free_list.dl_mutex);
3996 	state->id_rx_free_list.dl_cnt++;
3997 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
3998 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
3999 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4000 }
4001 
4002 /*
4003  * IBA Rx completion queue handler. Guaranteed to be single
4004  * threaded and nonreentrant for this CQ.
4005  */
4006 /* ARGSUSED */
4007 static void
4008 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4009 {
4010 	ibd_state_t *state = (ibd_state_t *)arg;
4011 
4012 	atomic_inc_64(&state->id_num_intrs);
4013 
4014 	if (ibd_rx_softintr == 1) {
4015 		mutex_enter(&state->id_rcq_poll_lock);
4016 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4017 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4018 			mutex_exit(&state->id_rcq_poll_lock);
4019 			return;
4020 		} else {
4021 			mutex_exit(&state->id_rcq_poll_lock);
4022 			ddi_trigger_softintr(state->id_rx);
4023 		}
4024 	} else
4025 		(void) ibd_intr((caddr_t)state);
4026 }
4027 
4028 /*
4029  * CQ handler for Tx completions, when the Tx CQ is in
4030  * interrupt driven mode.
4031  */
4032 /* ARGSUSED */
4033 static void
4034 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4035 {
4036 	ibd_state_t *state = (ibd_state_t *)arg;
4037 
4038 	atomic_inc_64(&state->id_num_intrs);
4039 
4040 	if (ibd_tx_softintr == 1) {
4041 		mutex_enter(&state->id_scq_poll_lock);
4042 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4043 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4044 			mutex_exit(&state->id_scq_poll_lock);
4045 			return;
4046 		} else {
4047 			mutex_exit(&state->id_scq_poll_lock);
4048 			ddi_trigger_softintr(state->id_tx);
4049 		}
4050 	} else
4051 		(void) ibd_tx_recycle((caddr_t)state);
4052 }
4053 
4054 /*
4055  * Multicast group create/delete trap handler. These will be delivered
4056  * on a kernel thread (handling can thus block) and can be invoked
4057  * concurrently. The handler can be invoked anytime after it is
4058  * registered and before ibt_detach().
4059  */
4060 /* ARGSUSED */
4061 static void
4062 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4063     ibt_subnet_event_t *event)
4064 {
4065 	ibd_state_t *state = (ibd_state_t *)arg;
4066 	ibd_req_t *req;
4067 
4068 	/*
4069 	 * The trap handler will get invoked once for every event for
4070 	 * every port. The input "gid" is the GID0 of the port the
4071 	 * trap came in on; we just need to act on traps that came
4072 	 * to our port, meaning the port on which the ipoib interface
4073 	 * resides. Since ipoib uses GID0 of the port, we just match
4074 	 * the gids to check whether we need to handle the trap.
4075 	 */
4076 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4077 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4078 		return;
4079 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4080 
4081 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4082 
4083 	switch (code) {
4084 		case IBT_SM_EVENT_UNAVAILABLE:
4085 			/*
4086 			 * If we are in promiscuous mode or have
4087 			 * sendnonmembers, we need to print a warning
4088 			 * message right now. Else, just store the
4089 			 * information, print when we enter promiscuous
4090 			 * mode or attempt nonmember send. We might
4091 			 * also want to stop caching sendnonmember.
4092 			 */
4093 			ibd_print_warn(state, "IBA multicast support "
4094 			    "degraded due to unavailability of multicast "
4095 			    "traps");
4096 			break;
4097 		case IBT_SM_EVENT_AVAILABLE:
4098 			/*
4099 			 * If we printed a warning message above or
4100 			 * while trying to nonmember send or get into
4101 			 * promiscuous mode, print an okay message.
4102 			 */
4103 			ibd_print_warn(state, "IBA multicast support "
4104 			    "restored due to availability of multicast "
4105 			    "traps");
4106 			break;
4107 		case IBT_SM_EVENT_MCG_CREATED:
4108 		case IBT_SM_EVENT_MCG_DELETED:
4109 			/*
4110 			 * Common processing of creation/deletion traps.
4111 			 * First check if the instance is being
4112 			 * [de]initialized; back off then, without doing
4113 			 * anything more, since we are not sure if the
4114 			 * async thread is around, or whether we might
4115 			 * be racing with the detach code in ibd_m_stop()
4116 			 * that scans the mcg list.
4117 			 */
4118 			if (!ibd_async_safe(state))
4119 				return;
4120 
4121 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4122 			req->rq_gid = event->sm_notice_gid;
4123 			req->rq_ptr = (void *)code;
4124 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4125 			break;
4126 	}
4127 }
4128 
4129 static void
4130 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4131 {
4132 	ib_gid_t mgid = req->rq_gid;
4133 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4134 
4135 	DPRINT(10, "ibd_async_trap : %d\n", code);
4136 
4137 	/*
4138 	 * Atomically search the nonmember and sendonlymember lists and
4139 	 * delete.
4140 	 */
4141 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4142 
4143 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4144 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4145 
4146 		/*
4147 		 * If in promiscuous mode, try to join/attach to the new
4148 		 * mcg. Given the unreliable out-of-order mode of trap
4149 		 * delivery, we can never be sure whether it is a problem
4150 		 * if the join fails. Thus, we warn the admin of a failure
4151 		 * if this was a creation trap. Note that the trap might
4152 		 * actually be reporting a long past event, and the mcg
4153 		 * might already have been deleted, thus we might be warning
4154 		 * in vain.
4155 		 */
4156 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4157 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4158 			ibd_print_warn(state, "IBA promiscuous mode missed "
4159 			    "new multicast gid %016llx:%016llx",
4160 			    (u_longlong_t)mgid.gid_prefix,
4161 			    (u_longlong_t)mgid.gid_guid);
4162 	}
4163 
4164 	/*
4165 	 * Free the request slot allocated by the subnet event thread.
4166 	 */
4167 	ibd_async_done(state);
4168 }
4169 
4170 /*
4171  * GLDv3 entry point to get capabilities.
4172  */
4173 static boolean_t
4174 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4175 {
4176 	ibd_state_t *state = arg;
4177 
4178 	switch (cap) {
4179 	case MAC_CAPAB_HCKSUM: {
4180 		uint32_t *txflags = cap_data;
4181 
4182 		/*
4183 		 * We either do full checksum or not do it at all
4184 		 */
4185 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4186 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4187 		else
4188 			return (B_FALSE);
4189 		break;
4190 	}
4191 
4192 	case MAC_CAPAB_LSO: {
4193 		mac_capab_lso_t *cap_lso = cap_data;
4194 
4195 		/*
4196 		 * In addition to the capability and policy, since LSO
4197 		 * relies on hw checksum, we'll not enable LSO if we
4198 		 * don't have hw checksum.  Of course, if the HCA doesn't
4199 		 * provide the reserved lkey capability, enabling LSO will
4200 		 * actually affect performance adversely, so we'll disable
4201 		 * LSO even for that case.
4202 		 */
4203 		if (!state->id_lso_policy || !state->id_lso_capable)
4204 			return (B_FALSE);
4205 
4206 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4207 			return (B_FALSE);
4208 
4209 		if (state->id_hca_res_lkey_capab == 0) {
4210 			ibd_print_warn(state, "no reserved-lkey capability, "
4211 			    "disabling LSO");
4212 			return (B_FALSE);
4213 		}
4214 
4215 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4216 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4217 		break;
4218 	}
4219 
4220 	default:
4221 		return (B_FALSE);
4222 	}
4223 
4224 	return (B_TRUE);
4225 }
4226 
4227 static int
4228 ibd_get_port_details(ibd_state_t *state)
4229 {
4230 	ibt_hca_portinfo_t *port_infop;
4231 	ibt_status_t ret;
4232 	uint_t psize, port_infosz;
4233 
4234 	mutex_enter(&state->id_link_mutex);
4235 
4236 	/*
4237 	 * Query for port information
4238 	 */
4239 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4240 	    &port_infop, &psize, &port_infosz);
4241 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
4242 		mutex_exit(&state->id_link_mutex);
4243 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4244 		    "failed, ret=%d", ret);
4245 		return (ENETDOWN);
4246 	}
4247 
4248 	/*
4249 	 * If the link already went down by the time we get here,
4250 	 * give up
4251 	 */
4252 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
4253 		mutex_exit(&state->id_link_mutex);
4254 		ibt_free_portinfo(port_infop, port_infosz);
4255 		DPRINT(10, "ibd_get_port_details: port is not active");
4256 		return (ENETDOWN);
4257 	}
4258 
4259 	/*
4260 	 * If the link is active, verify the pkey
4261 	 */
4262 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4263 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4264 		mutex_exit(&state->id_link_mutex);
4265 		ibt_free_portinfo(port_infop, port_infosz);
4266 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
4267 		    "failed, ret=%d", ret);
4268 		return (ENONET);
4269 	}
4270 
4271 	state->id_mtu = (128 << port_infop->p_mtu);
4272 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4273 	state->id_sgid = *port_infop->p_sgid_tbl;
4274 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4275 	state->id_link_state = LINK_STATE_UP;
4276 
4277 	mutex_exit(&state->id_link_mutex);
4278 	ibt_free_portinfo(port_infop, port_infosz);
4279 
4280 	/*
4281 	 * Now that the port is active, record the port speed
4282 	 */
4283 	state->id_link_speed = ibd_get_portspeed(state);
4284 
4285 	return (0);
4286 }
4287 
4288 static int
4289 ibd_alloc_cqs(ibd_state_t *state)
4290 {
4291 	ibt_hca_attr_t hca_attrs;
4292 	ibt_cq_attr_t cq_attr;
4293 	ibt_status_t ret;
4294 	uint32_t real_size;
4295 
4296 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4297 	ASSERT(ret == IBT_SUCCESS);
4298 
4299 	/*
4300 	 * Allocate Rx/combined CQ:
4301 	 * Theoretically, there is no point in having more than #rwqe
4302 	 * plus #swqe cqe's, except that the CQ will be signaled for
4303 	 * overflow when the last wqe completes, if none of the previous
4304 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
4305 	 * to make sure such overflow does not occur.
4306 	 */
4307 	cq_attr.cq_sched = NULL;
4308 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
4309 
4310 	/*
4311 	 * Allocate Receive CQ.
4312 	 */
4313 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
4314 		cq_attr.cq_size = state->id_num_rwqe + 1;
4315 	} else {
4316 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4317 		state->id_num_rwqe = cq_attr.cq_size - 1;
4318 	}
4319 
4320 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4321 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4322 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
4323 		    "failed, ret=%d\n", ret);
4324 		return (DDI_FAILURE);
4325 	}
4326 
4327 	if ((ret = ibt_modify_cq(state->id_rcq_hdl,
4328 	    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
4329 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
4330 		    "moderation failed, ret=%d\n", ret);
4331 	}
4332 
4333 	/* make the #rx wc's the same as max rx chain size */
4334 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
4335 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4336 	    state->id_rxwcs_size, KM_SLEEP);
4337 
4338 	/*
4339 	 * Allocate Send CQ.
4340 	 */
4341 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
4342 		cq_attr.cq_size = state->id_num_swqe + 1;
4343 	} else {
4344 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4345 		state->id_num_swqe = cq_attr.cq_size - 1;
4346 	}
4347 
4348 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4349 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
4350 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
4351 		    "failed, ret=%d\n", ret);
4352 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
4353 		    state->id_rxwcs_size);
4354 		(void) ibt_free_cq(state->id_rcq_hdl);
4355 		return (DDI_FAILURE);
4356 	}
4357 	if ((ret = ibt_modify_cq(state->id_scq_hdl,
4358 	    ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) {
4359 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
4360 		    "moderation failed, ret=%d\n", ret);
4361 	}
4362 
4363 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
4364 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
4365 	    state->id_txwcs_size, KM_SLEEP);
4366 
4367 	/*
4368 	 * Print message in case we could not allocate as many wqe's
4369 	 * as was requested.
4370 	 */
4371 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
4372 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
4373 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
4374 	}
4375 	if (state->id_num_swqe != IBD_NUM_SWQE) {
4376 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
4377 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
4378 	}
4379 
4380 	return (DDI_SUCCESS);
4381 }
4382 
4383 static int
4384 ibd_setup_ud_channel(ibd_state_t *state)
4385 {
4386 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4387 	ibt_ud_chan_query_attr_t ud_chan_attr;
4388 	ibt_status_t ret;
4389 
4390 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
4391 	if (state->id_hca_res_lkey_capab)
4392 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4393 	if (state->id_lso_policy && state->id_lso_capable)
4394 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4395 
4396 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4397 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4398 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4399 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4400 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4401 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4402 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4403 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4404 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4405 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4406 	ud_alloc_attr.ud_clone_chan	= NULL;
4407 
4408 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4409 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4410 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4411 		    "failed, ret=%d\n", ret);
4412 		return (DDI_FAILURE);
4413 	}
4414 
4415 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4416 	    &ud_chan_attr)) != IBT_SUCCESS) {
4417 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4418 		    "failed, ret=%d\n", ret);
4419 		(void) ibt_free_channel(state->id_chnl_hdl);
4420 		return (DDI_FAILURE);
4421 	}
4422 
4423 	state->id_qpnum = ud_chan_attr.ud_qpn;
4424 
4425 	return (DDI_SUCCESS);
4426 }
4427 
4428 static int
4429 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
4430 {
4431 	uint32_t progress = state->id_mac_state;
4432 	uint_t attempts;
4433 	ibt_status_t ret;
4434 	ib_gid_t mgid;
4435 	ibd_mce_t *mce;
4436 	uint8_t jstate;
4437 
4438 	if (atomic_dec_32_nv(&state->id_running) != 0)
4439 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
4440 
4441 	/*
4442 	 * Before we try to stop/undo whatever we did in ibd_start(),
4443 	 * we need to mark the link state appropriately to prevent the
4444 	 * ip layer from using this instance for any new transfers. Note
4445 	 * that if the original state of the link was "up" when we're
4446 	 * here, we'll set the final link state to "unknown", to behave
4447 	 * in the same fashion as other ethernet drivers.
4448 	 */
4449 	mutex_enter(&state->id_link_mutex);
4450 	if (cur_link_state == LINK_STATE_DOWN) {
4451 		state->id_link_state = cur_link_state;
4452 	} else {
4453 		state->id_link_state = LINK_STATE_UNKNOWN;
4454 	}
4455 	mutex_exit(&state->id_link_mutex);
4456 	mac_link_update(state->id_mh, state->id_link_state);
4457 
4458 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
4459 	if (progress & IBD_DRV_STARTED) {
4460 		state->id_mac_state &= (~IBD_DRV_STARTED);
4461 	}
4462 
4463 	/*
4464 	 * First, stop receive interrupts; this stops the driver from
4465 	 * handing up buffers to higher layers.  Wait for receive buffers
4466 	 * to be returned and give up after 1 second.
4467 	 */
4468 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
4469 		attempts = 10;
4470 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
4471 		    0) > 0) {
4472 			delay(drv_usectohz(100000));
4473 			if (--attempts == 0) {
4474 				/*
4475 				 * There are pending bufs with the network
4476 				 * layer and we have no choice but to wait
4477 				 * for them to be done with. Reap all the
4478 				 * Tx/Rx completions that were posted since
4479 				 * we turned off the notification and
4480 				 * return failure.
4481 				 */
4482 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
4483 				DPRINT(2, "ibd_undo_start: "
4484 				    "reclaiming failed");
4485 				break;
4486 			}
4487 		}
4488 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
4489 	}
4490 
4491 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
4492 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4493 
4494 		mutex_enter(&state->id_trap_lock);
4495 		state->id_trap_stop = B_TRUE;
4496 		while (state->id_trap_inprog > 0)
4497 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4498 		mutex_exit(&state->id_trap_lock);
4499 
4500 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
4501 	}
4502 
4503 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
4504 		/*
4505 		 * Flushing the channel ensures that all pending WQE's
4506 		 * are marked with flush_error and handed to the CQ. It
4507 		 * does not guarantee the invocation of the CQ handler.
4508 		 * This call is guaranteed to return successfully for
4509 		 * UD QPNs.
4510 		 */
4511 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
4512 		    IBT_SUCCESS) {
4513 			DPRINT(10, "ibd_undo_start: flush_channel "
4514 			    "failed, ret=%d", ret);
4515 		}
4516 
4517 		/*
4518 		 * Give some time for the TX CQ handler to process the
4519 		 * completions.
4520 		 */
4521 		mutex_enter(&state->id_tx_list.dl_mutex);
4522 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
4523 		attempts = 10;
4524 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
4525 		    != state->id_num_swqe) {
4526 			if (--attempts == 0)
4527 				break;
4528 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
4529 			mutex_exit(&state->id_tx_list.dl_mutex);
4530 			delay(drv_usectohz(100000));
4531 			mutex_enter(&state->id_tx_list.dl_mutex);
4532 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
4533 		}
4534 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4535 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
4536 		    state->id_num_swqe) {
4537 			cmn_err(CE_WARN, "tx resources not freed\n");
4538 		}
4539 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
4540 		mutex_exit(&state->id_tx_list.dl_mutex);
4541 
4542 		attempts = 10;
4543 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
4544 			if (--attempts == 0)
4545 				break;
4546 			delay(drv_usectohz(100000));
4547 		}
4548 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
4549 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
4550 			cmn_err(CE_WARN, "rx resources not freed\n");
4551 		}
4552 
4553 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
4554 	}
4555 
4556 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
4557 		/*
4558 		 * No new async requests will be posted since the device
4559 		 * link state has been marked as unknown; completion handlers
4560 		 * have been turned off, so Tx handler will not cause any
4561 		 * more IBD_ASYNC_REAP requests.
4562 		 *
4563 		 * Queue a request for the async thread to exit, which will
4564 		 * be serviced after any pending ones. This can take a while,
4565 		 * specially if the SM is unreachable, since IBMF will slowly
4566 		 * timeout each SM request issued by the async thread.  Reap
4567 		 * the thread before continuing on, we do not want it to be
4568 		 * lingering in modunloaded code (or we could move the reap
4569 		 * to ibd_detach(), provided we keep track of the current
4570 		 * id_async_thrid somewhere safe).
4571 		 */
4572 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4573 		thread_join(state->id_async_thrid);
4574 
4575 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
4576 	}
4577 
4578 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
4579 		/*
4580 		 * Drop all residual full/non membership. This includes full
4581 		 * membership to the broadcast group, and any nonmembership
4582 		 * acquired during transmits. We do this after the Tx completion
4583 		 * handlers are done, since those might result in some late
4584 		 * leaves; this also eliminates a potential race with that
4585 		 * path wrt the mc full list insert/delete. Trap handling
4586 		 * has also been suppressed at this point. Thus, no locks
4587 		 * are required while traversing the mc full list.
4588 		 */
4589 		DPRINT(2, "ibd_undo_start: clear full cache entries");
4590 		mce = list_head(&state->id_mc_full);
4591 		while (mce != NULL) {
4592 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
4593 			jstate = mce->mc_jstate;
4594 			mce = list_next(&state->id_mc_full, mce);
4595 			ibd_leave_group(state, mgid, jstate);
4596 		}
4597 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
4598 	}
4599 
4600 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
4601 		ibd_fini_rxlist(state);
4602 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
4603 	}
4604 
4605 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
4606 		ibd_fini_txlist(state);
4607 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
4608 	}
4609 
4610 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
4611 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
4612 		    IBT_SUCCESS) {
4613 			DPRINT(10, "ibd_undo_start: free_channel "
4614 			    "failed, ret=%d", ret);
4615 		}
4616 
4617 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
4618 	}
4619 
4620 	if (progress & IBD_DRV_CQS_ALLOCD) {
4621 		kmem_free(state->id_txwcs,
4622 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
4623 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
4624 		    IBT_SUCCESS) {
4625 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
4626 			    "failed, ret=%d", ret);
4627 		}
4628 
4629 		kmem_free(state->id_rxwcs,
4630 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
4631 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
4632 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
4633 			    "ret=%d", ret);
4634 		}
4635 
4636 		state->id_txwcs = NULL;
4637 		state->id_rxwcs = NULL;
4638 		state->id_scq_hdl = NULL;
4639 		state->id_rcq_hdl = NULL;
4640 
4641 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
4642 	}
4643 
4644 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
4645 		mutex_enter(&state->id_ac_mutex);
4646 		mod_hash_destroy_hash(state->id_ah_active_hash);
4647 		mutex_exit(&state->id_ac_mutex);
4648 		ibd_acache_fini(state);
4649 
4650 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
4651 	}
4652 
4653 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
4654 		/*
4655 		 * If we'd created the ipoib broadcast group and had
4656 		 * successfully joined it, leave it now
4657 		 */
4658 		if (state->id_bgroup_created) {
4659 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
4660 			jstate = IB_MC_JSTATE_FULL;
4661 			(void) ibt_leave_mcg(state->id_sgid, mgid,
4662 			    state->id_sgid, jstate);
4663 		}
4664 		ibt_free_mcg_info(state->id_mcinfo, 1);
4665 
4666 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
4667 	}
4668 
4669 	return (DDI_SUCCESS);
4670 }
4671 
4672 /*
4673  * These pair of routines are used to set/clear the condition that
4674  * the caller is likely to do something to change the id_mac_state.
4675  * If there's already someone doing either a start or a stop (possibly
4676  * due to the async handler detecting a pkey relocation event, a plumb
4677  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
4678  * that's done.
4679  */
4680 static void
4681 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
4682 {
4683 	mutex_enter(&state->id_macst_lock);
4684 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
4685 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
4686 
4687 	state->id_mac_state |= flag;
4688 	mutex_exit(&state->id_macst_lock);
4689 }
4690 
4691 static void
4692 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
4693 {
4694 	mutex_enter(&state->id_macst_lock);
4695 	state->id_mac_state &= (~flag);
4696 	cv_signal(&state->id_macst_cv);
4697 	mutex_exit(&state->id_macst_lock);
4698 }
4699 
4700 /*
4701  * GLDv3 entry point to start hardware.
4702  */
4703 /*ARGSUSED*/
4704 static int
4705 ibd_m_start(void *arg)
4706 {
4707 	ibd_state_t *state = arg;
4708 	int	ret;
4709 
4710 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4711 
4712 	ret = ibd_start(state);
4713 
4714 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4715 
4716 	return (ret);
4717 }
4718 
4719 static int
4720 ibd_start(ibd_state_t *state)
4721 {
4722 	kthread_t *kht;
4723 	int err;
4724 	ibt_status_t ret;
4725 
4726 	if (state->id_mac_state & IBD_DRV_STARTED)
4727 		return (DDI_SUCCESS);
4728 
4729 	if (atomic_inc_32_nv(&state->id_running) != 1) {
4730 		DPRINT(10, "ibd_start: id_running is non-zero");
4731 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
4732 		atomic_dec_32(&state->id_running);
4733 		return (EINVAL);
4734 	}
4735 
4736 	/*
4737 	 * Get port details; if we fail here, very likely the port
4738 	 * state is inactive or the pkey can't be verified.
4739 	 */
4740 	if ((err = ibd_get_port_details(state)) != 0) {
4741 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
4742 		goto start_fail;
4743 	}
4744 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4745 
4746 	/*
4747 	 * Find the IPoIB broadcast group
4748 	 */
4749 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4750 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
4751 		err = ENOTACTIVE;
4752 		goto start_fail;
4753 	}
4754 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4755 
4756 	/*
4757 	 * Initialize per-interface caches and lists; if we fail here,
4758 	 * it is most likely due to a lack of resources
4759 	 */
4760 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4761 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
4762 		err = ENOMEM;
4763 		goto start_fail;
4764 	}
4765 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4766 
4767 	/*
4768 	 * Allocate send and receive completion queues
4769 	 */
4770 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4771 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
4772 		err = ENOMEM;
4773 		goto start_fail;
4774 	}
4775 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4776 
4777 	/*
4778 	 * Setup a UD channel
4779 	 */
4780 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4781 		err = ENOMEM;
4782 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
4783 		goto start_fail;
4784 	}
4785 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4786 
4787 	/*
4788 	 * Allocate and initialize the tx buffer list
4789 	 */
4790 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4791 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
4792 		err = ENOMEM;
4793 		goto start_fail;
4794 	}
4795 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4796 
4797 	/*
4798 	 * Create the send cq handler here
4799 	 */
4800 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4801 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
4802 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4803 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
4804 		    "failed, ret=%d", ret);
4805 		err = EINVAL;
4806 		goto start_fail;
4807 	}
4808 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4809 
4810 	/*
4811 	 * Allocate and initialize the rx buffer list
4812 	 */
4813 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4814 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
4815 		err = ENOMEM;
4816 		goto start_fail;
4817 	}
4818 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4819 
4820 	/*
4821 	 * Join IPoIB broadcast group
4822 	 */
4823 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4824 		DPRINT(10, "ibd_start: ibd_join_group() failed");
4825 		err = ENOTACTIVE;
4826 		goto start_fail;
4827 	}
4828 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4829 
4830 	/*
4831 	 * Create the async thread; thread_create never fails.
4832 	 */
4833 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4834 	    TS_RUN, minclsyspri);
4835 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4836 	state->id_async_thrid = kht->t_did;
4837 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4838 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4839 
4840 	/*
4841 	 * When we did mac_register() in ibd_attach(), we didn't register
4842 	 * the real macaddr and we didn't have the true port mtu. Now that
4843 	 * we're almost ready, set the local mac address and broadcast
4844 	 * addresses and update gldv3 about the real values of these
4845 	 * parameters.
4846 	 */
4847 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4848 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4849 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4850 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4851 
4852 	(void) mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
4853 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4854 
4855 	/*
4856 	 * Setup the receive cq handler
4857 	 */
4858 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4859 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
4860 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4861 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
4862 		    "failed, ret=%d", ret);
4863 		err = EINVAL;
4864 		goto start_fail;
4865 	}
4866 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4867 
4868 	/*
4869 	 * Setup the subnet notices handler after we've initialized the acache/
4870 	 * mcache and started the async thread, both of which are required for
4871 	 * the trap handler to function properly.
4872 	 *
4873 	 * Now that the async thread has been started (and we've already done
4874 	 * a mac_register() during attach so mac_tx_update() can be called
4875 	 * if necessary without any problem), we can enable the trap handler
4876 	 * to queue requests to the async thread.
4877 	 */
4878 	ibt_register_subnet_notices(state->id_ibt_hdl,
4879 	    ibd_snet_notices_handler, state);
4880 	mutex_enter(&state->id_trap_lock);
4881 	state->id_trap_stop = B_FALSE;
4882 	mutex_exit(&state->id_trap_lock);
4883 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4884 
4885 	/*
4886 	 * Indicate link status to GLDv3 and higher layers. By default,
4887 	 * we assume we are in up state (which must have been true at
4888 	 * least at the time the broadcast mcg's were probed); if there
4889 	 * were any up/down transitions till the time we come here, the
4890 	 * async handler will have updated last known state, which we
4891 	 * use to tell GLDv3. The async handler will not send any
4892 	 * notifications to GLDv3 till we reach here in the initialization
4893 	 * sequence.
4894 	 */
4895 	state->id_mac_state |= IBD_DRV_STARTED;
4896 	mac_link_update(state->id_mh, state->id_link_state);
4897 
4898 	return (DDI_SUCCESS);
4899 
4900 start_fail:
4901 	/*
4902 	 * If we ran into a problem during ibd_start() and ran into
4903 	 * some other problem during undoing our partial work, we can't
4904 	 * do anything about it.  Ignore any errors we might get from
4905 	 * ibd_undo_start() and just return the original error we got.
4906 	 */
4907 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
4908 	return (err);
4909 }
4910 
4911 /*
4912  * GLDv3 entry point to stop hardware from receiving packets.
4913  */
4914 /*ARGSUSED*/
4915 static void
4916 ibd_m_stop(void *arg)
4917 {
4918 	ibd_state_t *state = (ibd_state_t *)arg;
4919 
4920 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4921 
4922 	(void) ibd_undo_start(state, state->id_link_state);
4923 
4924 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4925 }
4926 
4927 /*
4928  * GLDv3 entry point to modify device's mac address. We do not
4929  * allow address modifications.
4930  */
4931 static int
4932 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4933 {
4934 	ibd_state_t *state = arg;
4935 
4936 	/*
4937 	 * Don't bother even comparing the macaddr if we haven't
4938 	 * completed ibd_m_start().
4939 	 */
4940 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4941 		return (0);
4942 
4943 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4944 		return (0);
4945 	else
4946 		return (EINVAL);
4947 }
4948 
4949 /*
4950  * The blocking part of the IBA join/leave operations are done out
4951  * of here on the async thread.
4952  */
4953 static void
4954 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4955 {
4956 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4957 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4958 
4959 	if (op == IBD_ASYNC_JOIN) {
4960 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4961 			ibd_print_warn(state, "Join multicast group failed :"
4962 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4963 		}
4964 	} else {
4965 		/*
4966 		 * Here, we must search for the proper mcg_info and
4967 		 * use that to leave the group.
4968 		 */
4969 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4970 	}
4971 }
4972 
4973 /*
4974  * GLDv3 entry point for multicast enable/disable requests.
4975  * This function queues the operation to the async thread and
4976  * return success for a valid multicast address.
4977  */
4978 static int
4979 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4980 {
4981 	ibd_state_t *state = (ibd_state_t *)arg;
4982 	ipoib_mac_t maddr, *mcast;
4983 	ib_gid_t mgid;
4984 	ibd_req_t *req;
4985 
4986 	/*
4987 	 * If we haven't completed ibd_m_start(), async thread wouldn't
4988 	 * have been started and id_bcaddr wouldn't be set, so there's
4989 	 * no point in continuing.
4990 	 */
4991 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4992 		return (0);
4993 
4994 	/*
4995 	 * The incoming multicast address might not be aligned properly
4996 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4997 	 * it to look like one though, to get the offsets of the mc gid,
4998 	 * since we know we are not going to dereference any values with
4999 	 * the ipoib_mac_t pointer.
5000 	 */
5001 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
5002 	mcast = &maddr;
5003 
5004 	/*
5005 	 * Check validity of MCG address. We could additionally check
5006 	 * that a enable/disable is not being issued on the "broadcast"
5007 	 * mcg, but since this operation is only invokable by privileged
5008 	 * programs anyway, we allow the flexibility to those dlpi apps.
5009 	 * Note that we do not validate the "scope" of the IBA mcg.
5010 	 */
5011 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
5012 		return (EINVAL);
5013 
5014 	/*
5015 	 * fill in multicast pkey and scope
5016 	 */
5017 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
5018 
5019 	/*
5020 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
5021 	 * nothing (i.e. we stay JOINed to the broadcast group done in
5022 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
5023 	 * requires to be joined to broadcast groups at all times.
5024 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
5025 	 * depends on this.
5026 	 */
5027 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5028 		return (0);
5029 
5030 	ibd_n2h_gid(mcast, &mgid);
5031 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5032 	if (req == NULL)
5033 		return (ENOMEM);
5034 
5035 	req->rq_gid = mgid;
5036 
5037 	if (add) {
5038 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
5039 		    mgid.gid_prefix, mgid.gid_guid);
5040 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
5041 	} else {
5042 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
5043 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5044 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
5045 	}
5046 	return (0);
5047 }
5048 
5049 /*
5050  * The blocking part of the IBA promiscuous operations are done
5051  * out of here on the async thread. The dlpireq parameter indicates
5052  * whether this invocation is due to a dlpi request or due to
5053  * a port up/down event.
5054  */
5055 static void
5056 ibd_async_unsetprom(ibd_state_t *state)
5057 {
5058 	ibd_mce_t *mce = list_head(&state->id_mc_non);
5059 	ib_gid_t mgid;
5060 
5061 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
5062 
5063 	while (mce != NULL) {
5064 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
5065 		mce = list_next(&state->id_mc_non, mce);
5066 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
5067 	}
5068 	state->id_prom_op = IBD_OP_NOTSTARTED;
5069 }
5070 
5071 /*
5072  * The blocking part of the IBA promiscuous operations are done
5073  * out of here on the async thread. The dlpireq parameter indicates
5074  * whether this invocation is due to a dlpi request or due to
5075  * a port up/down event.
5076  */
5077 static void
5078 ibd_async_setprom(ibd_state_t *state)
5079 {
5080 	ibt_mcg_attr_t mcg_attr;
5081 	ibt_mcg_info_t *mcg_info;
5082 	ib_gid_t mgid;
5083 	uint_t numg;
5084 	int i;
5085 	char ret = IBD_OP_COMPLETED;
5086 
5087 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
5088 
5089 	/*
5090 	 * Obtain all active MC groups on the IB fabric with
5091 	 * specified criteria (scope + Pkey + Qkey + mtu).
5092 	 */
5093 	bzero(&mcg_attr, sizeof (mcg_attr));
5094 	mcg_attr.mc_pkey = state->id_pkey;
5095 	mcg_attr.mc_scope = state->id_scope;
5096 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
5097 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
5098 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
5099 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
5100 	    IBT_SUCCESS) {
5101 		ibd_print_warn(state, "Could not get list of IBA multicast "
5102 		    "groups");
5103 		ret = IBD_OP_ERRORED;
5104 		goto done;
5105 	}
5106 
5107 	/*
5108 	 * Iterate over the returned mcg's and join as NonMember
5109 	 * to the IP mcg's.
5110 	 */
5111 	for (i = 0; i < numg; i++) {
5112 		/*
5113 		 * Do a NonMember JOIN on the MC group.
5114 		 */
5115 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
5116 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
5117 			ibd_print_warn(state, "IBA promiscuous mode missed "
5118 			    "multicast gid %016llx:%016llx",
5119 			    (u_longlong_t)mgid.gid_prefix,
5120 			    (u_longlong_t)mgid.gid_guid);
5121 	}
5122 
5123 	ibt_free_mcg_info(mcg_info, numg);
5124 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
5125 done:
5126 	state->id_prom_op = ret;
5127 }
5128 
5129 /*
5130  * GLDv3 entry point for multicast promiscuous enable/disable requests.
5131  * GLDv3 assumes phys state receives more packets than multi state,
5132  * which is not true for IPoIB. Thus, treat the multi and phys
5133  * promiscuous states the same way to work with GLDv3's assumption.
5134  */
5135 static int
5136 ibd_m_promisc(void *arg, boolean_t on)
5137 {
5138 	ibd_state_t *state = (ibd_state_t *)arg;
5139 	ibd_req_t *req;
5140 
5141 	/*
5142 	 * Async thread wouldn't have been started if we haven't
5143 	 * passed ibd_m_start()
5144 	 */
5145 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5146 		return (0);
5147 
5148 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5149 	if (req == NULL)
5150 		return (ENOMEM);
5151 	if (on) {
5152 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
5153 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
5154 	} else {
5155 		DPRINT(1, "ibd_m_promisc : unset_promisc");
5156 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
5157 	}
5158 
5159 	return (0);
5160 }
5161 
5162 /*
5163  * GLDv3 entry point for gathering statistics.
5164  */
5165 static int
5166 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
5167 {
5168 	ibd_state_t *state = (ibd_state_t *)arg;
5169 
5170 	switch (stat) {
5171 	case MAC_STAT_IFSPEED:
5172 		*val = state->id_link_speed;
5173 		break;
5174 	case MAC_STAT_MULTIRCV:
5175 		*val = state->id_multi_rcv;
5176 		break;
5177 	case MAC_STAT_BRDCSTRCV:
5178 		*val = state->id_brd_rcv;
5179 		break;
5180 	case MAC_STAT_MULTIXMT:
5181 		*val = state->id_multi_xmt;
5182 		break;
5183 	case MAC_STAT_BRDCSTXMT:
5184 		*val = state->id_brd_xmt;
5185 		break;
5186 	case MAC_STAT_RBYTES:
5187 		*val = state->id_rcv_bytes;
5188 		break;
5189 	case MAC_STAT_IPACKETS:
5190 		*val = state->id_rcv_pkt;
5191 		break;
5192 	case MAC_STAT_OBYTES:
5193 		*val = state->id_xmt_bytes;
5194 		break;
5195 	case MAC_STAT_OPACKETS:
5196 		*val = state->id_xmt_pkt;
5197 		break;
5198 	case MAC_STAT_OERRORS:
5199 		*val = state->id_ah_error;	/* failed AH translation */
5200 		break;
5201 	case MAC_STAT_IERRORS:
5202 		*val = 0;
5203 		break;
5204 	case MAC_STAT_NOXMTBUF:
5205 		*val = state->id_tx_short;
5206 		break;
5207 	case MAC_STAT_NORCVBUF:
5208 	default:
5209 		return (ENOTSUP);
5210 	}
5211 
5212 	return (0);
5213 }
5214 
5215 static void
5216 ibd_async_txsched(ibd_state_t *state)
5217 {
5218 	ibd_resume_transmission(state);
5219 }
5220 
5221 static void
5222 ibd_resume_transmission(ibd_state_t *state)
5223 {
5224 	int flag;
5225 	int met_thresh = 0;
5226 	int thresh = 0;
5227 	int ret = -1;
5228 
5229 	mutex_enter(&state->id_sched_lock);
5230 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
5231 		mutex_enter(&state->id_tx_list.dl_mutex);
5232 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5233 		met_thresh = state->id_tx_list.dl_cnt +
5234 		    state->id_tx_rel_list.dl_cnt;
5235 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5236 		mutex_exit(&state->id_tx_list.dl_mutex);
5237 		thresh = IBD_FREE_SWQES_THRESH;
5238 		flag = IBD_RSRC_SWQE;
5239 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
5240 		ASSERT(state->id_lso != NULL);
5241 		mutex_enter(&state->id_lso_lock);
5242 		met_thresh = state->id_lso->bkt_nfree;
5243 		thresh = IBD_FREE_LSOS_THRESH;
5244 		mutex_exit(&state->id_lso_lock);
5245 		flag = IBD_RSRC_LSOBUF;
5246 		if (met_thresh > thresh)
5247 			state->id_sched_lso_cnt++;
5248 	}
5249 	if (met_thresh > thresh) {
5250 		state->id_sched_needed &= ~flag;
5251 		state->id_sched_cnt++;
5252 		ret = 0;
5253 	}
5254 	mutex_exit(&state->id_sched_lock);
5255 
5256 	if (ret == 0)
5257 		mac_tx_update(state->id_mh);
5258 }
5259 
5260 /*
5261  * Release the send wqe back into free list.
5262  */
5263 static void
5264 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
5265 {
5266 	/*
5267 	 * Add back on Tx list for reuse.
5268 	 */
5269 	ASSERT(tail->swqe_next == NULL);
5270 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5271 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
5272 	tail->swqe_next = state->id_tx_rel_list.dl_head;
5273 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
5274 	state->id_tx_rel_list.dl_cnt += n;
5275 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
5276 }
5277 
5278 /*
5279  * Acquire a send wqe from free list.
5280  * Returns error number and send wqe pointer.
5281  */
5282 static ibd_swqe_t *
5283 ibd_acquire_swqe(ibd_state_t *state)
5284 {
5285 	ibd_swqe_t *wqe;
5286 
5287 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5288 	if (state->id_tx_rel_list.dl_head != NULL) {
5289 		/* transfer id_tx_rel_list to id_tx_list */
5290 		state->id_tx_list.dl_head =
5291 		    state->id_tx_rel_list.dl_head;
5292 		state->id_tx_list.dl_cnt =
5293 		    state->id_tx_rel_list.dl_cnt;
5294 		state->id_tx_list.dl_pending_sends = B_FALSE;
5295 
5296 		/* clear id_tx_rel_list */
5297 		state->id_tx_rel_list.dl_head = NULL;
5298 		state->id_tx_rel_list.dl_cnt = 0;
5299 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5300 
5301 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
5302 		state->id_tx_list.dl_cnt -= 1;
5303 		state->id_tx_list.dl_head = wqe->swqe_next;
5304 	} else {	/* no free swqe */
5305 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5306 		state->id_tx_list.dl_pending_sends = B_TRUE;
5307 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
5308 		state->id_tx_short++;
5309 		wqe = NULL;
5310 	}
5311 	return (wqe);
5312 }
5313 
5314 static int
5315 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
5316     ibt_ud_dest_hdl_t ud_dest)
5317 {
5318 	mblk_t	*nmp;
5319 	int iph_len, tcph_len;
5320 	ibt_wr_lso_t *lso;
5321 	uintptr_t ip_start, tcp_start;
5322 	uint8_t *dst;
5323 	uint_t pending, mblen;
5324 
5325 	/*
5326 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
5327 	 * we need to adjust it here for lso.
5328 	 */
5329 	lso = &(node->w_swr.wr.ud_lso);
5330 	lso->lso_ud_dest = ud_dest;
5331 	lso->lso_mss = mss;
5332 
5333 	/*
5334 	 * Calculate the LSO header size and set it in the UD LSO structure.
5335 	 * Note that the only assumption we make is that each of the IPoIB,
5336 	 * IP and TCP headers will be contained in a single mblk fragment;
5337 	 * together, the headers may span multiple mblk fragments.
5338 	 */
5339 	nmp = mp;
5340 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
5341 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
5342 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
5343 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
5344 		nmp = nmp->b_cont;
5345 
5346 	}
5347 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
5348 
5349 	tcp_start = ip_start + iph_len;
5350 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
5351 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
5352 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
5353 		nmp = nmp->b_cont;
5354 	}
5355 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
5356 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
5357 
5358 	/*
5359 	 * If the lso header fits entirely within a single mblk fragment,
5360 	 * we'll avoid an additional copy of the lso header here and just
5361 	 * pass the b_rptr of the mblk directly.
5362 	 *
5363 	 * If this isn't true, we'd have to allocate for it explicitly.
5364 	 */
5365 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
5366 		lso->lso_hdr = mp->b_rptr;
5367 	} else {
5368 		/* On work completion, remember to free this allocated hdr */
5369 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
5370 		if (lso->lso_hdr == NULL) {
5371 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
5372 			    "sz = %d", lso->lso_hdr_sz);
5373 			lso->lso_hdr_sz = 0;
5374 			lso->lso_mss = 0;
5375 			return (-1);
5376 		}
5377 	}
5378 
5379 	/*
5380 	 * Copy in the lso header only if we need to
5381 	 */
5382 	if (lso->lso_hdr != mp->b_rptr) {
5383 		dst = lso->lso_hdr;
5384 		pending = lso->lso_hdr_sz;
5385 
5386 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
5387 			mblen = MBLKL(nmp);
5388 			if (pending > mblen) {
5389 				bcopy(nmp->b_rptr, dst, mblen);
5390 				dst += mblen;
5391 				pending -= mblen;
5392 			} else {
5393 				bcopy(nmp->b_rptr, dst, pending);
5394 				break;
5395 			}
5396 		}
5397 	}
5398 
5399 	return (0);
5400 }
5401 
5402 static void
5403 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5404 {
5405 	ibt_wr_lso_t *lso;
5406 
5407 	if ((!node) || (!mp))
5408 		return;
5409 
5410 	/*
5411 	 * Free any header space that we might've allocated if we
5412 	 * did an LSO
5413 	 */
5414 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5415 		lso = &(node->w_swr.wr.ud_lso);
5416 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5417 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5418 			lso->lso_hdr = NULL;
5419 			lso->lso_hdr_sz = 0;
5420 		}
5421 	}
5422 }
5423 
5424 static void
5425 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
5426 {
5427 	uint_t		i;
5428 	uint_t		num_posted;
5429 	uint_t		n_wrs;
5430 	ibt_status_t	ibt_status;
5431 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
5432 	ibd_swqe_t	*tx_head, *elem;
5433 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
5434 
5435 	/* post the one request, then check for more */
5436 	ibt_status = ibt_post_send(state->id_chnl_hdl,
5437 	    &node->w_swr, 1, NULL);
5438 	if (ibt_status != IBT_SUCCESS) {
5439 		ibd_print_warn(state, "ibd_post_send: "
5440 		    "posting one wr failed: ret=%d", ibt_status);
5441 		ibd_tx_cleanup(state, node);
5442 	}
5443 
5444 	tx_head = NULL;
5445 	for (;;) {
5446 		if (tx_head == NULL) {
5447 			mutex_enter(&state->id_txpost_lock);
5448 			tx_head = state->id_tx_head;
5449 			if (tx_head == NULL) {
5450 				state->id_tx_busy = 0;
5451 				mutex_exit(&state->id_txpost_lock);
5452 				return;
5453 			}
5454 			state->id_tx_head = NULL;
5455 			mutex_exit(&state->id_txpost_lock);
5456 		}
5457 
5458 		/*
5459 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
5460 		 * at a time if possible, and keep posting them.
5461 		 */
5462 		for (n_wrs = 0, elem = tx_head;
5463 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
5464 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
5465 			nodes[n_wrs] = elem;
5466 			wrs[n_wrs] = elem->w_swr;
5467 		}
5468 		tx_head = elem;
5469 
5470 		ASSERT(n_wrs != 0);
5471 
5472 		/*
5473 		 * If posting fails for some reason, we'll never receive
5474 		 * completion intimation, so we'll need to cleanup. But
5475 		 * we need to make sure we don't clean up nodes whose
5476 		 * wrs have been successfully posted. We assume that the
5477 		 * hca driver returns on the first failure to post and
5478 		 * therefore the first 'num_posted' entries don't need
5479 		 * cleanup here.
5480 		 */
5481 		num_posted = 0;
5482 		ibt_status = ibt_post_send(state->id_chnl_hdl,
5483 		    wrs, n_wrs, &num_posted);
5484 		if (ibt_status != IBT_SUCCESS) {
5485 			ibd_print_warn(state, "ibd_post_send: "
5486 			    "posting multiple wrs failed: "
5487 			    "requested=%d, done=%d, ret=%d",
5488 			    n_wrs, num_posted, ibt_status);
5489 
5490 			for (i = num_posted; i < n_wrs; i++)
5491 				ibd_tx_cleanup(state, nodes[i]);
5492 		}
5493 	}
5494 }
5495 
5496 static int
5497 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
5498     uint_t lsohdr_sz)
5499 {
5500 	ibt_wr_ds_t *sgl;
5501 	ibt_status_t ibt_status;
5502 	mblk_t *nmp;
5503 	mblk_t *data_mp;
5504 	uchar_t *bufp;
5505 	size_t blksize;
5506 	size_t skip;
5507 	size_t avail;
5508 	uint_t pktsize;
5509 	uint_t frag_len;
5510 	uint_t pending_hdr;
5511 	int nmblks;
5512 	int i;
5513 
5514 	/*
5515 	 * Let's skip ahead to the data if this is LSO
5516 	 */
5517 	data_mp = mp;
5518 	pending_hdr = 0;
5519 	if (lsohdr_sz) {
5520 		pending_hdr = lsohdr_sz;
5521 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5522 			frag_len = nmp->b_wptr - nmp->b_rptr;
5523 			if (frag_len > pending_hdr)
5524 				break;
5525 			pending_hdr -= frag_len;
5526 		}
5527 		data_mp = nmp;	/* start of data past lso header */
5528 		ASSERT(data_mp != NULL);
5529 	}
5530 
5531 	/*
5532 	 * Calculate the size of message data and number of msg blocks
5533 	 */
5534 	pktsize = 0;
5535 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5536 	    nmp = nmp->b_cont, nmblks++) {
5537 		pktsize += MBLKL(nmp);
5538 	}
5539 	pktsize -= pending_hdr;
5540 
5541 	/*
5542 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5543 	 * "copy-threshold", and if the number of mp fragments is less than
5544 	 * the maximum acceptable.
5545 	 */
5546 	if ((state->id_hca_res_lkey_capab) &&
5547 	    (pktsize > IBD_TX_COPY_THRESH) &&
5548 	    (nmblks < state->id_max_sqseg_hiwm)) {
5549 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5550 		ibt_iov_attr_t iov_attr;
5551 
5552 		iov_attr.iov_as = NULL;
5553 		iov_attr.iov = iov_arr;
5554 		iov_attr.iov_buf = NULL;
5555 		iov_attr.iov_list_len = nmblks;
5556 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5557 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5558 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5559 
5560 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5561 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5562 			iov_arr[i].iov_len = MBLKL(nmp);
5563 			if (i == 0) {
5564 				iov_arr[i].iov_addr += pending_hdr;
5565 				iov_arr[i].iov_len -= pending_hdr;
5566 			}
5567 		}
5568 
5569 		node->w_buftype = IBD_WQE_MAPPED;
5570 		node->w_swr.wr_sgl = node->w_sgl;
5571 
5572 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5573 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5574 		if (ibt_status != IBT_SUCCESS) {
5575 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5576 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5577 			goto ibd_copy_path;
5578 		}
5579 
5580 		return (0);
5581 	}
5582 
5583 ibd_copy_path:
5584 	if (pktsize <= state->id_tx_buf_sz) {
5585 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5586 		node->w_swr.wr_nds = 1;
5587 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5588 		node->w_buftype = IBD_WQE_TXBUF;
5589 
5590 		/*
5591 		 * Even though this is the copy path for transfers less than
5592 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5593 		 * is possible the first data mblk fragment (data_mp) still
5594 		 * contains part of the LSO header that we need to skip.
5595 		 */
5596 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5597 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5598 			blksize = MBLKL(nmp) - pending_hdr;
5599 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5600 			bufp += blksize;
5601 			pending_hdr = 0;
5602 		}
5603 
5604 		return (0);
5605 	}
5606 
5607 	/*
5608 	 * Copy path for transfers greater than id_tx_buf_sz
5609 	 */
5610 	node->w_swr.wr_sgl = node->w_sgl;
5611 	if (ibd_acquire_lsobufs(state, pktsize,
5612 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5613 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5614 		return (-1);
5615 	}
5616 	node->w_buftype = IBD_WQE_LSOBUF;
5617 
5618 	/*
5619 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5620 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5621 	 * need to skip part of the LSO header in the first fragment
5622 	 * as before.
5623 	 */
5624 	nmp = data_mp;
5625 	skip = pending_hdr;
5626 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5627 		sgl = node->w_swr.wr_sgl + i;
5628 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5629 		avail = IBD_LSO_BUFSZ;
5630 		while (nmp && avail) {
5631 			blksize = MBLKL(nmp) - skip;
5632 			if (blksize > avail) {
5633 				bcopy(nmp->b_rptr + skip, bufp, avail);
5634 				skip += avail;
5635 				avail = 0;
5636 			} else {
5637 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5638 				skip = 0;
5639 				avail -= blksize;
5640 				bufp += blksize;
5641 				nmp = nmp->b_cont;
5642 			}
5643 		}
5644 	}
5645 
5646 	return (0);
5647 }
5648 
5649 /*
5650  * Schedule a completion queue polling to reap the resource we're
5651  * short on.  If we implement the change to reap tx completions
5652  * in a separate thread, we'll need to wake up that thread here.
5653  */
5654 static int
5655 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5656 {
5657 	ibd_req_t *req;
5658 
5659 	mutex_enter(&state->id_sched_lock);
5660 	state->id_sched_needed |= resource_type;
5661 	mutex_exit(&state->id_sched_lock);
5662 
5663 	/*
5664 	 * If we are asked to queue a work entry, we need to do it
5665 	 */
5666 	if (q_flag) {
5667 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5668 		if (req == NULL)
5669 			return (-1);
5670 
5671 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5672 	}
5673 
5674 	return (0);
5675 }
5676 
5677 /*
5678  * The passed in packet has this format:
5679  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5680  */
5681 static boolean_t
5682 ibd_send(ibd_state_t *state, mblk_t *mp)
5683 {
5684 	ibd_ace_t *ace;
5685 	ibd_swqe_t *node;
5686 	ipoib_mac_t *dest;
5687 	ib_header_info_t *ipibp;
5688 	ip6_t *ip6h;
5689 	uint_t pktsize;
5690 	uint32_t mss;
5691 	uint32_t hckflags;
5692 	uint32_t lsoflags = 0;
5693 	uint_t lsohdr_sz = 0;
5694 	int ret, len;
5695 	boolean_t dofree = B_FALSE;
5696 	boolean_t rc;
5697 
5698 	/*
5699 	 * If we aren't done with the device initialization and start,
5700 	 * we shouldn't be here.
5701 	 */
5702 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5703 		return (B_FALSE);
5704 
5705 	/*
5706 	 * Obtain an address handle for the destination.
5707 	 */
5708 	ipibp = (ib_header_info_t *)mp->b_rptr;
5709 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5710 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5711 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5712 
5713 	ace = ibd_acache_lookup(state, dest, &ret, 1);
5714 
5715 	mutex_enter(&state->id_tx_list.dl_mutex);
5716 	node = WQE_TO_SWQE(state->id_tx_list.dl_head);
5717 	if (node != NULL) {
5718 		state->id_tx_list.dl_cnt -= 1;
5719 		state->id_tx_list.dl_head = node->swqe_next;
5720 	} else {
5721 		node = ibd_acquire_swqe(state);
5722 	}
5723 	mutex_exit(&state->id_tx_list.dl_mutex);
5724 	if (node == NULL) {
5725 		/*
5726 		 * If we don't have an swqe available, schedule a transmit
5727 		 * completion queue cleanup and hold off on sending more
5728 		 * more packets until we have some free swqes
5729 		 */
5730 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
5731 			return (B_FALSE);
5732 
5733 		/*
5734 		 * If a poll cannot be scheduled, we have no choice but
5735 		 * to drop this packet
5736 		 */
5737 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5738 		return (B_TRUE);
5739 	}
5740 
5741 	/*
5742 	 * Initialize the commonly used fields in swqe to NULL to protect
5743 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5744 	 * failure.
5745 	 */
5746 	node->swqe_im_mblk = NULL;
5747 	node->w_swr.wr_nds = 0;
5748 	node->w_swr.wr_sgl = NULL;
5749 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5750 
5751 	pktsize = msgsize(mp);
5752 
5753 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5754 	atomic_inc_64(&state->id_xmt_pkt);
5755 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5756 		atomic_inc_64(&state->id_brd_xmt);
5757 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5758 		atomic_inc_64(&state->id_multi_xmt);
5759 
5760 	if (ace != NULL) {
5761 		node->w_ahandle = ace;
5762 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5763 	} else {
5764 		DPRINT(5,
5765 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5766 		    ((ret == EFAULT) ? "failed" : "queued"),
5767 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5768 		    htonl(dest->ipoib_gidpref[1]),
5769 		    htonl(dest->ipoib_gidsuff[0]),
5770 		    htonl(dest->ipoib_gidsuff[1]));
5771 		node->w_ahandle = NULL;
5772 
5773 		/*
5774 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5775 		 * can not find a path for the specific dest address. We
5776 		 * should get rid of this kind of packet.  We also should get
5777 		 * rid of the packet if we cannot schedule a poll via the
5778 		 * async thread.  For the normal case, ibd will return the
5779 		 * packet to upper layer and wait for AH creating.
5780 		 *
5781 		 * Note that we always queue a work slot entry for the async
5782 		 * thread when we fail AH lookup (even in intr mode); this is
5783 		 * due to the convoluted way the code currently looks for AH.
5784 		 */
5785 		if (ret == EFAULT) {
5786 			dofree = B_TRUE;
5787 			rc = B_TRUE;
5788 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5789 			dofree = B_TRUE;
5790 			rc = B_TRUE;
5791 		} else {
5792 			dofree = B_FALSE;
5793 			rc = B_FALSE;
5794 		}
5795 		goto ibd_send_fail;
5796 	}
5797 
5798 	/*
5799 	 * For ND6 packets, padding is at the front of the source lladdr.
5800 	 * Insert the padding at front.
5801 	 */
5802 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
5803 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5804 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5805 			    sizeof (ib_header_info_t))) {
5806 				DPRINT(10, "ibd_send: pullupmsg failure ");
5807 				dofree = B_TRUE;
5808 				rc = B_TRUE;
5809 				goto ibd_send_fail;
5810 			}
5811 			ipibp = (ib_header_info_t *)mp->b_rptr;
5812 		}
5813 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5814 		    sizeof (ib_header_info_t));
5815 		len = ntohs(ip6h->ip6_plen);
5816 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5817 			mblk_t	*pad;
5818 
5819 			pad = allocb(4, 0);
5820 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5821 			linkb(mp, pad);
5822 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5823 			    IPV6_HDR_LEN + len + 4) {
5824 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5825 				    IPV6_HDR_LEN + len + 4)) {
5826 					DPRINT(10, "ibd_send: pullupmsg "
5827 					    "failure ");
5828 					dofree = B_TRUE;
5829 					rc = B_TRUE;
5830 					goto ibd_send_fail;
5831 				}
5832 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5833 				    sizeof (ib_header_info_t));
5834 			}
5835 
5836 			/* LINTED: E_CONSTANT_CONDITION */
5837 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5838 		}
5839 	}
5840 
5841 	mp->b_rptr += sizeof (ib_addrs_t);
5842 
5843 	/*
5844 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5845 	 * ud destination, the opcode and the LSO header information to the
5846 	 * work request.
5847 	 */
5848 	lso_info_get(mp, &mss, &lsoflags);
5849 	if ((lsoflags & HW_LSO) != HW_LSO) {
5850 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5851 		lsohdr_sz = 0;
5852 	} else {
5853 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5854 			/*
5855 			 * The routine can only fail if there's no memory; we
5856 			 * can only drop the packet if this happens
5857 			 */
5858 			ibd_print_warn(state,
5859 			    "ibd_send: no memory, lso posting failed");
5860 			dofree = B_TRUE;
5861 			rc = B_TRUE;
5862 			goto ibd_send_fail;
5863 		}
5864 
5865 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5866 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5867 	}
5868 
5869 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5870 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5871 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5872 	else
5873 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5874 
5875 	/*
5876 	 * Prepare the sgl for posting; the routine can only fail if there's
5877 	 * no lso buf available for posting. If this is the case, we should
5878 	 * probably resched for lso bufs to become available and then try again.
5879 	 */
5880 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5881 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5882 			dofree = B_TRUE;
5883 			rc = B_TRUE;
5884 		} else {
5885 			dofree = B_FALSE;
5886 			rc = B_FALSE;
5887 		}
5888 		goto ibd_send_fail;
5889 	}
5890 	node->swqe_im_mblk = mp;
5891 
5892 	/*
5893 	 * Queue the wqe to hardware; since we can now simply queue a
5894 	 * post instead of doing it serially, we cannot assume anything
5895 	 * about the 'node' after ibd_post_send() returns.
5896 	 */
5897 	node->swqe_next = NULL;
5898 
5899 	mutex_enter(&state->id_txpost_lock);
5900 	if (state->id_tx_busy) {
5901 		if (state->id_tx_head) {
5902 			state->id_tx_tail->swqe_next =
5903 			    SWQE_TO_WQE(node);
5904 		} else {
5905 			state->id_tx_head = node;
5906 		}
5907 		state->id_tx_tail = node;
5908 		mutex_exit(&state->id_txpost_lock);
5909 	} else {
5910 		state->id_tx_busy = 1;
5911 		mutex_exit(&state->id_txpost_lock);
5912 		ibd_post_send(state, node);
5913 	}
5914 
5915 	return (B_TRUE);
5916 
5917 ibd_send_fail:
5918 	if (node && mp)
5919 		ibd_free_lsohdr(node, mp);
5920 
5921 	if (dofree)
5922 		freemsg(mp);
5923 
5924 	if (node != NULL)
5925 		ibd_tx_cleanup(state, node);
5926 
5927 	return (rc);
5928 }
5929 
5930 /*
5931  * GLDv3 entry point for transmitting datagram.
5932  */
5933 static mblk_t *
5934 ibd_m_tx(void *arg, mblk_t *mp)
5935 {
5936 	ibd_state_t *state = (ibd_state_t *)arg;
5937 	mblk_t *next;
5938 
5939 	if (state->id_link_state != LINK_STATE_UP) {
5940 		freemsgchain(mp);
5941 		mp = NULL;
5942 	}
5943 
5944 	while (mp != NULL) {
5945 		next = mp->b_next;
5946 		mp->b_next = NULL;
5947 		if (ibd_send(state, mp) == B_FALSE) {
5948 			/* Send fail */
5949 			mp->b_next = next;
5950 			break;
5951 		}
5952 		mp = next;
5953 	}
5954 
5955 	return (mp);
5956 }
5957 
5958 /*
5959  * this handles Tx and Rx completions. With separate CQs, this handles
5960  * only Rx completions.
5961  */
5962 static uint_t
5963 ibd_intr(caddr_t arg)
5964 {
5965 	ibd_state_t *state = (ibd_state_t *)arg;
5966 
5967 	ibd_poll_rcq(state, state->id_rcq_hdl);
5968 
5969 	return (DDI_INTR_CLAIMED);
5970 }
5971 
5972 /*
5973  * Poll and fully drain the send cq
5974  */
5975 static void
5976 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5977 {
5978 	ibt_wc_t *wcs = state->id_txwcs;
5979 	uint_t numwcs = state->id_txwcs_size;
5980 	ibd_wqe_t *wqe;
5981 	ibd_swqe_t *head, *tail;
5982 	ibt_wc_t *wc;
5983 	uint_t num_polled;
5984 	int i;
5985 
5986 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5987 		head = tail = NULL;
5988 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5989 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5990 			if (wc->wc_status != IBT_WC_SUCCESS) {
5991 				/*
5992 				 * Channel being torn down.
5993 				 */
5994 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5995 					DPRINT(5, "ibd_drain_scq: flush error");
5996 					DPRINT(10, "ibd_drain_scq: Bad "
5997 					    "status %d", wc->wc_status);
5998 				} else {
5999 					DPRINT(10, "ibd_drain_scq: "
6000 					    "unexpected wc_status %d",
6001 					    wc->wc_status);
6002 				}
6003 				/*
6004 				 * Fallthrough to invoke the Tx handler to
6005 				 * release held resources, e.g., AH refcount.
6006 				 */
6007 			}
6008 			/*
6009 			 * Add this swqe to the list to be cleaned up.
6010 			 */
6011 			if (head)
6012 				tail->swqe_next = wqe;
6013 			else
6014 				head = WQE_TO_SWQE(wqe);
6015 			tail = WQE_TO_SWQE(wqe);
6016 		}
6017 		tail->swqe_next = NULL;
6018 		ibd_tx_cleanup_list(state, head, tail);
6019 
6020 		/*
6021 		 * Resume any blocked transmissions if possible
6022 		 */
6023 		ibd_resume_transmission(state);
6024 	}
6025 }
6026 
6027 /*
6028  * Poll and fully drain the receive cq
6029  */
6030 static void
6031 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6032 {
6033 	ibt_wc_t *wcs = state->id_rxwcs;
6034 	uint_t numwcs = state->id_rxwcs_size;
6035 	ibd_rwqe_t *rwqe;
6036 	ibt_wc_t *wc;
6037 	uint_t num_polled;
6038 	int i;
6039 	mblk_t *head, *tail, *mp;
6040 
6041 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
6042 		head = tail = NULL;
6043 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
6044 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
6045 			if (wc->wc_status != IBT_WC_SUCCESS) {
6046 				/*
6047 				 * Channel being torn down.
6048 				 */
6049 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
6050 					DPRINT(5, "ibd_drain_rcq: "
6051 					    "expected flushed rwqe");
6052 				} else {
6053 					DPRINT(5, "ibd_drain_rcq: "
6054 					    "unexpected wc_status %d",
6055 					    wc->wc_status);
6056 				}
6057 				atomic_inc_32(
6058 				    &state->id_rx_list.dl_bufs_outstanding);
6059 				freemsg(rwqe->rwqe_im_mblk);
6060 				continue;
6061 			}
6062 			mp = ibd_process_rx(state, rwqe, wc);
6063 			if (mp == NULL)
6064 				continue;
6065 
6066 			/*
6067 			 * Add this mp to the list to send to the nw layer.
6068 			 */
6069 			if (head)
6070 				tail->b_next = mp;
6071 			else
6072 				head = mp;
6073 			tail = mp;
6074 		}
6075 		if (head)
6076 			mac_rx(state->id_mh, state->id_rh, head);
6077 
6078 		/*
6079 		 * Account for #rwqes polled.
6080 		 * Post more here, if less than one fourth full.
6081 		 */
6082 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
6083 		    (state->id_num_rwqe / 4))
6084 			ibd_post_recv_intr(state);
6085 	}
6086 }
6087 
6088 /*
6089  * Common code for interrupt handling as well as for polling
6090  * for all completed wqe's while detaching.
6091  */
6092 static void
6093 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6094 {
6095 	int flag, redo_flag;
6096 	int redo = 1;
6097 
6098 	flag = IBD_CQ_POLLING;
6099 	redo_flag = IBD_REDO_CQ_POLLING;
6100 
6101 	mutex_enter(&state->id_scq_poll_lock);
6102 	if (state->id_scq_poll_busy & flag) {
6103 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
6104 		state->id_scq_poll_busy |= redo_flag;
6105 		mutex_exit(&state->id_scq_poll_lock);
6106 		return;
6107 	}
6108 	state->id_scq_poll_busy |= flag;
6109 	mutex_exit(&state->id_scq_poll_lock);
6110 
6111 	/*
6112 	 * In some cases (eg detaching), this code can be invoked on
6113 	 * any cpu after disabling cq notification (thus no concurrency
6114 	 * exists). Apart from that, the following applies normally:
6115 	 * Transmit completion handling could be from any cpu if
6116 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
6117 	 * is interrupt driven.
6118 	 */
6119 
6120 	/*
6121 	 * Poll and drain the CQ
6122 	 */
6123 	ibd_drain_scq(state, cq_hdl);
6124 
6125 	/*
6126 	 * Enable CQ notifications and redrain the cq to catch any
6127 	 * completions we might have missed after the ibd_drain_scq()
6128 	 * above and before the ibt_enable_cq_notify() that follows.
6129 	 * Finally, service any new requests to poll the cq that
6130 	 * could've come in after the ibt_enable_cq_notify().
6131 	 */
6132 	do {
6133 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
6134 		    IBT_SUCCESS) {
6135 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6136 		}
6137 
6138 		ibd_drain_scq(state, cq_hdl);
6139 
6140 		mutex_enter(&state->id_scq_poll_lock);
6141 		if (state->id_scq_poll_busy & redo_flag)
6142 			state->id_scq_poll_busy &= ~redo_flag;
6143 		else {
6144 			state->id_scq_poll_busy &= ~flag;
6145 			redo = 0;
6146 		}
6147 		mutex_exit(&state->id_scq_poll_lock);
6148 
6149 	} while (redo);
6150 }
6151 
6152 /*
6153  * Common code for interrupt handling as well as for polling
6154  * for all completed wqe's while detaching.
6155  */
6156 static void
6157 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
6158 {
6159 	int flag, redo_flag;
6160 	int redo = 1;
6161 
6162 	flag = IBD_CQ_POLLING;
6163 	redo_flag = IBD_REDO_CQ_POLLING;
6164 
6165 	mutex_enter(&state->id_rcq_poll_lock);
6166 	if (state->id_rcq_poll_busy & flag) {
6167 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
6168 		state->id_rcq_poll_busy |= redo_flag;
6169 		mutex_exit(&state->id_rcq_poll_lock);
6170 		return;
6171 	}
6172 	state->id_rcq_poll_busy |= flag;
6173 	mutex_exit(&state->id_rcq_poll_lock);
6174 
6175 	/*
6176 	 * Poll and drain the CQ
6177 	 */
6178 	ibd_drain_rcq(state, rcq);
6179 
6180 	/*
6181 	 * Enable CQ notifications and redrain the cq to catch any
6182 	 * completions we might have missed after the ibd_drain_cq()
6183 	 * above and before the ibt_enable_cq_notify() that follows.
6184 	 * Finally, service any new requests to poll the cq that
6185 	 * could've come in after the ibt_enable_cq_notify().
6186 	 */
6187 	do {
6188 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
6189 		    IBT_SUCCESS) {
6190 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6191 		}
6192 
6193 		ibd_drain_rcq(state, rcq);
6194 
6195 		mutex_enter(&state->id_rcq_poll_lock);
6196 		if (state->id_rcq_poll_busy & redo_flag)
6197 			state->id_rcq_poll_busy &= ~redo_flag;
6198 		else {
6199 			state->id_rcq_poll_busy &= ~flag;
6200 			redo = 0;
6201 		}
6202 		mutex_exit(&state->id_rcq_poll_lock);
6203 
6204 	} while (redo);
6205 }
6206 
6207 /*
6208  * Unmap the memory area associated with a given swqe.
6209  */
6210 static void
6211 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
6212 {
6213 	ibt_status_t stat;
6214 
6215 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
6216 
6217 	if (swqe->w_mi_hdl) {
6218 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
6219 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
6220 			DPRINT(10,
6221 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
6222 		}
6223 		swqe->w_mi_hdl = NULL;
6224 	}
6225 	swqe->w_swr.wr_nds = 0;
6226 }
6227 
6228 static void
6229 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
6230 {
6231 	/*
6232 	 * The recycling logic can be eliminated from here
6233 	 * and put into the async thread if we create another
6234 	 * list to hold ACE's for unjoined mcg's.
6235 	 */
6236 	if (DEC_REF_DO_CYCLE(ace)) {
6237 		ibd_mce_t *mce;
6238 
6239 		/*
6240 		 * Check with the lock taken: we decremented
6241 		 * reference count without the lock, and some
6242 		 * transmitter might already have bumped the
6243 		 * reference count (possible in case of multicast
6244 		 * disable when we leave the AH on the active
6245 		 * list). If not still 0, get out, leaving the
6246 		 * recycle bit intact.
6247 		 *
6248 		 * Atomically transition the AH from active
6249 		 * to free list, and queue a work request to
6250 		 * leave the group and destroy the mce. No
6251 		 * transmitter can be looking at the AH or
6252 		 * the MCE in between, since we have the
6253 		 * ac_mutex lock. In the SendOnly reap case,
6254 		 * it is not necessary to hold the ac_mutex
6255 		 * and recheck the ref count (since the AH was
6256 		 * taken off the active list), we just do it
6257 		 * to have uniform processing with the Full
6258 		 * reap case.
6259 		 */
6260 		mutex_enter(&state->id_ac_mutex);
6261 		mce = ace->ac_mce;
6262 		if (GET_REF_CYCLE(ace) == 0) {
6263 			CLEAR_REFCYCLE(ace);
6264 			/*
6265 			 * Identify the case of fullmember reap as
6266 			 * opposed to mcg trap reap. Also, port up
6267 			 * might set ac_mce to NULL to indicate Tx
6268 			 * cleanup should do no more than put the
6269 			 * AH in the free list (see ibd_async_link).
6270 			 */
6271 			if (mce != NULL) {
6272 				ace->ac_mce = NULL;
6273 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6274 				/*
6275 				 * mc_req was initialized at mce
6276 				 * creation time.
6277 				 */
6278 				ibd_queue_work_slot(state,
6279 				    &mce->mc_req, IBD_ASYNC_REAP);
6280 			}
6281 			IBD_ACACHE_INSERT_FREE(state, ace);
6282 		}
6283 		mutex_exit(&state->id_ac_mutex);
6284 	}
6285 }
6286 
6287 /*
6288  * Common code that deals with clean ups after a successful or
6289  * erroneous transmission attempt.
6290  */
6291 static void
6292 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
6293 {
6294 	ibd_ace_t *ace = swqe->w_ahandle;
6295 
6296 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
6297 
6298 	/*
6299 	 * If this was a dynamic mapping in ibd_send(), we need to
6300 	 * unmap here. If this was an lso buffer we'd used for sending,
6301 	 * we need to release the lso buf to the pool, since the resource
6302 	 * is scarce. However, if this was simply a normal send using
6303 	 * the copybuf (present in each swqe), we don't need to release it.
6304 	 */
6305 	if (swqe->swqe_im_mblk != NULL) {
6306 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
6307 			ibd_unmap_mem(state, swqe);
6308 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6309 			ibd_release_lsobufs(state,
6310 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6311 		}
6312 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6313 		freemsg(swqe->swqe_im_mblk);
6314 		swqe->swqe_im_mblk = NULL;
6315 	}
6316 
6317 	/*
6318 	 * Drop the reference count on the AH; it can be reused
6319 	 * now for a different destination if there are no more
6320 	 * posted sends that will use it. This can be eliminated
6321 	 * if we can always associate each Tx buffer with an AH.
6322 	 * The ace can be null if we are cleaning up from the
6323 	 * ibd_send() error path.
6324 	 */
6325 	if (ace != NULL) {
6326 		ibd_dec_ref_ace(state, ace);
6327 	}
6328 
6329 	/*
6330 	 * Release the send wqe for reuse.
6331 	 */
6332 	swqe->swqe_next = NULL;
6333 	ibd_release_swqe(state, swqe, swqe, 1);
6334 }
6335 
6336 static void
6337 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
6338 {
6339 	ibd_ace_t *ace;
6340 	ibd_swqe_t *swqe;
6341 	int n = 0;
6342 
6343 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
6344 
6345 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
6346 
6347 		/*
6348 		 * If this was a dynamic mapping in ibd_send(), we need to
6349 		 * unmap here. If this was an lso buffer we'd used for sending,
6350 		 * we need to release the lso buf to the pool, since the
6351 		 * resource is scarce. However, if this was simply a normal
6352 		 * send using the copybuf (present in each swqe), we don't need
6353 		 * to release it.
6354 		 */
6355 		if (swqe->swqe_im_mblk != NULL) {
6356 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
6357 				ibd_unmap_mem(state, swqe);
6358 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6359 				ibd_release_lsobufs(state,
6360 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6361 			}
6362 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6363 			freemsg(swqe->swqe_im_mblk);
6364 			swqe->swqe_im_mblk = NULL;
6365 		}
6366 
6367 		/*
6368 		 * Drop the reference count on the AH; it can be reused
6369 		 * now for a different destination if there are no more
6370 		 * posted sends that will use it. This can be eliminated
6371 		 * if we can always associate each Tx buffer with an AH.
6372 		 * The ace can be null if we are cleaning up from the
6373 		 * ibd_send() error path.
6374 		 */
6375 		ace = swqe->w_ahandle;
6376 		if (ace != NULL) {
6377 			ibd_dec_ref_ace(state, ace);
6378 		}
6379 		n++;
6380 	}
6381 
6382 	/*
6383 	 * Release the send wqes for reuse.
6384 	 */
6385 	ibd_release_swqe(state, head, tail, n);
6386 }
6387 
6388 /*
6389  * Processing to be done after receipt of a packet; hand off to GLD
6390  * in the format expected by GLD.  The received packet has this
6391  * format: 2b sap :: 00 :: data.
6392  */
6393 static mblk_t *
6394 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6395 {
6396 	ib_header_info_t *phdr;
6397 	mblk_t *mp;
6398 	ipoib_hdr_t *ipibp;
6399 	ipha_t *iphap;
6400 	ip6_t *ip6h;
6401 	int len;
6402 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
6403 	uint32_t bufs;
6404 
6405 	/*
6406 	 * Track number handed to upper layer that need to be returned.
6407 	 */
6408 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
6409 
6410 	/* Never run out of rwqes, use allocb when running low */
6411 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
6412 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
6413 		atomic_inc_32(&state->id_rx_allocb);
6414 		mp = allocb(pkt_len, BPRI_HI);
6415 		if (mp) {
6416 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
6417 			ibd_post_recv(state, rwqe);
6418 		} else {	/* no memory */
6419 			atomic_inc_32(&state->id_rx_allocb_failed);
6420 			ibd_post_recv(state, rwqe);
6421 			return (NULL);
6422 		}
6423 	} else {
6424 		mp = rwqe->rwqe_im_mblk;
6425 	}
6426 
6427 
6428 	/*
6429 	 * Adjust write pointer depending on how much data came in.
6430 	 */
6431 	mp->b_wptr = mp->b_rptr + pkt_len;
6432 
6433 	/*
6434 	 * Make sure this is NULL or we're in trouble.
6435 	 */
6436 	if (mp->b_next != NULL) {
6437 		ibd_print_warn(state,
6438 		    "ibd_process_rx: got duplicate mp from rcq?");
6439 		mp->b_next = NULL;
6440 	}
6441 
6442 	/*
6443 	 * the IB link will deliver one of the IB link layer
6444 	 * headers called, the Global Routing Header (GRH).
6445 	 * ibd driver uses the information in GRH to build the
6446 	 * Header_info structure and pass it with the datagram up
6447 	 * to GLDv3.
6448 	 * If the GRH is not valid, indicate to GLDv3 by setting
6449 	 * the VerTcFlow field to 0.
6450 	 */
6451 	phdr = (ib_header_info_t *)mp->b_rptr;
6452 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6453 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6454 
6455 		/* if it is loop back packet, just drop it. */
6456 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6457 		    IPOIB_ADDRL) == 0) {
6458 			freemsg(mp);
6459 			return (NULL);
6460 		}
6461 
6462 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6463 		    sizeof (ipoib_mac_t));
6464 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6465 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6466 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6467 		} else {
6468 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6469 		}
6470 	} else {
6471 		/*
6472 		 * It can not be a IBA multicast packet. Must have been
6473 		 * unicast for us. Just copy the interface address to dst.
6474 		 */
6475 		phdr->ib_grh.ipoib_vertcflow = 0;
6476 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6477 		    sizeof (ipoib_mac_t));
6478 	}
6479 
6480 	/*
6481 	 * For ND6 packets, padding is at the front of the source/target
6482 	 * lladdr. However the inet6 layer is not aware of it, hence remove
6483 	 * the padding from such packets.
6484 	 */
6485 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6486 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
6487 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6488 		len = ntohs(ip6h->ip6_plen);
6489 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6490 			/* LINTED: E_CONSTANT_CONDITION */
6491 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6492 		}
6493 	}
6494 
6495 	/*
6496 	 * Update statistics
6497 	 */
6498 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
6499 	atomic_inc_64(&state->id_rcv_pkt);
6500 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6501 		atomic_inc_64(&state->id_brd_rcv);
6502 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6503 		atomic_inc_64(&state->id_multi_rcv);
6504 
6505 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6506 	/*
6507 	 * Set receive checksum status in mp
6508 	 * Hardware checksumming can be considered valid only if:
6509 	 * 1. CQE.IP_OK bit is set
6510 	 * 2. CQE.CKSUM = 0xffff
6511 	 * 3. IPv6 routing header is not present in the packet
6512 	 * 4. If there are no IP_OPTIONS in the IP HEADER
6513 	 */
6514 
6515 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6516 	    (wc->wc_cksum == 0xFFFF) &&
6517 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6518 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6519 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6520 	}
6521 
6522 	return (mp);
6523 }
6524 
6525 /*
6526  * Callback code invoked from STREAMs when the receive data buffer is
6527  * free for recycling.
6528  */
6529 static void
6530 ibd_freemsg_cb(char *arg)
6531 {
6532 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6533 	ibd_state_t *state = rwqe->w_state;
6534 
6535 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
6536 
6537 	/*
6538 	 * If the driver is stopped, just free the rwqe.
6539 	 */
6540 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
6541 		DPRINT(6, "ibd_freemsg: wqe being freed");
6542 		rwqe->rwqe_im_mblk = NULL;
6543 		ibd_free_rwqe(state, rwqe);
6544 		return;
6545 	}
6546 
6547 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6548 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6549 	if (rwqe->rwqe_im_mblk == NULL) {
6550 		ibd_free_rwqe(state, rwqe);
6551 		DPRINT(6, "ibd_freemsg: desballoc failed");
6552 		return;
6553 	}
6554 
6555 	ibd_post_recv(state, rwqe);
6556 }
6557 
6558 static uint_t
6559 ibd_tx_recycle(caddr_t arg)
6560 {
6561 	ibd_state_t *state = (ibd_state_t *)arg;
6562 
6563 	/*
6564 	 * Poll for completed entries
6565 	 */
6566 	ibd_poll_scq(state, state->id_scq_hdl);
6567 
6568 	return (DDI_INTR_CLAIMED);
6569 }
6570 
6571 #ifdef IBD_LOGGING
6572 static void
6573 ibd_log_init(void)
6574 {
6575 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
6576 	ibd_lbuf_ndx = 0;
6577 
6578 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
6579 }
6580 
6581 static void
6582 ibd_log_fini(void)
6583 {
6584 	if (ibd_lbuf)
6585 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
6586 	ibd_lbuf_ndx = 0;
6587 	ibd_lbuf = NULL;
6588 
6589 	mutex_destroy(&ibd_lbuf_lock);
6590 }
6591 
6592 static void
6593 ibd_log(const char *fmt, ...)
6594 {
6595 	va_list	ap;
6596 	uint32_t off;
6597 	uint32_t msglen;
6598 	char tmpbuf[IBD_DMAX_LINE];
6599 
6600 	if (ibd_lbuf == NULL)
6601 		return;
6602 
6603 	va_start(ap, fmt);
6604 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
6605 	va_end(ap);
6606 
6607 	if (msglen >= IBD_DMAX_LINE)
6608 		msglen = IBD_DMAX_LINE - 1;
6609 
6610 	mutex_enter(&ibd_lbuf_lock);
6611 
6612 	off = ibd_lbuf_ndx;		/* current msg should go here */
6613 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
6614 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
6615 
6616 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
6617 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
6618 
6619 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
6620 		ibd_lbuf_ndx = 0;
6621 
6622 	mutex_exit(&ibd_lbuf_lock);
6623 
6624 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
6625 }
6626 #endif
6627