xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision bd2ee4f4d736b3a98de7cb84206a8cd8d65ccdda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
54 #include <inet/ip6.h>		/* for ip6_t */
55 #include <inet/tcp.h>		/* for tcph_t */
56 #include <netinet/icmp6.h>	/* for icmp6_t */
57 #include <sys/callb.h>
58 #include <sys/modhash.h>
59 
60 #include <sys/ib/clients/ibd/ibd.h>
61 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
62 #include <sys/note.h>
63 #include <sys/multidata.h>
64 
65 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
66 
67 /*
68  * Per-interface tunables
69  *
70  * ibd_tx_copy_thresh
71  *     This sets the threshold at which ibd will attempt to do a bcopy of the
72  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
73  *     is restricted by various parameters, so setting of this value must be
74  *     made after careful considerations only.  For instance, IB HCAs currently
75  *     impose a relatively small limit (when compared to ethernet NICs) on the
76  *     length of the SGL for transmit. On the other hand, the ip stack could
77  *     send down mp chains that are quite long when LSO is enabled.
78  *
79  * ibd_num_swqe
80  *     Number of "send WQE" elements that will be allocated and used by ibd.
81  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
82  *     buffer in each of these send wqes must be taken into account. This
83  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
84  *     currently set to the same value of ibd_tx_copy_thresh, but may be
85  *     changed independently if needed).
86  *
87  * ibd_num_rwqe
88  *     Number of "receive WQE" elements that will be allocated and used by
89  *     ibd. This parameter is limited by the maximum channel size of the HCA.
90  *     Each buffer in the receive wqe will be of MTU size.
91  *
92  * ibd_num_lso_bufs
93  *     Number of "larger-than-MTU" copy buffers to use for cases when the
94  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
95  *     and too large to be used with regular MTU-sized copy buffers. It is
96  *     not recommended to tune this variable without understanding the
97  *     application environment and/or memory resources. The size of each of
98  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
99  *
100  * ibd_num_ah
101  *     Number of AH cache entries to allocate
102  *
103  * ibd_hash_size
104  *     Hash table size for the active AH list
105  *
106  * ibd_separate_cqs
107  * ibd_txcomp_poll
108  *     These boolean variables (1 or 0) may be used to tune the behavior of
109  *     ibd in managing the send and receive completion queues and in deciding
110  *     whether or not transmit completions should be polled or interrupt
111  *     driven (when the completion queues are separate). If both the completion
112  *     queues are interrupt driven, it may not be possible for the handlers to
113  *     be invoked concurrently, depending on how the interrupts are tied on
114  *     the PCI intr line.  Note that some combination of these two parameters
115  *     may not be meaningful (and therefore not allowed).
116  *
117  * ibd_tx_softintr
118  * ibd_rx_softintr
119  *     The softintr mechanism allows ibd to avoid event queue overflows if
120  *     the receive/completion handlers are to be expensive. These are enabled
121  *     by default.
122  *
123  * ibd_log_sz
124  *     This specifies the size of the ibd log buffer in bytes. The buffer is
125  *     allocated and logging is enabled only when IBD_LOGGING is defined.
126  *
127  */
128 uint_t ibd_tx_copy_thresh = 0x1000;
129 uint_t ibd_num_swqe = 4000;
130 uint_t ibd_num_rwqe = 4000;
131 uint_t ibd_num_lso_bufs = 0x400;
132 uint_t ibd_num_ah = 64;
133 uint_t ibd_hash_size = 32;
134 uint_t ibd_separate_cqs = 1;
135 uint_t ibd_txcomp_poll = 0;
136 uint_t ibd_rx_softintr = 1;
137 uint_t ibd_tx_softintr = 1;
138 uint_t ibd_create_broadcast_group = 1;
139 #ifdef IBD_LOGGING
140 uint_t ibd_log_sz = 0x20000;
141 #endif
142 
143 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
144 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
145 #define	IBD_NUM_SWQE			ibd_num_swqe
146 #define	IBD_NUM_RWQE			ibd_num_rwqe
147 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
148 #define	IBD_NUM_AH			ibd_num_ah
149 #define	IBD_HASH_SIZE			ibd_hash_size
150 #ifdef IBD_LOGGING
151 #define	IBD_LOG_SZ			ibd_log_sz
152 #endif
153 
154 /*
155  * Receive CQ moderation parameters: NOT tunables
156  */
157 static uint_t ibd_rxcomp_count = 4;
158 static uint_t ibd_rxcomp_usec = 10;
159 
160 /*
161  * Send CQ moderation parameters: NOT tunables
162  */
163 #define	IBD_TXCOMP_COUNT		10
164 #define	IBD_TXCOMP_USEC			300
165 
166 /*
167  * Thresholds
168  *
169  * When waiting for resources (swqes or lso buffers) to become available,
170  * the first two thresholds below determine how long to wait before informing
171  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
172  * determines how low the available swqes should go before we start polling
173  * the completion queue.
174  */
175 #define	IBD_FREE_LSOS_THRESH		8
176 #define	IBD_FREE_SWQES_THRESH		20
177 #define	IBD_TX_POLL_THRESH		80
178 
179 /*
180  * When doing multiple-send-wr or multiple-recv-wr posts, this value
181  * determines how many to do at a time (in a single ibt_post_send/recv).
182  */
183 #define	IBD_MAX_POST_MULTIPLE		4
184 
185 /*
186  * Maximum length for returning chained mps back to crossbow
187  */
188 #define	IBD_MAX_RX_MP_LEN		16
189 
190 /*
191  * LSO parameters
192  */
193 #define	IBD_LSO_MAXLEN			65536
194 #define	IBD_LSO_BUFSZ			8192
195 #define	IBD_PROP_LSO_POLICY		"lso-policy"
196 
197 /*
198  * Completion queue polling control
199  */
200 #define	IBD_RX_CQ_POLLING		0x1
201 #define	IBD_TX_CQ_POLLING		0x2
202 #define	IBD_REDO_RX_CQ_POLLING		0x4
203 #define	IBD_REDO_TX_CQ_POLLING		0x8
204 
205 /*
206  * Flag bits for resources to reap
207  */
208 #define	IBD_RSRC_SWQE			0x1
209 #define	IBD_RSRC_LSOBUF			0x2
210 
211 /*
212  * Async operation types
213  */
214 #define	IBD_ASYNC_GETAH			1
215 #define	IBD_ASYNC_JOIN			2
216 #define	IBD_ASYNC_LEAVE			3
217 #define	IBD_ASYNC_PROMON		4
218 #define	IBD_ASYNC_PROMOFF		5
219 #define	IBD_ASYNC_REAP			6
220 #define	IBD_ASYNC_TRAP			7
221 #define	IBD_ASYNC_SCHED			8
222 #define	IBD_ASYNC_LINK			9
223 #define	IBD_ASYNC_EXIT			10
224 
225 /*
226  * Async operation states
227  */
228 #define	IBD_OP_NOTSTARTED		0
229 #define	IBD_OP_ONGOING			1
230 #define	IBD_OP_COMPLETED		2
231 #define	IBD_OP_ERRORED			3
232 #define	IBD_OP_ROUTERED			4
233 
234 /*
235  * State of IBD driver initialization during attach/m_start
236  */
237 #define	IBD_DRV_STATE_INITIALIZED	0x00001
238 #define	IBD_DRV_RXINTR_ADDED		0x00002
239 #define	IBD_DRV_TXINTR_ADDED		0x00004
240 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
241 #define	IBD_DRV_HCA_OPENED		0x00010
242 #define	IBD_DRV_PD_ALLOCD		0x00020
243 #define	IBD_DRV_MAC_REGISTERED		0x00040
244 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
245 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
246 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
247 #define	IBD_DRV_CQS_ALLOCD		0x00400
248 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
249 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
250 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
251 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
252 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
253 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
254 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
255 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
256 #define	IBD_DRV_STARTED			0x80000
257 
258 /*
259  * Miscellaneous constants
260  */
261 #define	IBD_SEND			0
262 #define	IBD_RECV			1
263 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
264 #define	IBD_DEF_MAX_SDU			2044
265 #define	IBD_DEFAULT_QKEY		0xB1B
266 #ifdef IBD_LOGGING
267 #define	IBD_DMAX_LINE			100
268 #endif
269 
270 /*
271  * Enumerations for link states
272  */
273 typedef enum {
274 	IBD_LINK_DOWN,
275 	IBD_LINK_UP,
276 	IBD_LINK_UP_ABSENT
277 } ibd_link_op_t;
278 
279 /*
280  * Driver State Pointer
281  */
282 void *ibd_list;
283 
284 /*
285  * Logging
286  */
287 #ifdef IBD_LOGGING
288 kmutex_t ibd_lbuf_lock;
289 uint8_t *ibd_lbuf;
290 uint32_t ibd_lbuf_ndx;
291 #endif
292 
293 /*
294  * Required system entry points
295  */
296 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
297 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
298 
299 /*
300  * Required driver entry points for GLDv3
301  */
302 static int ibd_m_stat(void *, uint_t, uint64_t *);
303 static int ibd_m_start(void *);
304 static void ibd_m_stop(void *);
305 static int ibd_m_promisc(void *, boolean_t);
306 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
307 static int ibd_m_unicst(void *, const uint8_t *);
308 static mblk_t *ibd_m_tx(void *, mblk_t *);
309 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
310 
311 /*
312  * Private driver entry points for GLDv3
313  */
314 
315 /*
316  * Initialization
317  */
318 static int ibd_state_init(ibd_state_t *, dev_info_t *);
319 static int ibd_init_txlist(ibd_state_t *);
320 static int ibd_init_rxlist(ibd_state_t *);
321 static int ibd_acache_init(ibd_state_t *);
322 #ifdef IBD_LOGGING
323 static void ibd_log_init(void);
324 #endif
325 
326 /*
327  * Termination/cleanup
328  */
329 static void ibd_state_fini(ibd_state_t *);
330 static void ibd_fini_txlist(ibd_state_t *);
331 static void ibd_fini_rxlist(ibd_state_t *);
332 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
333 static void ibd_acache_fini(ibd_state_t *);
334 #ifdef IBD_LOGGING
335 static void ibd_log_fini(void);
336 #endif
337 
338 /*
339  * Allocation/acquire/map routines
340  */
341 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
342 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
343 static int ibd_alloc_tx_copybufs(ibd_state_t *);
344 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
345 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
346 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
347     uint32_t *);
348 
349 /*
350  * Free/release/unmap routines
351  */
352 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
353 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
354 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
355 static void ibd_free_tx_copybufs(ibd_state_t *);
356 static void ibd_free_tx_lsobufs(ibd_state_t *);
357 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
358 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
359 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
360 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
361 
362 /*
363  * Handlers/callback routines
364  */
365 static uint_t ibd_intr(char *);
366 static uint_t ibd_tx_recycle(char *);
367 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
368 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
369 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
370 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
371 static void ibd_freemsg_cb(char *);
372 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
373     ibt_async_event_t *);
374 static void ibd_snet_notices_handler(void *, ib_gid_t,
375     ibt_subnet_event_code_t, ibt_subnet_event_t *);
376 
377 /*
378  * Send/receive routines
379  */
380 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
381 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
382 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
383 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
384 static void ibd_flush_rx(ibd_state_t *, mblk_t *);
385 
386 /*
387  * Threads
388  */
389 static void ibd_async_work(ibd_state_t *);
390 
391 /*
392  * Async tasks
393  */
394 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
395 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
396 static void ibd_async_setprom(ibd_state_t *);
397 static void ibd_async_unsetprom(ibd_state_t *);
398 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
399 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
400 static void ibd_async_txsched(ibd_state_t *);
401 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
402 
403 /*
404  * Async task helpers
405  */
406 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
407 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
408 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
409 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
410     ipoib_mac_t *, ipoib_mac_t *);
411 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
412 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
413 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
414 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
415 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
416 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
417 static uint64_t ibd_get_portspeed(ibd_state_t *);
418 static boolean_t ibd_async_safe(ibd_state_t *);
419 static void ibd_async_done(ibd_state_t *);
420 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
421 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
422 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
423 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
424 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
425 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
426 
427 /*
428  * Helpers for attach/start routines
429  */
430 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
431 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
432 static int ibd_unattach(ibd_state_t *, dev_info_t *);
433 static int ibd_get_port_details(ibd_state_t *);
434 static int ibd_alloc_cqs(ibd_state_t *);
435 static int ibd_setup_ud_channel(ibd_state_t *);
436 static int ibd_undo_m_start(ibd_state_t *);
437 
438 
439 /*
440  * Miscellaneous helpers
441  */
442 static int ibd_sched_poll(ibd_state_t *, int, int);
443 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
444 static int ibd_resume_transmission(ibd_state_t *);
445 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
446 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
447 static void *list_get_head(list_t *);
448 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
449 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
450 static void ibd_print_warn(ibd_state_t *, char *, ...);
451 #ifdef IBD_LOGGING
452 static void ibd_log(const char *, ...);
453 #endif
454 
455 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
456     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
457 
458 /* Module Driver Info */
459 static struct modldrv ibd_modldrv = {
460 	&mod_driverops,			/* This one is a driver */
461 	"InfiniBand GLDv3 Driver",	/* short description */
462 	&ibd_dev_ops			/* driver specific ops */
463 };
464 
465 /* Module Linkage */
466 static struct modlinkage ibd_modlinkage = {
467 	MODREV_1, (void *)&ibd_modldrv, NULL
468 };
469 
470 /*
471  * Module (static) info passed to IBTL during ibt_attach
472  */
473 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
474 	IBTI_V_CURR,
475 	IBT_NETWORK,
476 	ibd_async_handler,
477 	NULL,
478 	"IPIB"
479 };
480 
481 /*
482  * GLDv3 entry points
483  */
484 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
485 static mac_callbacks_t ibd_m_callbacks = {
486 	IBD_M_CALLBACK_FLAGS,
487 	ibd_m_stat,
488 	ibd_m_start,
489 	ibd_m_stop,
490 	ibd_m_promisc,
491 	ibd_m_multicst,
492 	ibd_m_unicst,
493 	ibd_m_tx,
494 	NULL,
495 	ibd_m_getcapab
496 };
497 
498 /*
499  * Fill/clear <scope> and <p_key> in multicast/broadcast address
500  */
/*
 * Both macros below are statement macros; they are wrapped in
 * do { } while (0) so that an invocation followed by a semicolon is a
 * single statement and remains safe inside unbraced if/else bodies
 * (a bare { } block would leave a stray empty statement behind).
 *
 * The <scope> nibble lives in byte 5 and the 16-bit <p_key> in bytes
 * 8-9 of the 20-byte multicast GID; both are stored in network byte
 * order, hence the htonl() on the masks/values.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
	do {						\
		*(uint32_t *)((char *)(maddr) + 4) |=	\
		    htonl((uint32_t)(scope) << 16);	\
		*(uint32_t *)((char *)(maddr) + 8) |=	\
		    htonl((uint32_t)(pkey) << 16);	\
	} while (0)

#define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
	do {						\
		*(uint32_t *)((char *)(maddr) + 4) &=	\
		    htonl(~((uint32_t)0xF << 16));	\
		*(uint32_t *)((char *)(maddr) + 8) &=	\
		    htonl(~((uint32_t)0xFFFF << 16));	\
	} while (0)
516 
517 /*
518  * Rudimentary debugging support
519  */
520 #ifdef DEBUG
521 int ibd_debuglevel = 100;
522 static void
523 debug_print(int l, char *fmt, ...)
524 {
525 	va_list ap;
526 
527 	if (l < ibd_debuglevel)
528 		return;
529 	va_start(ap, fmt);
530 	vcmn_err(CE_CONT, fmt, ap);
531 	va_end(ap);
532 }
533 #define	DPRINT		debug_print
534 #else
535 #define	DPRINT
536 #endif
537 
538 /*
539  * Common routine to print warning messages; adds in hca guid, port number
540  * and pkey to be able to identify the IBA interface.
541  */
542 static void
543 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
544 {
545 	ib_guid_t hca_guid;
546 	char ibd_print_buf[256];
547 	int len;
548 	va_list ap;
549 
550 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
551 	    0, "hca-guid", 0);
552 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
553 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
554 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
555 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
556 	va_start(ap, fmt);
557 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
558 	    fmt, ap);
559 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
560 	va_end(ap);
561 }
562 
563 /*
564  * Warlock directives
565  */
566 
567 /*
568  * id_lso_lock
569  *
570  * state->id_lso->bkt_nfree may be accessed without a lock to
571  * determine the threshold at which we have to ask the nw layer
572  * to resume transmission (see ibd_resume_transmission()).
573  */
574 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
575     ibd_state_t::id_lso))
576 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
577 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
578 
579 /*
580  * id_cq_poll_lock
581  */
582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
583     ibd_state_t::id_cq_poll_busy))
584 
585 /*
586  * id_txpost_lock
587  */
588 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
589     ibd_state_t::id_tx_head))
590 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
591     ibd_state_t::id_tx_busy))
592 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
593     ibd_state_t::id_tx_tailp))
594 
595 /*
596  * id_rxpost_lock
597  */
598 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
599     ibd_state_t::id_rx_head))
600 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
601     ibd_state_t::id_rx_busy))
602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
603     ibd_state_t::id_rx_tailp))
604 
605 /*
606  * id_acache_req_lock
607  */
608 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
609     ibd_state_t::id_acache_req_cv))
610 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
611     ibd_state_t::id_req_list))
612 
613 /*
614  * id_ac_mutex
615  *
616  * This mutex is actually supposed to protect id_ah_op as well,
617  * but this path of the code isn't clean (see update of id_ah_op
618  * in ibd_async_acache(), immediately after the call to
619  * ibd_async_mcache()). For now, we'll skip this check by
620  * declaring that id_ah_op is protected by some internal scheme
621  * that warlock isn't aware of.
622  */
623 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
624     ibd_state_t::id_ah_active))
625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
626     ibd_state_t::id_ah_free))
627 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
628     ibd_state_t::id_ah_addr))
629 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
630     ibd_state_t::id_ah_op))
631 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
632     ibd_state_t::id_ah_error))
633 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
634 
635 /*
636  * id_mc_mutex
637  */
638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
639     ibd_state_t::id_mc_full))
640 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
641     ibd_state_t::id_mc_non))
642 
643 /*
644  * id_trap_lock
645  */
646 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
647     ibd_state_t::id_trap_cv))
648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
649     ibd_state_t::id_trap_stop))
650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
651     ibd_state_t::id_trap_inprog))
652 
653 /*
654  * id_prom_op
655  */
656 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
657     ibd_state_t::id_prom_op))
658 
659 /*
660  * id_sched_lock
661  */
662 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
663     ibd_state_t::id_sched_needed))
664 
665 /*
666  * id_link_mutex
667  */
668 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
669     ibd_state_t::id_link_state))
670 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
671 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
672     ibd_state_t::id_link_speed))
673 
674 /*
675  * id_tx_list.dl_mutex
676  */
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
678     ibd_state_t::id_tx_list.dl_head))
679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
680     ibd_state_t::id_tx_list.dl_tail))
681 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
682     ibd_state_t::id_tx_list.dl_pending_sends))
683 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
684     ibd_state_t::id_tx_list.dl_cnt))
685 
686 /*
687  * id_rx_list.dl_mutex
688  */
689 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
690     ibd_state_t::id_rx_list.dl_head))
691 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
692     ibd_state_t::id_rx_list.dl_tail))
693 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
694     ibd_state_t::id_rx_list.dl_bufs_outstanding))
695 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
696     ibd_state_t::id_rx_list.dl_cnt))
697 
698 
699 /*
700  * Items protected by atomic updates
701  */
702 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
703     ibd_state_s::id_brd_rcv
704     ibd_state_s::id_brd_xmt
705     ibd_state_s::id_multi_rcv
706     ibd_state_s::id_multi_xmt
707     ibd_state_s::id_num_intrs
708     ibd_state_s::id_rcv_bytes
709     ibd_state_s::id_rcv_pkt
710     ibd_state_s::id_tx_short
711     ibd_state_s::id_xmt_bytes
712     ibd_state_s::id_xmt_pkt))
713 
714 /*
715  * Non-mutex protection schemes for data elements. Almost all of
716  * these are non-shared items.
717  */
718 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
719     callb_cpr
720     ib_gid_s
721     ib_header_info
722     ibd_acache_rq
723     ibd_acache_s::ac_mce
724     ibd_mcache::mc_fullreap
725     ibd_mcache::mc_jstate
726     ibd_mcache::mc_req
727     ibd_rwqe_s
728     ibd_swqe_s
729     ibd_wqe_s
730     ibt_wr_ds_s::ds_va
731     ibt_wr_lso_s
732     ipoib_mac::ipoib_qpn
733     mac_capab_lso_s
734     msgb::b_next
735     msgb::b_rptr
736     msgb::b_wptr))
737 
/*
 * Module load entry point: validate tunable combinations, create the
 * per-instance soft-state container, and register the driver with the
 * GLDv3/modload frameworks.  Returns 0 on success or the failing
 * framework routine's error code.
 */
int
_init()
{
	int status;

	/*
	 * Sanity check some parameter settings. Tx completion polling
	 * only makes sense with separate CQs for Tx and Rx.
	 */
	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
		cmn_err(CE_NOTE, "!ibd: %s",
		    "Setting ibd_txcomp_poll = 0 for combined CQ");
		ibd_txcomp_poll = 0;
	}

	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
	if (status != 0) {
		DPRINT(10, "_init:failed in ddi_soft_state_init()");
		return (status);
	}

	/* a GLDv3 driver must call mac_init_ops() before mod_install() */
	mac_init_ops(&ibd_dev_ops, "ibd");
	status = mod_install(&ibd_modlinkage);
	if (status != 0) {
		DPRINT(10, "_init:failed in mod_install()");
		/* undo both of the steps above before failing the load */
		ddi_soft_state_fini(&ibd_list);
		mac_fini_ops(&ibd_dev_ops);
		return (status);
	}

#ifdef IBD_LOGGING
	ibd_log_init();
#endif
	return (0);
}
773 
774 int
775 _info(struct modinfo *modinfop)
776 {
777 	return (mod_info(&ibd_modlinkage, modinfop));
778 }
779 
780 int
781 _fini()
782 {
783 	int status;
784 
785 	status = mod_remove(&ibd_modlinkage);
786 	if (status != 0)
787 		return (status);
788 
789 	mac_fini_ops(&ibd_dev_ops);
790 	ddi_soft_state_fini(&ibd_list);
791 #ifdef IBD_LOGGING
792 	ibd_log_fini();
793 #endif
794 	return (0);
795 }
796 
797 /*
798  * Convert the GID part of the mac address from network byte order
799  * to host order.
800  */
801 static void
802 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
803 {
804 	ib_sn_prefix_t nbopref;
805 	ib_guid_t nboguid;
806 
807 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
808 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
809 	dgid->gid_prefix = b2h64(nbopref);
810 	dgid->gid_guid = b2h64(nboguid);
811 }
812 
813 /*
814  * Create the IPoIB address in network byte order from host order inputs.
815  */
816 static void
817 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
818     ib_guid_t guid)
819 {
820 	ib_sn_prefix_t nbopref;
821 	ib_guid_t nboguid;
822 
823 	mac->ipoib_qpn = htonl(qpn);
824 	nbopref = h2b64(prefix);
825 	nboguid = h2b64(guid);
826 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
827 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
828 }
829 
830 /*
831  * Send to the appropriate all-routers group when the IBA multicast group
832  * does not exist, based on whether the target group is v4 or v6.
833  */
834 static boolean_t
835 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
836     ipoib_mac_t *rmac)
837 {
838 	boolean_t retval = B_TRUE;
839 	uint32_t adjscope = state->id_scope << 16;
840 	uint32_t topword;
841 
842 	/*
843 	 * Copy the first 4 bytes in without assuming any alignment of
844 	 * input mac address; this will have IPoIB signature, flags and
845 	 * scope bits.
846 	 */
847 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
848 	topword = ntohl(topword);
849 
850 	/*
851 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
852 	 */
853 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
854 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
855 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
856 		    ((uint32_t)(state->id_pkey << 16))),
857 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
858 	else
859 		/*
860 		 * Does not have proper bits in the mgid address.
861 		 */
862 		retval = B_FALSE;
863 
864 	return (retval);
865 }
866 
867 /*
868  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
869  * front of optional src/tgt link layer address. Right now Solaris inserts
870  * padding by default at the end. The routine which is doing is nce_xmit()
871  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
872  * the packet comes down from IP layer to the IBD driver, it is in the
873  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
874  * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
875  * machdr is not 4 byte aligned and had 2 bytes of padding at the end.
876  *
877  * The send routine at IBD driver changes this packet as follows:
878  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
879  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
880  * aligned.
881  *
882  * At the receiving side again ibd_process_rx takes the above packet and
883  * removes the two bytes of front padding and inserts it at the end. This
884  * is since the IP layer does not understand padding at the front.
885  */
/*
 * NOTE(review): this macro unconditionally subtracts the fixed NS/NA
 * header size from its 'len' argument (modifying the caller's variable),
 * and the final two zero-stores rely on the post-loop value of 'i':
 * 0 after the send-side (shift-forward) loop, zeroing the two bytes of
 * front padding, and IPOIB_ADDRL after the receive-side (shift-back)
 * loop, zeroing the two trailing bytes.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t 	*nd_lla_ptr;					\
	icmp6_t 	*icmp6;						\
	nd_opt_hdr_t	*opt;						\
	int 		i;						\
									\
	icmp6 = (icmp6_t *)&ip6h[1];					\
	/* len is now the option bytes beyond the fixed NS/NA header */	\
	len -= sizeof (nd_neighbor_advert_t);				\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    (len != 0)) {						\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		ASSERT(opt != NULL);					\
		nd_lla_ptr = (uchar_t *)&opt[1];			\
		if (type == IBD_SEND) {					\
			/* move lla up 2 bytes, high byte first */	\
			for (i = IPOIB_ADDRL; i > 0; i--)		\
				*(nd_lla_ptr + i + 1) =			\
				    *(nd_lla_ptr + i - 1);		\
		} else {						\
			/* move lla down 2 bytes, low byte first */	\
			for (i = 0; i < IPOIB_ADDRL; i++)		\
				*(nd_lla_ptr + i) =			\
				    *(nd_lla_ptr + i + 2);		\
		}							\
		*(nd_lla_ptr + i) = 0;					\
		*(nd_lla_ptr + i + 1) = 0;				\
	}								\
}
914 
915 /*
916  * Address handle entries maintained by the driver are kept in the
917  * free and active lists. Each entry starts out in the free list;
918  * it migrates to the active list when primed using ibt_get_paths()
919  * and ibt_modify_ud_dest() for transmission to a specific destination.
920  * In the active list, the entry has a reference count indicating the
921  * number of ongoing/uncompleted transmits that reference it. The
922  * entry is left in the active list even after the reference count
923  * goes to 0, since successive transmits can find it there and do
924  * not need to set up another entry (ie the path information is
925  * cached using the active list). Entries on the active list are
926  * also hashed using the destination link address as a key for faster
927  * lookups during transmits.
928  *
929  * For any destination address (unicast or multicast, whatever the
930  * join states), there will be at most one entry in the active list.
931  * Entries with a 0 reference count on the active list can be reused
932  * for a transmit to a new destination, if the free list is empty.
933  *
934  * The AH free list insertion/deletion is protected with the id_ac_mutex,
935  * since the async thread and Tx callback handlers insert/delete. The
936  * active list does not need a lock (all operations are done by the
937  * async thread) but updates to the reference count are atomically
938  * done (increments done by Tx path, decrements by the Tx callback handler).
939  */
/* Push a recycled/unused ace onto the head of the free list. */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
/* Detach and return the first free ace; NULL when the list is empty. */
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
/*
 * Make an ace visible to transmitters: put it on the active list and
 * index it in the active hash, keyed by its destination mac address.
 */
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
}
/* Remove an ace from both the active list and the active hash. */
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
/* Detach and return the first active ace; NULL when the list is empty. */
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)
958 
959 /*
960  * Membership states for different mcg's are tracked by two lists:
961  * the "non" list is used for promiscuous mode, when all mcg traffic
962  * needs to be inspected. This type of membership is never used for
963  * transmission, so there can not be an AH in the active list
964  * corresponding to a member in this list. This list does not need
965  * any protection, since all operations are performed by the async
966  * thread.
967  *
968  * "Full" and "SendOnly" membership is tracked using a single list,
969  * the "full" list. This is because this single list can then be
970  * searched during transmit to a multicast group (if an AH for the
971  * mcg is not found in the active list), since at least one type
972  * of membership must be present before initiating the transmit.
973  * This list is also emptied during driver detach, since sendonly
974  * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Insert/deletes to
976  * this list are done only by the async thread, but it is also
977  * searched in program context (see multicast disable case), thus
978  * the id_mc_mutex protects the list. The driver detach path also
979  * deconstructs the "full" list, but it ensures that the async
980  * thread will not be accessing the list (by blocking out mcg
981  * trap handling and making sure no more Tx reaping will happen).
982  *
983  * Currently, an IBA attach is done in the SendOnly case too,
984  * although this is not required.
985  */
/* Add a FullMember/SendOnlyNonMember entry to the "full" list. */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
/* Add a promiscuous-mode (NonMember) entry to the "non" list. */
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
/* Search the "full" list for a membership entry matching mgid. */
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
/* Search the "non" list for a membership entry matching mgid. */
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
/* Unlink a membership entry from the "full" list. */
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
/* Unlink a membership entry from the "non" list. */
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)
998 
999 /*
1000  * AH and MCE active list manipulation:
1001  *
1002  * Multicast disable requests and MCG delete traps are two cases
1003  * where the active AH entry for the mcg (if any unreferenced one exists)
1004  * will be moved to the free list (to force the next Tx to the mcg to
1005  * join the MCG in SendOnly mode). Port up handling will also move AHs
1006  * from active to free list.
1007  *
1008  * In the case when some transmits are still pending on an entry
1009  * for an mcg, but a multicast disable has already been issued on the
1010  * mcg, there are some options to consider to preserve the join state
1011  * to ensure the emitted packet is properly routed on the IBA fabric.
1012  * For the AH, we can
1013  * 1. take out of active list at multicast disable time.
1014  * 2. take out of active list only when last pending Tx completes.
1015  * For the MCE, we can
1016  * 3. take out of active list at multicast disable time.
1017  * 4. take out of active list only when last pending Tx completes.
1018  * 5. move from active list to stale list at multicast disable time.
1019  * We choose to use 2,4. We use option 4 so that if a multicast enable
1020  * is tried before the pending Tx completes, the enable code finds the
1021  * mce in the active list and just has to make sure it will not be reaped
1022  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1023  * a stale list (#5) that would be checked in the enable code would need
1024  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1025  * after the multicast disable would try to put an AH in the active list,
1026  * and associate the mce it finds in the active list to this new AH,
1027  * whereas the mce is already associated with the previous AH (taken off
1028  * the active list), and will be removed once the pending Tx's complete
1029  * (unless a reference count on mce's is implemented). One implication of
1030  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1031  * grab new references on the AH, further delaying the leave.
1032  *
1033  * In the case of mcg delete (or create) trap when the port is sendonly
1034  * joined, the AH and MCE handling is different: the AH and MCE has to be
1035  * immediately taken off the active lists (forcing a join and path lookup
1036  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1037  * to an mcg as it is repeatedly created and deleted and goes thru
1038  * reincarnations).
1039  *
1040  * When a port is already sendonly joined, and a multicast enable is
1041  * attempted, the same mce structure is promoted; this ensures only a
1042  * single mce on the active list tracks the most powerful join state.
1043  *
1044  * In the case of port up event handling, the MCE for sendonly membership
1045  * is freed up, and the ACE is put into the free list as soon as possible
1046  * (depending on whether posted Tx's have completed). For fullmembership
1047  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1048  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1049  * done; else the mce is deconstructed (mc_fullreap case).
1050  *
1051  * MCG creation and deletion trap handling:
1052  *
1053  * These traps are unreliable (meaning sometimes the trap might never
1054  * be delivered to the subscribed nodes) and may arrive out-of-order
1055  * since they use UD transport. An alternative to relying on these
1056  * unreliable traps is to poll for mcg presence every so often, but
1057  * instead of doing that, we try to be as conservative as possible
1058  * while handling the traps, and hope that the traps do arrive at
1059  * the subscribed nodes soon. Note that if a node is fullmember
1060  * joined to an mcg, it can not possibly receive a mcg create/delete
1061  * trap for that mcg (by fullmember definition); if it does, it is
1062  * an old trap from a previous incarnation of the mcg.
1063  *
1064  * Whenever a trap is received, the driver cleans up its sendonly
1065  * membership to the group; we choose to do a sendonly leave even
1066  * on a creation trap to handle the case of a prior deletion of the mcg
1067  * having gone unnoticed. Consider an example scenario:
1068  * T1: MCG M is deleted, and fires off deletion trap D1.
1069  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1070  * T3: Node N tries to transmit to M, joining in sendonly mode.
1071  * T4: MCG M is deleted, and fires off deletion trap D2.
1072  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1073  *     If the trap is D2, then a LEAVE is not required, since the mcg
1074  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1075  *     approach is to always LEAVE, but the SM may be confused if it
1076  *     receives a LEAVE without a prior JOIN.
1077  *
1078  * Management of the non-membership to an mcg is similar to the above,
1079  * except that if the interface is in promiscuous mode, it is required
1080  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1081  * if the re-join attempt fails (in which case a warning message needs
1082  * to be printed), it is not clear whether it failed due to the mcg not
1083  * existing, or some fabric/hca issues, due to the delayed nature of
1084  * trap delivery. Querying the SA to establish presence/absence of the
1085  * mcg is also racy at best. Thus, the driver just prints a warning
1086  * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
1088  * received after the mcg was deleted.
1089  */
1090 
1091 /*
1092  * Implementation of atomic "recycle" bits and reference count
1093  * on address handles. This utilizes the fact that max reference
1094  * count on any handle is limited by number of send wqes, thus
1095  * high bits in the ac_ref field can be used as the recycle bits,
1096  * and only the low bits hold the number of pending Tx requests.
1097  * This atomic AH reference counting allows the Tx completion
1098  * handler not to acquire the id_ac_mutex to process every completion,
1099  * thus reducing lock contention problems between completion and
1100  * the Tx path.
1101  */
/*
 * The recycle ("cycle") flag lives in a bit above the highest possible
 * reference count (bounded by the number of send wqes), so a single
 * atomic word (ac_ref) carries both the flag and the pending Tx count.
 *
 * Fix: SET_CYCLE_IF_REF and DEC_REF_DO_CYCLE now parenthesize the
 * macro argument (&(ace)->ac_ref), matching INC_REF/CLEAR_REFCYCLE;
 * without the parentheses an argument such as "p + 1" would misparse.
 */
#define	CYCLEVAL		0x80000
/* Reset the recycle bit and the reference count together. */
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
/* Nonzero iff the recycle bit is set on this ace. */
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
/* Raw value: recycle bit plus pending Tx reference count. */
#define	GET_REF(ace)		((ace)->ac_ref)
/* Reference count alone; the recycle bit must already be set. */
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
/* Grab num references; called from the Tx path without id_ac_mutex. */
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
/*
 * Atomically set the recycle bit iff the reference count is non 0;
 * evaluates to B_TRUE when the bit ends up set (Tx's still pending),
 * B_FALSE when the count was 0 and the ace can be reclaimed now.
 */
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&(ace)->ac_ref, CYCLEVAL) == \
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
/*
 * Drop one reference; evaluates to B_TRUE when this was the last
 * pending Tx on an ace already marked for recycling.
 */
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_add_32_nv(&(ace)->ac_ref, -1) ==		\
	    CYCLEVAL ?					\
		/*					\
		 * Ref count known to be 0 from above.	\
		 */					\
		B_TRUE :				\
		B_FALSE					\
)
1139 
1140 static void *
1141 list_get_head(list_t *list)
1142 {
1143 	list_node_t *lhead = list_head(list);
1144 
1145 	if (lhead != NULL)
1146 		list_remove(list, lhead);
1147 	return (lhead);
1148 }
1149 
1150 /*
1151  * This is always guaranteed to be able to queue the work.
1152  */
1153 static void
1154 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1155 {
1156 	/* Initialize request */
1157 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1158 	ptr->rq_op = op;
1159 
1160 	/*
1161 	 * Queue provided slot onto request pool.
1162 	 */
1163 	mutex_enter(&state->id_acache_req_lock);
1164 	list_insert_tail(&state->id_req_list, ptr);
1165 
1166 	/* Go, fetch, async thread */
1167 	cv_signal(&state->id_acache_req_cv);
1168 	mutex_exit(&state->id_acache_req_lock);
1169 }
1170 
1171 /*
1172  * Main body of the per interface async thread.
1173  */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	/* Register with CPR; the callback framework uses the request lock. */
	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		/* Pull the next queued request, if any. */
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			/* Drop the lock while servicing the request. */
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
				case IBD_ASYNC_GETAH:
					ibd_async_acache(state, &ptr->rq_mac);
					break;
				case IBD_ASYNC_JOIN:
				case IBD_ASYNC_LEAVE:
					ibd_async_multicast(state,
					    ptr->rq_gid, ptr->rq_op);
					break;
				case IBD_ASYNC_PROMON:
					ibd_async_setprom(state);
					break;
				case IBD_ASYNC_PROMOFF:
					ibd_async_unsetprom(state);
					break;
				case IBD_ASYNC_REAP:
					ibd_async_reap_group(state,
					    ptr->rq_ptr, ptr->rq_gid,
					    IB_MC_JSTATE_FULL);
					/*
					 * The req slot is embedded in the
					 * mce structure, so it must not be
					 * freed here; clear ptr so the
					 * kmem_cache_free below is skipped.
					 */
					ptr = NULL;
					break;
				case IBD_ASYNC_TRAP:
					ibd_async_trap(state, ptr);
					break;
				case IBD_ASYNC_SCHED:
					ibd_async_txsched(state);
					break;
				case IBD_ASYNC_LINK:
					ibd_async_link(state, ptr);
					break;
				case IBD_ASYNC_EXIT:
					/*
					 * Thread shutdown. CALLB_CPR_EXIT
					 * drops the lock itself, hence the
					 * explicit mutex_exit only in the
					 * lock_lint build.
					 */
					mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
					CALLB_CPR_EXIT(&cprinfo);
#else
					mutex_exit(&state->id_acache_req_lock);
#endif
					return;
			}
			/* Requests not consumed above go back to the cache. */
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}
1262 
1263 /*
1264  * Return when it is safe to queue requests to the async daemon; primarily
1265  * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
1267  */
1268 static boolean_t
1269 ibd_async_safe(ibd_state_t *state)
1270 {
1271 	mutex_enter(&state->id_trap_lock);
1272 	if (state->id_trap_stop) {
1273 		mutex_exit(&state->id_trap_lock);
1274 		return (B_FALSE);
1275 	}
1276 	state->id_trap_inprog++;
1277 	mutex_exit(&state->id_trap_lock);
1278 	return (B_TRUE);
1279 }
1280 
1281 /*
1282  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1283  * trap or event handling to complete to kill the async thread and deconstruct
1284  * the mcg/ace list.
1285  */
1286 static void
1287 ibd_async_done(ibd_state_t *state)
1288 {
1289 	mutex_enter(&state->id_trap_lock);
1290 	if (--state->id_trap_inprog == 0)
1291 		cv_signal(&state->id_trap_cv);
1292 	mutex_exit(&state->id_trap_lock);
1293 }
1294 
1295 /*
1296  * Hash functions:
1297  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1298  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1299  * These operate on mac addresses input into ibd_send, but there is no
1300  * guarantee on the alignment of the ipoib_mac_t structure.
1301  */
1302 /*ARGSUSED*/
1303 static uint_t
1304 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1305 {
1306 	ulong_t ptraddr = (ulong_t)key;
1307 	uint_t hval;
1308 
1309 	/*
1310 	 * If the input address is 4 byte aligned, we can just dereference
1311 	 * it. This is most common, since IP will send in a 4 byte aligned
1312 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1313 	 * 4 byte aligned too.
1314 	 */
1315 	if ((ptraddr & 3) == 0)
1316 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1317 
1318 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1319 	return (hval);
1320 }
1321 
1322 static int
1323 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1324 {
1325 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1326 		return (0);
1327 	else
1328 		return (1);
1329 }
1330 
1331 /*
1332  * Initialize all the per interface caches and lists; AH cache,
1333  * MCG list etc.
1334  */
1335 static int
1336 ibd_acache_init(ibd_state_t *state)
1337 {
1338 	ibd_ace_t *ce;
1339 	int i;
1340 
1341 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1342 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1343 
1344 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1345 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1346 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1347 	    offsetof(ibd_ace_t, ac_list));
1348 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1349 	    offsetof(ibd_ace_t, ac_list));
1350 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1351 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1352 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1353 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1354 	    offsetof(ibd_mce_t, mc_list));
1355 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1356 	    offsetof(ibd_mce_t, mc_list));
1357 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1358 	    offsetof(ibd_req_t, rq_list));
1359 
1360 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1361 	    IBD_NUM_AH, KM_SLEEP);
1362 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1363 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1364 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1365 			ibd_acache_fini(state);
1366 			return (DDI_FAILURE);
1367 		} else {
1368 			CLEAR_REFCYCLE(ce);
1369 			ce->ac_mce = NULL;
1370 			IBD_ACACHE_INSERT_FREE(state, ce);
1371 		}
1372 	}
1373 	return (DDI_SUCCESS);
1374 }
1375 
1376 static void
1377 ibd_acache_fini(ibd_state_t *state)
1378 {
1379 	ibd_ace_t *ptr;
1380 
1381 	mutex_enter(&state->id_ac_mutex);
1382 
1383 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1384 		ASSERT(GET_REF(ptr) == 0);
1385 		(void) ibt_free_ud_dest(ptr->ac_dest);
1386 	}
1387 
1388 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1389 		ASSERT(GET_REF(ptr) == 0);
1390 		(void) ibt_free_ud_dest(ptr->ac_dest);
1391 	}
1392 
1393 	list_destroy(&state->id_ah_free);
1394 	list_destroy(&state->id_ah_active);
1395 	list_destroy(&state->id_mc_full);
1396 	list_destroy(&state->id_mc_non);
1397 	list_destroy(&state->id_req_list);
1398 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1399 	mutex_exit(&state->id_ac_mutex);
1400 	mutex_destroy(&state->id_ac_mutex);
1401 	mutex_destroy(&state->id_mc_mutex);
1402 	mutex_destroy(&state->id_acache_req_lock);
1403 	cv_destroy(&state->id_acache_req_cv);
1404 }
1405 
1406 /*
1407  * Search AH active hash list for a cached path to input destination.
1408  * If we are "just looking", hold == F. When we are in the Tx path,
1409  * we set hold == T to grab a reference on the AH so that it can not
1410  * be recycled to a new destination while the Tx request is posted.
1411  */
1412 static ibd_ace_t *
1413 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1414 {
1415 	ibd_ace_t *ptr;
1416 
1417 	ASSERT(mutex_owned(&state->id_ac_mutex));
1418 
1419 	/*
1420 	 * Do hash search.
1421 	 */
1422 	if (mod_hash_find(state->id_ah_active_hash,
1423 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1424 		if (hold)
1425 			INC_REF(ptr, num);
1426 		return (ptr);
1427 	}
1428 	return (NULL);
1429 }
1430 
1431 /*
1432  * This is called by the tx side; if an initialized AH is found in
1433  * the active list, it is locked down and can be used; if no entry
1434  * is found, an async request is queued to do path resolution.
1435  */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
	ibd_ace_t *ptr;
	ibd_req_t *req;

	/*
	 * Only attempt to print when we can; in the mdt pattr case, the
	 * address is not aligned properly.
	 */
	if (((ulong_t)mac & 3) == 0) {
		DPRINT(4,
		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
		    htonl(mac->ipoib_gidsuff[1]));
	}

	mutex_enter(&state->id_ac_mutex);

	/* Fast path: cached AH found; reference taken by the find. */
	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}

	/*
	 * Implementation of a single outstanding async request; if
	 * the operation is not started yet, queue a request and move
	 * to ongoing state. Remember in id_ah_addr for which address
	 * we are queueing the request, in case we need to flag an error;
	 * Any further requests, for the same or different address, until
	 * the operation completes, is sent back to GLDv3 to be retried.
	 * The async thread will update id_ah_op with an error indication
	 * or will set it to indicate the next look up can start; either
	 * way, it will mac_tx_update() so that all blocked requests come
	 * back here.
	 */
	*err = EAGAIN;
	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
		/*
		 * No lookup in flight; try to queue one. If the KM_NOSLEEP
		 * allocation fails, nothing is queued and the caller
		 * simply retries later (err stays EAGAIN).
		 */
		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
		if (req != NULL) {
			/*
			 * We did not even find the entry; queue a request
			 * for it.
			 */
			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
			state->id_ah_op = IBD_OP_ONGOING;
			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
		}
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
		/*
		 * Check the status of the pathrecord lookup request
		 * we had queued before.
		 */
		if (state->id_ah_op == IBD_OP_ERRORED) {
			*err = EFAULT;
			state->id_ah_error++;
		} else {
			/*
			 * IBD_OP_ROUTERED case: We need to send to the
			 * all-router MCG. If we can find the AH for
			 * the mcg, the Tx will be attempted. If we
			 * do not find the AH, we return NORESOURCES
			 * to retry.
			 */
			ipoib_mac_t routermac;

			(void) ibd_get_allroutergroup(state, mac, &routermac);
			ptr = ibd_acache_find(state, &routermac, B_TRUE,
			    numwqe);
		}
		/* Either way, this lookup is finished; allow a new one. */
		state->id_ah_op = IBD_OP_NOTSTARTED;
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
		/*
		 * This case can happen when we get a higher band
		 * packet. The easiest way is to reset the state machine
		 * to accommodate the higher priority packet.
		 */
		state->id_ah_op = IBD_OP_NOTSTARTED;
	}
	mutex_exit(&state->id_ac_mutex);

	/* NULL unless the ROUTERED branch above found a router AH. */
	return (ptr);
}
1523 
1524 /*
1525  * Grab a not-currently-in-use AH/PathRecord from the active
1526  * list to recycle to a new destination. Only the async thread
1527  * executes this code.
1528  */
1529 static ibd_ace_t *
1530 ibd_acache_get_unref(ibd_state_t *state)
1531 {
1532 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1533 
1534 	ASSERT(mutex_owned(&state->id_ac_mutex));
1535 
1536 	/*
1537 	 * Do plain linear search.
1538 	 */
1539 	while (ptr != NULL) {
1540 		/*
1541 		 * Note that it is possible that the "cycle" bit
1542 		 * is set on the AH w/o any reference count. The
1543 		 * mcg must have been deleted, and the tx cleanup
1544 		 * just decremented the reference count to 0, but
1545 		 * hasn't gotten around to grabbing the id_ac_mutex
1546 		 * to move the AH into the free list.
1547 		 */
1548 		if (GET_REF(ptr) == 0) {
1549 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1550 			break;
1551 		}
1552 		ptr = list_next(&state->id_ah_active, ptr);
1553 	}
1554 	return (ptr);
1555 }
1556 
1557 /*
1558  * Invoked to clean up AH from active list in case of multicast
1559  * disable and to handle sendonly memberships during mcg traps.
1560  * And for port up processing for multicast and unicast AHs.
1561  * Normally, the AH is taken off the active list, and put into
1562  * the free list to be recycled for a new destination. In case
1563  * Tx requests on the AH have not completed yet, the AH is marked
1564  * for reaping (which will put the AH on the free list) once the Tx's
1565  * complete; in this case, depending on the "force" input, we take
1566  * out the AH from the active list right now, or leave it also for
1567  * the reap operation. Returns TRUE if the AH is taken off the active
1568  * list (and either put into the free list right now, or arranged for
1569  * later), FALSE otherwise.
1570  */
1571 static boolean_t
1572 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1573 {
1574 	ibd_ace_t *acactive;
1575 	boolean_t ret = B_TRUE;
1576 
1577 	ASSERT(mutex_owned(&state->id_ac_mutex));
1578 
1579 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1580 
1581 		/*
1582 		 * Note that the AH might already have the cycle bit set
1583 		 * on it; this might happen if sequences of multicast
1584 		 * enables and disables are coming so fast, that posted
1585 		 * Tx's to the mcg have not completed yet, and the cycle
1586 		 * bit is set successively by each multicast disable.
1587 		 */
1588 		if (SET_CYCLE_IF_REF(acactive)) {
1589 			if (!force) {
1590 				/*
1591 				 * The ace is kept on the active list, further
1592 				 * Tx's can still grab a reference on it; the
1593 				 * ace is reaped when all pending Tx's
1594 				 * referencing the AH complete.
1595 				 */
1596 				ret = B_FALSE;
1597 			} else {
1598 				/*
1599 				 * In the mcg trap case, we always pull the
1600 				 * AH from the active list. And also the port
1601 				 * up multi/unicast case.
1602 				 */
1603 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1604 				acactive->ac_mce = NULL;
1605 			}
1606 		} else {
1607 			/*
1608 			 * Determined the ref count is 0, thus reclaim
1609 			 * immediately after pulling out the ace from
1610 			 * the active list.
1611 			 */
1612 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1613 			acactive->ac_mce = NULL;
1614 			IBD_ACACHE_INSERT_FREE(state, acactive);
1615 		}
1616 
1617 	}
1618 	return (ret);
1619 }
1620 
1621 /*
1622  * Helper function for async path record lookup. If we are trying to
1623  * Tx to a MCG, check our membership, possibly trying to join the
1624  * group if required. If that fails, try to send the packet to the
1625  * all router group (indicated by the redirect output), pointing
1626  * the input mac address to the router mcg address.
1627  */
1628 static ibd_mce_t *
1629 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1630 {
1631 	ib_gid_t mgid;
1632 	ibd_mce_t *mce;
1633 	ipoib_mac_t routermac;
1634 
1635 	*redirect = B_FALSE;
1636 	ibd_n2h_gid(mac, &mgid);
1637 
1638 	/*
1639 	 * Check the FullMember+SendOnlyNonMember list.
1640 	 * Since we are the only one who manipulates the
1641 	 * id_mc_full list, no locks are needed.
1642 	 */
1643 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1644 	if (mce != NULL) {
1645 		DPRINT(4, "ibd_async_mcache : already joined to group");
1646 		return (mce);
1647 	}
1648 
1649 	/*
1650 	 * Not found; try to join(SendOnlyNonMember) and attach.
1651 	 */
1652 	DPRINT(4, "ibd_async_mcache : not joined to group");
1653 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1654 	    NULL) {
1655 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1656 		return (mce);
1657 	}
1658 
1659 	/*
1660 	 * MCGroup not present; try to join the all-router group. If
1661 	 * any of the following steps succeed, we will be redirecting
1662 	 * to the all router group.
1663 	 */
1664 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1665 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1666 		return (NULL);
1667 	*redirect = B_TRUE;
1668 	ibd_n2h_gid(&routermac, &mgid);
1669 	bcopy(&routermac, mac, IPOIB_ADDRL);
1670 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1671 	    mgid.gid_prefix, mgid.gid_guid);
1672 
1673 	/*
1674 	 * Are we already joined to the router group?
1675 	 */
1676 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1677 		DPRINT(4, "ibd_async_mcache : using already joined router"
1678 		    "group\n");
1679 		return (mce);
1680 	}
1681 
1682 	/*
1683 	 * Can we join(SendOnlyNonMember) the router group?
1684 	 */
1685 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1686 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1687 	    NULL) {
1688 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1689 		return (mce);
1690 	}
1691 
1692 	return (NULL);
1693 }
1694 
1695 /*
1696  * Async path record lookup code.
1697  */
1698 static void
1699 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1700 {
1701 	ibd_ace_t *ce;
1702 	ibd_mce_t *mce = NULL;
1703 	ibt_path_attr_t path_attr;
1704 	ibt_path_info_t path_info;
1705 	ib_gid_t destgid;
1706 	char ret = IBD_OP_NOTSTARTED;
1707 
1708 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1709 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1710 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1711 	    htonl(mac->ipoib_gidsuff[1]));
1712 
1713 	/*
1714 	 * Check whether we are trying to transmit to a MCG.
1715 	 * In that case, we need to make sure we are a member of
1716 	 * the MCG.
1717 	 */
1718 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1719 		boolean_t redirected;
1720 
1721 		/*
1722 		 * If we can not find or join the group or even
1723 		 * redirect, error out.
1724 		 */
1725 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1726 		    NULL) {
1727 			state->id_ah_op = IBD_OP_ERRORED;
1728 			return;
1729 		}
1730 
1731 		/*
1732 		 * If we got redirected, we need to determine whether
1733 		 * the AH for the new mcg is in the cache already, and
1734 		 * not pull it in then; otherwise proceed to get the
1735 		 * path for the new mcg. There is no guarantee that
1736 		 * if the AH is currently in the cache, it will still be
1737 		 * there when we look in ibd_acache_lookup(), but that's
1738 		 * okay, we will come back here.
1739 		 */
1740 		if (redirected) {
1741 			ret = IBD_OP_ROUTERED;
1742 			DPRINT(4, "ibd_async_acache :  redirected to "
1743 			    "%08X:%08X:%08X:%08X:%08X",
1744 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1745 			    htonl(mac->ipoib_gidpref[1]),
1746 			    htonl(mac->ipoib_gidsuff[0]),
1747 			    htonl(mac->ipoib_gidsuff[1]));
1748 
1749 			mutex_enter(&state->id_ac_mutex);
1750 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1751 				state->id_ah_op = IBD_OP_ROUTERED;
1752 				mutex_exit(&state->id_ac_mutex);
1753 				DPRINT(4, "ibd_async_acache : router AH found");
1754 				return;
1755 			}
1756 			mutex_exit(&state->id_ac_mutex);
1757 		}
1758 	}
1759 
1760 	/*
1761 	 * Get an AH from the free list.
1762 	 */
1763 	mutex_enter(&state->id_ac_mutex);
1764 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1765 		/*
1766 		 * No free ones; try to grab an unreferenced active
1767 		 * one. Maybe we need to make the active list LRU,
1768 		 * but that will create more work for Tx callbacks.
1769 		 * Is there a way of not having to pull out the
1770 		 * entry from the active list, but just indicate it
1771 		 * is being recycled? Yes, but that creates one more
1772 		 * check in the fast lookup path.
1773 		 */
1774 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1775 			/*
1776 			 * Pretty serious shortage now.
1777 			 */
1778 			state->id_ah_op = IBD_OP_NOTSTARTED;
1779 			mutex_exit(&state->id_ac_mutex);
1780 			DPRINT(10, "ibd_async_acache : failed to find AH "
1781 			    "slot\n");
1782 			return;
1783 		}
1784 		/*
1785 		 * We could check whether ac_mce points to a SendOnly
1786 		 * member and drop that membership now. Or do it lazily
1787 		 * at detach time.
1788 		 */
1789 		ce->ac_mce = NULL;
1790 	}
1791 	mutex_exit(&state->id_ac_mutex);
1792 	ASSERT(ce->ac_mce == NULL);
1793 
1794 	/*
1795 	 * Update the entry.
1796 	 */
1797 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1798 
1799 	bzero(&path_info, sizeof (path_info));
1800 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1801 	path_attr.pa_sgid = state->id_sgid;
1802 	path_attr.pa_num_dgids = 1;
1803 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1804 	path_attr.pa_dgids = &destgid;
1805 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1806 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1807 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1808 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1809 		goto error;
1810 	}
1811 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1812 	    ntohl(ce->ac_mac.ipoib_qpn),
1813 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1814 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1815 		goto error;
1816 	}
1817 
1818 	/*
1819 	 * mce is set whenever an AH is being associated with a
1820 	 * MCG; this will come in handy when we leave the MCG. The
1821 	 * lock protects Tx fastpath from scanning the active list.
1822 	 */
1823 	if (mce != NULL)
1824 		ce->ac_mce = mce;
1825 	mutex_enter(&state->id_ac_mutex);
1826 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1827 	state->id_ah_op = ret;
1828 	mutex_exit(&state->id_ac_mutex);
1829 	return;
1830 error:
1831 	/*
1832 	 * We might want to drop SendOnly membership here if we
1833 	 * joined above. The lock protects Tx callbacks inserting
1834 	 * into the free list.
1835 	 */
1836 	mutex_enter(&state->id_ac_mutex);
1837 	state->id_ah_op = IBD_OP_ERRORED;
1838 	IBD_ACACHE_INSERT_FREE(state, ce);
1839 	mutex_exit(&state->id_ac_mutex);
1840 }
1841 
1842 /*
1843  * While restoring port's presence on the subnet on a port up, it is possible
1844  * that the port goes down again.
1845  */
static void
ibd_async_link(ibd_state_t *state, ibd_req_t *req)
{
	/* ibd_link_mod() smuggles the opcode through the rq_ptr field */
	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
	    LINK_STATE_UP;
	ibd_mce_t *mce, *pmce;
	ibd_ace_t *ace, *pace;

	DPRINT(10, "ibd_async_link(): %d", opcode);

	/*
	 * On a link up, revalidate the link speed/width. No point doing
	 * this on a link down, since we will be unable to do SA operations,
	 * defaulting to the lowest speed. Also notice that we update our
	 * notion of speed before calling mac_link_update(), which will do
	 * neccesary higher level notifications for speed changes.
	 */
	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
		state->id_link_speed = ibd_get_portspeed(state);
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
	}

	/*
	 * Do all the work required to establish our presence on
	 * the subnet.
	 */
	if (opcode == IBD_LINK_UP_ABSENT) {
		/*
		 * If in promiscuous mode ...
		 */
		if (state->id_prom_op == IBD_OP_COMPLETED) {
			/*
			 * Drop all nonmembership.
			 */
			ibd_async_unsetprom(state);

			/*
			 * Then, try to regain nonmembership to all mcg's.
			 */
			ibd_async_setprom(state);

		}

		/*
		 * Drop all sendonly membership (which also gets rid of the
		 * AHs); try to reacquire all full membership.
		 */
		mce = list_head(&state->id_mc_full);
		while ((pmce = mce) != NULL) {
			/* advance first: pmce may be torn down below */
			mce = list_next(&state->id_mc_full, mce);
			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
				ibd_leave_group(state,
				    pmce->mc_info.mc_adds_vect.av_dgid,
				    IB_MC_JSTATE_SEND_ONLY_NON);
			else
				ibd_reacquire_group(state, pmce);
		}

		/*
		 * Recycle all active AHs to free list (and if there are
		 * pending posts, make sure they will go into the free list
		 * once the Tx's complete). Grab the lock to prevent
		 * concurrent Tx's as well as Tx cleanups.
		 */
		mutex_enter(&state->id_ac_mutex);
		ace = list_head(&state->id_ah_active);
		while ((pace = ace) != NULL) {
			boolean_t cycled;

			/* advance first: pace may leave the active list */
			ace = list_next(&state->id_ah_active, ace);
			mce = pace->ac_mce;
			cycled = ibd_acache_recycle(state, &pace->ac_mac,
			    B_TRUE);
			/*
			 * If this is for an mcg, it must be for a fullmember,
			 * since we got rid of send-only members above when
			 * processing the mce list.
			 */
			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
			    IB_MC_JSTATE_FULL)));

			/*
			 * Check if the fullmember mce needs to be torn down,
			 * ie whether the DLPI disable has already been done.
			 * If so, do some of the work of tx_cleanup, namely
			 * causing leave (which will fail), detach and
			 * mce-freeing. tx_cleanup will put the AH into free
			 * list. The reason to duplicate some of this
			 * tx_cleanup work is because we want to delete the
			 * AH right now instead of waiting for tx_cleanup, to
			 * force subsequent Tx's to reacquire an AH.
			 */
			if ((mce != NULL) && (mce->mc_fullreap))
				ibd_async_reap_group(state, mce,
				    mce->mc_info.mc_adds_vect.av_dgid,
				    mce->mc_jstate);
		}
		mutex_exit(&state->id_ac_mutex);
	}

	/*
	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
	 * (which stops further events from being delivered) before
	 * mac_unregister(). At this point, it is guaranteed that mac_register
	 * has already been done.
	 */
	mutex_enter(&state->id_link_mutex);
	state->id_link_state = lstate;
	mac_link_update(state->id_mh, lstate);
	mutex_exit(&state->id_link_mutex);

	ibd_async_done(state);
}
1961 
1962 /*
1963  * Check the pkey table to see if we can find the pkey we're looking for.
1964  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1965  * failure.
1966  */
1967 static int
1968 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1969     uint16_t *pkix)
1970 {
1971 	uint16_t ndx;
1972 
1973 	ASSERT(pkix != NULL);
1974 
1975 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1976 		if (pkey_tbl[ndx] == pkey) {
1977 			*pkix = ndx;
1978 			return (0);
1979 		}
1980 	}
1981 	return (-1);
1982 }
1983 
1984 /*
1985  * When the link is notified up, we need to do a few things, based
1986  * on the port's current p_init_type_reply claiming a reinit has been
1987  * done or not. The reinit steps are:
1988  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1989  *    the old Pkey and GID0 are correct.
1990  * 2. Register for mcg traps (already done by ibmf).
1991  * 3. If PreservePresenceReply indicates the SM has restored port's presence
1992  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
1993  * 4. Give up all sendonly memberships.
1994  * 5. Acquire all full memberships.
1995  * 6. In promiscuous mode, acquire all non memberships.
1996  * 7. Recycle all AHs to free list.
1997  */
static void
ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
{
	ibt_hca_portinfo_t *port_infop = NULL;
	ibt_status_t ibt_status;
	uint_t psize, port_infosz;
	ibd_link_op_t opcode;
	ibd_req_t *req;
	link_state_t new_link_state = LINK_STATE_UP;
	uint8_t itreply;
	uint16_t pkix;

	/*
	 * Do not send a request to the async daemon if it has not
	 * yet been created or is being destroyed. If the async
	 * daemon has not yet been created, we still need to track
	 * last known state of the link. If this code races with the
	 * detach path, then we are assured that the detach path has
	 * not yet done the ibt_close_hca (which waits for all async
	 * events to complete). If the code races with the attach path,
	 * we need to validate the pkey/gid (in the link_up case) if
	 * the initialization path has already set these up and created
	 * IBTF resources based on the values.
	 */
	mutex_enter(&state->id_link_mutex);

	/*
	 * If the init code in ibd_m_start hasn't yet set up the
	 * pkey/gid, nothing to do; that code will set the link state.
	 */
	if (state->id_link_state == LINK_STATE_UNKNOWN) {
		mutex_exit(&state->id_link_mutex);
		return;
	}

	/*
	 * If this routine was called in response to a port down event,
	 * we just need to see if this should be informed.
	 */
	if (code == IBT_ERROR_PORT_DOWN) {
		new_link_state = LINK_STATE_DOWN;
		goto update_link_state;
	}

	/*
	 * If it's not a port down event we've received, try to get the port
	 * attributes first. If we fail here, the port is as good as down.
	 * Otherwise, if the link went down by the time the handler gets
	 * here, give up - we cannot even validate the pkey/gid since those
	 * are not valid and this is as bad as a port down anyway.
	 */
	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &port_infop, &psize, &port_infosz);
	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
		new_link_state = LINK_STATE_DOWN;
		goto update_link_state;
	}

	/*
	 * Check the SM InitTypeReply flags. If both NoLoadReply and
	 * PreserveContentReply are 0, we don't know anything about the
	 * data loaded into the port attributes, so we need to verify
	 * if gid0 and pkey are still valid.
	 *
	 * Note that itreply is assigned on every path that can later
	 * report a link up; the link-down paths above never read it.
	 */
	itreply = port_infop->p_init_type_reply;
	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
		/*
		 * Check to see if the subnet part of GID0 has changed. If
		 * not, check the simple case first to see if the pkey
		 * index is the same as before; finally check to see if the
		 * pkey has been relocated to a different index in the table.
		 */
		if (bcmp(port_infop->p_sgid_tbl,
		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {

			new_link_state = LINK_STATE_DOWN;

		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
		    state->id_pkey) {

			new_link_state = LINK_STATE_UP;

		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {

			/*
			 * The pkey has moved to a new index; IBTF
			 * resources were created against the old index,
			 * so restart the interface to rebuild them.
			 * Drop the link mutex before stopping/starting.
			 */
			ibt_free_portinfo(port_infop, port_infosz);
			mutex_exit(&state->id_link_mutex);

			ibd_m_stop(state);
			if ((ibt_status = ibd_m_start(state)) != IBT_SUCCESS) {
				DPRINT(10, "link_mod: cannot "
				    "restart, ret=%d", ibt_status);
			}
			return;
		} else {
			new_link_state = LINK_STATE_DOWN;
		}
	}

update_link_state:
	if (port_infop) {
		ibt_free_portinfo(port_infop, port_infosz);
	}

	/*
	 * If the old state is the same as the new state, nothing to do
	 */
	if (state->id_link_state == new_link_state) {
		mutex_exit(&state->id_link_mutex);
		return;
	}

	/*
	 * Ok, so there was a link state change; see if it's safe to ask
	 * the async thread to do the work
	 */
	if (!ibd_async_safe(state)) {
		state->id_link_state = new_link_state;
		mutex_exit(&state->id_link_mutex);
		return;
	}

	mutex_exit(&state->id_link_mutex);

	/*
	 * If we're reporting a link up, check InitTypeReply to see if
	 * the SM has ensured that the port's presence in mcg, traps,
	 * etc. is intact.
	 */
	if (new_link_state == LINK_STATE_DOWN) {
		opcode = IBD_LINK_DOWN;
	} else {
		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
			opcode = IBD_LINK_UP;
		} else {
			opcode = IBD_LINK_UP_ABSENT;
		}
	}

	/*
	 * Queue up a request for ibd_async_link() to handle this link
	 * state change event; the opcode rides in rq_ptr.
	 */
	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
	req->rq_ptr = (void *)opcode;
	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
}
2148 
2149 /*
2150  * For the port up/down events, IBTL guarantees there will not be concurrent
2151  * invocations of the handler. IBTL might coalesce link transition events,
2152  * and not invoke the handler for _each_ up/down transition, but it will
2153  * invoke the handler with last known state
2154  */
2155 static void
2156 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2157     ibt_async_code_t code, ibt_async_event_t *event)
2158 {
2159 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2160 
2161 	switch (code) {
2162 	case IBT_ERROR_CATASTROPHIC_CHAN:
2163 		ibd_print_warn(state, "catastrophic channel error");
2164 		break;
2165 	case IBT_ERROR_CQ:
2166 		ibd_print_warn(state, "completion queue error");
2167 		break;
2168 	case IBT_PORT_CHANGE_EVENT:
2169 		/*
2170 		 * Events will be delivered to all instances that have
2171 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2172 		 * Only need to do work for our port; IBTF will deliver
2173 		 * events for other ports on the hca we have ibt_open_hca'ed
2174 		 * too. Note that id_port is initialized in ibd_attach()
2175 		 * before we do an ibt_open_hca() in ibd_attach().
2176 		 */
2177 		ASSERT(state->id_hca_hdl == hca_hdl);
2178 		if (state->id_port != event->ev_port)
2179 			break;
2180 
2181 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2182 		    IBT_PORT_CHANGE_PKEY) {
2183 			ibd_link_mod(state, code);
2184 		}
2185 		break;
2186 	case IBT_ERROR_PORT_DOWN:
2187 	case IBT_CLNT_REREG_EVENT:
2188 	case IBT_EVENT_PORT_UP:
2189 		/*
2190 		 * Events will be delivered to all instances that have
2191 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2192 		 * Only need to do work for our port; IBTF will deliver
2193 		 * events for other ports on the hca we have ibt_open_hca'ed
2194 		 * too. Note that id_port is initialized in ibd_attach()
2195 		 * before we do an ibt_open_hca() in ibd_attach().
2196 		 */
2197 		ASSERT(state->id_hca_hdl == hca_hdl);
2198 		if (state->id_port != event->ev_port)
2199 			break;
2200 
2201 		ibd_link_mod(state, code);
2202 		break;
2203 
2204 	case IBT_HCA_ATTACH_EVENT:
2205 	case IBT_HCA_DETACH_EVENT:
2206 		/*
2207 		 * When a new card is plugged to the system, attach_event is
2208 		 * invoked. Additionally, a cfgadm needs to be run to make the
2209 		 * card known to the system, and an ifconfig needs to be run to
2210 		 * plumb up any ibd interfaces on the card. In the case of card
2211 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2212 		 * unplumb the ibd interfaces on the card; when the card is
2213 		 * actually unplugged, the detach_event is invoked;
2214 		 * additionally, if any ibd instances are still active on the
2215 		 * card (eg there were no associated RCM scripts), driver's
2216 		 * detach routine is invoked.
2217 		 */
2218 		break;
2219 	default:
2220 		break;
2221 	}
2222 }
2223 
2224 static int
2225 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2226 {
2227 	mac_register_t *macp;
2228 	int ret;
2229 
2230 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2231 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2232 		return (DDI_FAILURE);
2233 	}
2234 
2235 	/*
2236 	 * Note that when we register with mac during attach, we don't
2237 	 * have the id_macaddr yet, so we'll simply be registering a
2238 	 * zero macaddr that we'll overwrite later during plumb (in
2239 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2240 	 * update the mac layer with the correct mtu during plumb.
2241 	 */
2242 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2243 	macp->m_driver = state;
2244 	macp->m_dip = dip;
2245 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2246 	macp->m_callbacks = &ibd_m_callbacks;
2247 	macp->m_min_sdu = 0;
2248 	macp->m_max_sdu = IBD_DEF_MAX_SDU;
2249 
2250 	/*
2251 	 *  Register ourselves with the GLDv3 interface
2252 	 */
2253 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2254 		mac_free(macp);
2255 		DPRINT(10,
2256 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2257 		return (DDI_FAILURE);
2258 	}
2259 
2260 	mac_free(macp);
2261 	return (DDI_SUCCESS);
2262 }
2263 
2264 static int
2265 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2266 {
2267 	ibt_hca_attr_t hca_attrs;
2268 	ibt_status_t ibt_status;
2269 
2270 	/*
2271 	 * Query the HCA and fetch its attributes
2272 	 */
2273 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2274 	ASSERT(ibt_status == IBT_SUCCESS);
2275 
2276 	/*
2277 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2278 	 *    full checksum offload.
2279 	 */
2280 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
2281 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2282 	}
2283 
2284 	/*
2285 	 * 2. Set LSO policy, capability and maximum length
2286 	 */
2287 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
2288 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2289 		state->id_lso_policy = B_TRUE;
2290 	} else {
2291 		state->id_lso_policy = B_FALSE;
2292 	}
2293 	if (hca_attrs.hca_max_lso_size > 0) {
2294 		state->id_lso_capable = B_TRUE;
2295 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2296 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
2297 		else
2298 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
2299 	} else {
2300 		state->id_lso_capable = B_FALSE;
2301 		state->id_lso_maxlen = 0;
2302 	}
2303 
2304 	/*
2305 	 * 3. Set Reserved L_Key capability
2306 	 */
2307 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2308 		state->id_hca_res_lkey_capab = 1;
2309 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2310 	}
2311 
2312 	/*
2313 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2314 	 *    size information is provided by the hca
2315 	 */
2316 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2317 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2318 	} else {
2319 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2320 	}
2321 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2322 		state->id_max_sqseg = IBD_MAX_SQSEG;
2323 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2324 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2325 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2326 	}
2327 
2328 	/*
2329 	 * 5. Set number of recv and send wqes after checking hca maximum
2330 	 *    channel size
2331 	 */
2332 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2333 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2334 	} else {
2335 		state->id_num_rwqe = IBD_NUM_RWQE;
2336 	}
2337 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2338 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2339 	} else {
2340 		state->id_num_swqe = IBD_NUM_SWQE;
2341 	}
2342 
2343 	return (DDI_SUCCESS);
2344 }
2345 
/*
 * Tear down whatever pieces of attach/start have completed, as recorded
 * in the id_mac_state progress bits, in (roughly) reverse order of
 * setup.  Used both for attach failure cleanup and by ibd_detach().
 * Frees the softstate; always returns DDI_SUCCESS.
 */
static int
ibd_unattach(ibd_state_t *state, dev_info_t *dip)
{
	int instance;
	uint32_t progress = state->id_mac_state;	/* setup progress snapshot */
	ibt_status_t ret;

	/* Unregister from GLDv3 first so no new mac entry points arrive */
	if (progress & IBD_DRV_MAC_REGISTERED) {
		(void) mac_unregister(state->id_mh);
		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
	}

	if (progress & IBD_DRV_PD_ALLOCD) {
		if ((ret = ibt_free_pd(state->id_hca_hdl,
		    state->id_pd_hdl)) != IBT_SUCCESS) {
			ibd_print_warn(state, "failed to free "
			    "protection domain, ret=%d", ret);
		}
		state->id_pd_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
	}

	/* Closing the HCA also stops further async event delivery */
	if (progress & IBD_DRV_HCA_OPENED) {
		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
		    IBT_SUCCESS) {
			ibd_print_warn(state, "failed to close "
			    "HCA device, ret=%d", ret);
		}
		state->id_hca_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
	}

	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
			ibd_print_warn(state,
			    "ibt_detach() failed, ret=%d", ret);
		}
		state->id_ibt_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
	}

	/* Remove the tx/rx soft interrupts added in ibd_attach() */
	if (progress & IBD_DRV_TXINTR_ADDED) {
		ddi_remove_softintr(state->id_tx);
		state->id_tx = NULL;
		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
	}

	if (progress & IBD_DRV_RXINTR_ADDED) {
		ddi_remove_softintr(state->id_rx);
		state->id_rx = NULL;
		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
	}

	/* Destroy the locks/cvs/caches set up by ibd_state_init() */
	if (progress & IBD_DRV_STATE_INITIALIZED) {
		ibd_state_fini(state);
		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
	}

	instance = ddi_get_instance(dip);
	ddi_soft_state_free(ibd_list, instance);

	return (DDI_SUCCESS);
}
2409 
2410 /*
2411  * Attach device to the IO framework.
2412  */
2413 static int
2414 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2415 {
2416 	ibd_state_t *state = NULL;
2417 	ib_guid_t hca_guid;
2418 	int instance;
2419 	ibt_status_t ret;
2420 	int rv;
2421 
2422 	/*
2423 	 * IBD doesn't support suspend/resume
2424 	 */
2425 	if (cmd != DDI_ATTACH)
2426 		return (DDI_FAILURE);
2427 
2428 	/*
2429 	 * Allocate softstate structure
2430 	 */
2431 	instance = ddi_get_instance(dip);
2432 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2433 		return (DDI_FAILURE);
2434 	state = ddi_get_soft_state(ibd_list, instance);
2435 
2436 	/*
2437 	 * Initialize mutexes and condition variables
2438 	 */
2439 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2440 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2441 		goto attach_fail;
2442 	}
2443 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2444 
2445 	/*
2446 	 * Allocate rx,tx softintr
2447 	 */
2448 	if (ibd_rx_softintr == 1) {
2449 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2450 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2451 			DPRINT(10, "ibd_attach: failed in "
2452 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2453 			goto attach_fail;
2454 		}
2455 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2456 	}
2457 	if (ibd_tx_softintr == 1) {
2458 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2459 		    NULL, NULL, ibd_tx_recycle,
2460 		    (caddr_t)state)) != DDI_SUCCESS) {
2461 			DPRINT(10, "ibd_attach: failed in "
2462 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2463 			goto attach_fail;
2464 		}
2465 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2466 	}
2467 
2468 	/*
2469 	 * Obtain IBA P_Key, port number and HCA guid and validate
2470 	 * them (for P_Key, only full members are allowed as per
2471 	 * IPoIB specification; neither port number nor HCA guid
2472 	 * can be zero)
2473 	 */
2474 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2475 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2476 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2477 		    state->id_pkey);
2478 		goto attach_fail;
2479 	}
2480 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2481 	    "port-number", 0)) == 0) {
2482 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2483 		    state->id_port);
2484 		goto attach_fail;
2485 	}
2486 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2487 	    "hca-guid", 0)) == 0) {
2488 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2489 		    hca_guid);
2490 		goto attach_fail;
2491 	}
2492 
2493 	/*
2494 	 * Attach to IBTL
2495 	 */
2496 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2497 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2498 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2499 		goto attach_fail;
2500 	}
2501 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2502 
2503 	/*
2504 	 * Open the HCA
2505 	 */
2506 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2507 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2508 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2509 		goto attach_fail;
2510 	}
2511 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2512 
2513 	/*
2514 	 * Record capabilities
2515 	 */
2516 	(void) ibd_record_capab(state, dip);
2517 
2518 	/*
2519 	 * Allocate a protection domain on the HCA
2520 	 */
2521 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2522 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2523 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2524 		goto attach_fail;
2525 	}
2526 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2527 
2528 
2529 	/*
2530 	 * Register ibd interfaces with the Nemo framework
2531 	 */
2532 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2533 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2534 		goto attach_fail;
2535 	}
2536 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2537 
2538 	/*
2539 	 * We're done with everything we could to make the attach
2540 	 * succeed.  All the buffer allocations and IPoIB broadcast
2541 	 * group joins are deferred to when the interface instance
2542 	 * is actually plumbed to avoid wasting memory.
2543 	 */
2544 	return (DDI_SUCCESS);
2545 
2546 attach_fail:
2547 	ibd_unattach(state, dip);
2548 	return (DDI_FAILURE);
2549 }
2550 
2551 /*
2552  * Detach device from the IO framework.
2553  */
2554 static int
2555 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2556 {
2557 	ibd_state_t *state;
2558 	int instance;
2559 
2560 	/*
2561 	 * IBD doesn't support suspend/resume
2562 	 */
2563 	if (cmd != DDI_DETACH)
2564 		return (DDI_FAILURE);
2565 
2566 	/*
2567 	 * Get the instance softstate
2568 	 */
2569 	instance = ddi_get_instance(dip);
2570 	state = ddi_get_soft_state(ibd_list, instance);
2571 
2572 	/*
2573 	 * Release all resources we're holding still.  Note that if we'd
2574 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2575 	 * so far, we should find all the flags we need in id_mac_state.
2576 	 */
2577 	(void) ibd_unattach(state, dip);
2578 
2579 	return (DDI_SUCCESS);
2580 }
2581 
2582 /*
2583  * Pre ibt_attach() driver initialization
2584  */
2585 static int
2586 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2587 {
2588 	char buf[64];
2589 
2590 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2591 	state->id_link_state = LINK_STATE_UNKNOWN;
2592 
2593 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2594 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2595 	state->id_trap_stop = B_TRUE;
2596 	state->id_trap_inprog = 0;
2597 
2598 	mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2599 	state->id_dip = dip;
2600 
2601 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2602 
2603 	state->id_tx_list.dl_head = NULL;
2604 	state->id_tx_list.dl_tail = NULL;
2605 	state->id_tx_list.dl_pending_sends = B_FALSE;
2606 	state->id_tx_list.dl_cnt = 0;
2607 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2608 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2609 	state->id_tx_busy = 0;
2610 
2611 	state->id_rx_list.dl_head = NULL;
2612 	state->id_rx_list.dl_tail = NULL;
2613 	state->id_rx_list.dl_bufs_outstanding = 0;
2614 	state->id_rx_list.dl_cnt = 0;
2615 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2616 	mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
2617 
2618 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2619 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2620 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2621 
2622 	return (DDI_SUCCESS);
2623 }
2624 
2625 /*
2626  * Post ibt_detach() driver deconstruction
2627  */
static void
ibd_state_fini(ibd_state_t *state)
{
	/* Destroy the async request cache created in ibd_state_init() */
	kmem_cache_destroy(state->id_req_kmc);

	/* Rx list locks */
	mutex_destroy(&state->id_rxpost_lock);
	mutex_destroy(&state->id_rx_list.dl_mutex);

	/* Tx list locks */
	mutex_destroy(&state->id_txpost_lock);
	mutex_destroy(&state->id_tx_list.dl_mutex);

	mutex_destroy(&state->id_sched_lock);
	mutex_destroy(&state->id_cq_poll_lock);

	/* Trap and link state synchronization */
	cv_destroy(&state->id_trap_cv);
	mutex_destroy(&state->id_trap_lock);
	mutex_destroy(&state->id_link_mutex);
}
2646 
2647 /*
2648  * Fetch link speed from SA for snmp ifspeed reporting.
2649  */
2650 static uint64_t
2651 ibd_get_portspeed(ibd_state_t *state)
2652 {
2653 	int			ret;
2654 	ibt_path_info_t		path;
2655 	ibt_path_attr_t		path_attr;
2656 	uint8_t			num_paths;
2657 	uint64_t		ifspeed;
2658 
2659 	/*
2660 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2661 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2662 	 * 2000000000. Start with that as default.
2663 	 */
2664 	ifspeed = 2000000000;
2665 
2666 	bzero(&path_attr, sizeof (path_attr));
2667 
2668 	/*
2669 	 * Get the port speed from Loopback path information.
2670 	 */
2671 	path_attr.pa_dgids = &state->id_sgid;
2672 	path_attr.pa_num_dgids = 1;
2673 	path_attr.pa_sgid = state->id_sgid;
2674 
2675 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2676 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2677 		goto earlydone;
2678 
2679 	if (num_paths < 1)
2680 		goto earlydone;
2681 
2682 	/*
2683 	 * In case SA does not return an expected value, report the default
2684 	 * speed as 1X.
2685 	 */
2686 	ret = 1;
2687 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2688 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2689 			ret = 1;
2690 			break;
2691 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2692 			ret = 4;
2693 			break;
2694 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2695 			ret = 12;
2696 			break;
2697 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2698 			ret = 2;
2699 			break;
2700 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2701 			ret = 8;
2702 			break;
2703 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2704 			ret = 16;
2705 			break;
2706 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2707 			ret = 24;
2708 			break;
2709 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2710 			ret = 32;
2711 			break;
2712 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2713 			ret = 48;
2714 			break;
2715 	}
2716 
2717 	ifspeed *= ret;
2718 
2719 earlydone:
2720 	return (ifspeed);
2721 }
2722 
2723 /*
2724  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2725  * representing the input mcg mgid.
2726  */
2727 static ibd_mce_t *
2728 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2729 {
2730 	ibd_mce_t *ptr = list_head(mlist);
2731 
2732 	/*
2733 	 * Do plain linear search.
2734 	 */
2735 	while (ptr != NULL) {
2736 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2737 		    sizeof (ib_gid_t)) == 0)
2738 			return (ptr);
2739 		ptr = list_next(mlist, ptr);
2740 	}
2741 	return (NULL);
2742 }
2743 
2744 /*
2745  * Execute IBA JOIN.
2746  */
2747 static ibt_status_t
2748 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2749 {
2750 	ibt_mcg_attr_t mcg_attr;
2751 
2752 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2753 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2754 	mcg_attr.mc_mgid = mgid;
2755 	mcg_attr.mc_join_state = mce->mc_jstate;
2756 	mcg_attr.mc_scope = state->id_scope;
2757 	mcg_attr.mc_pkey = state->id_pkey;
2758 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2759 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2760 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2761 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2762 	    NULL, NULL));
2763 }
2764 
/*
 * This code JOINs the port in the proper way (depending on the join
 * state) so that IBA fabric will forward mcg packets to/from the port.
 * It also attaches the QPN to the mcg so it can receive those mcg
 * packets. This code makes sure not to attach the mcg to the QP if
 * that has been previously done due to the mcg being joined with a
 * different join state, even though this is not required by SWG_0216,
 * refid 3610.
 *
 * Returns the (possibly pre-existing) mce tracking the membership,
 * or NULL if the IBA join or QP attach failed.
 */
static ibd_mce_t *
ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
	ibt_status_t ibt_status;
	ibd_mce_t *mce, *tmce, *omce = NULL;
	boolean_t do_attach = B_TRUE;

	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
	    jstate, mgid.gid_prefix, mgid.gid_guid);

	/*
	 * For enable_multicast Full member joins, we need to do some
	 * extra work. If there is already an mce on the list that
	 * indicates full membership, that means the membership has
	 * not yet been dropped (since the disable_multicast was issued)
	 * because there are pending Tx's to the mcg; in that case, just
	 * mark the mce not to be reaped when the Tx completion queues
	 * an async reap operation.
	 *
	 * If there is already an mce on the list indicating sendonly
	 * membership, try to promote to full membership. Be careful
	 * not to deallocate the old mce, since there might be an AH
	 * pointing to it; instead, update the old mce with new data
	 * that tracks the full membership.
	 */
	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
			/* Cancel the pending reap; membership lives on */
			ASSERT(omce->mc_fullreap);
			omce->mc_fullreap = B_FALSE;
			return (omce);
		} else {
			/* omce is reused below after the fresh join */
			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
		}
	}

	/*
	 * Allocate the ibd_mce_t to track this JOIN.
	 */
	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
	mce->mc_fullreap = B_FALSE;
	mce->mc_jstate = jstate;

	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
		    ibt_status);
		kmem_free(mce, sizeof (ibd_mce_t));
		return (NULL);
	}

	/*
	 * Is an IBA attach required? Not if the interface is already joined
	 * to the mcg in a different appropriate join state.
	 */
	if (jstate == IB_MC_JSTATE_NON) {
		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
			do_attach = B_FALSE;
	} else if (jstate == IB_MC_JSTATE_FULL) {
		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
			do_attach = B_FALSE;
	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
		/* Sendonly membership never needs a receive-side attach */
		do_attach = B_FALSE;
	}

	if (do_attach) {
		/*
		 * Do the IBA attach.
		 */
		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
		    &mce->mc_info)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_join_group : failed qp attachment "
			    "%d\n", ibt_status);
			/*
			 * NOTE that we should probably preserve the join info
			 * in the list and later try to leave again at detach
			 * time.
			 */
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, jstate);
			kmem_free(mce, sizeof (ibd_mce_t));
			return (NULL);
		}
	}

	/*
	 * Insert the ibd_mce_t in the proper list.
	 */
	if (jstate == IB_MC_JSTATE_NON) {
		IBD_MCACHE_INSERT_NON(state, mce);
	} else {
		/*
		 * Set up the mc_req fields used for reaping the
		 * mcg in case of delayed tx completion (see
		 * ibd_tx_cleanup()). Also done for sendonly join in
		 * case we are promoted to fullmembership later and
		 * keep using the same mce.
		 */
		mce->mc_req.rq_gid = mgid;
		mce->mc_req.rq_ptr = mce;
		/*
		 * Check whether this is the case of trying to join
		 * full member, and we were already joined send only.
		 * We try to drop our SendOnly membership, but it is
		 * possible that the mcg does not exist anymore (and
		 * the subnet trap never reached us), so the leave
		 * operation might fail.
		 */
		if (omce != NULL) {
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
			omce->mc_jstate = IB_MC_JSTATE_FULL;
			/* Keep omce alive (AHs may point at it); free ours */
			bcopy(&mce->mc_info, &omce->mc_info,
			    sizeof (ibt_mcg_info_t));
			kmem_free(mce, sizeof (ibd_mce_t));
			return (omce);
		}
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_INSERT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
	}

	return (mce);
}
2899 
2900 /*
2901  * Called during port up event handling to attempt to reacquire full
2902  * membership to an mcg. Stripped down version of ibd_join_group().
2903  * Note that it is possible that the mcg might have gone away, and
2904  * gets recreated at this point.
2905  */
2906 static void
2907 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2908 {
2909 	ib_gid_t mgid;
2910 
2911 	/*
2912 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2913 	 * reap/leave is going to try to leave the group. We could prevent
2914 	 * that by adding a boolean flag into ibd_mce_t, if required.
2915 	 */
2916 	if (mce->mc_fullreap)
2917 		return;
2918 
2919 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2920 
2921 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2922 	    mgid.gid_guid);
2923 
2924 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2925 		ibd_print_warn(state, "Failure on port up to rejoin "
2926 		    "multicast gid %016llx:%016llx",
2927 		    (u_longlong_t)mgid.gid_prefix,
2928 		    (u_longlong_t)mgid.gid_guid);
2929 }
2930 
/*
 * This code handles delayed Tx completion cleanups for mcg's to which
 * disable_multicast has been issued, regular mcg related cleanups during
 * disable_multicast, disable_promiscous and mcg traps, as well as
 * cleanups during driver detach time. Depending on the join state,
 * it deletes the mce from the appropriate list and issues the IBA
 * leave/detach; except in the disable_multicast case when the mce
 * is left on the active list for a subsequent Tx completion cleanup.
 */
static void
ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
    uint8_t jstate)
{
	ibd_mce_t *tmce;
	boolean_t do_detach = B_TRUE;

	/*
	 * Before detaching, we must check whether the other list
	 * contains the mcg; if we detach blindly, the consumer
	 * who set up the other list will also stop receiving
	 * traffic.
	 */
	if (jstate == IB_MC_JSTATE_FULL) {
		/*
		 * The following check is only relevant while coming
		 * from the Tx completion path in the reap case.
		 */
		if (!mce->mc_fullreap)
			return;
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_PULLOUT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
		/* Keep the QP attached if the non list still wants traffic */
		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
			do_detach = B_FALSE;
	} else if (jstate == IB_MC_JSTATE_NON) {
		IBD_MCACHE_PULLOUT_NON(state, mce);
		/* Keep the QP attached if a full member join is active */
		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
			do_detach = B_FALSE;
	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
		/* Sendonly mces live on the full list but never attach */
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_PULLOUT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
		do_detach = B_FALSE;
	}

	/*
	 * If we are reacting to a mcg trap and leaving our sendonly or
	 * non membership, the mcg is possibly already gone, so attempting
	 * to leave might fail. On the other hand, we must try to leave
	 * anyway, since this might be a trap from long ago, and we could
	 * have potentially sendonly joined to a recent incarnation of
	 * the mcg and are about to loose track of this information.
	 */
	if (do_detach) {
		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
	}

	/* Best-effort leave (see comment above), then release the mce */
	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
	kmem_free(mce, sizeof (ibd_mce_t));
}
2994 
/*
 * Async code executed due to multicast and promiscuous disable requests
 * and mcg trap handling; also executed during driver detach. Mostly, a
 * leave and detach is done; except for the fullmember case when Tx
 * requests are pending, whence arrangements are made for subsequent
 * cleanup on Tx completion.
 */
static void
ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
	ipoib_mac_t mcmac;
	boolean_t recycled;
	ibd_mce_t *mce;

	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
	    jstate, mgid.gid_prefix, mgid.gid_guid);

	if (jstate == IB_MC_JSTATE_NON) {
		/* Non membership has no AH; reap can proceed immediately */
		recycled = B_TRUE;
		mce = IBD_MCACHE_FIND_NON(state, mgid);
		/*
		 * In case we are handling a mcg trap, we might not find
		 * the mcg in the non list.
		 */
		if (mce == NULL) {
			return;
		}
	} else {
		mce = IBD_MCACHE_FIND_FULL(state, mgid);

		/*
		 * In case we are handling a mcg trap, make sure the trap
		 * is not arriving late; if we have an mce that indicates
		 * that we are already a fullmember, that would be a clear
		 * indication that the trap arrived late (ie, is for a
		 * previous incarnation of the mcg).
		 */
		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
			if ((mce == NULL) || (mce->mc_jstate ==
			    IB_MC_JSTATE_FULL)) {
				return;
			}
		} else {
			ASSERT(jstate == IB_MC_JSTATE_FULL);

			/*
			 * If join group failed, mce will be NULL here.
			 * This is because in GLDv3 driver, set multicast
			 *  will always return success.
			 */
			if (mce == NULL) {
				return;
			}

			/* Mark for reaping by the last Tx completion */
			mce->mc_fullreap = B_TRUE;
		}

		/*
		 * If no pending Tx's remain that reference the AH
		 * for the mcg, recycle it from active to free list.
		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
		 * so the last completing Tx will cause an async reap
		 * operation to be invoked, at which time we will drop our
		 * membership to the mcg so that the pending Tx's complete
		 * successfully. Refer to comments on "AH and MCE active
		 * list manipulation" at top of this file. The lock protects
		 * against Tx fast path and Tx cleanup code.
		 */
		mutex_enter(&state->id_ac_mutex);
		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
		    IB_MC_JSTATE_SEND_ONLY_NON));
		mutex_exit(&state->id_ac_mutex);
	}

	/* Reap now only if no Tx completion will do it for us later */
	if (recycled) {
		DPRINT(2, "ibd_leave_group : leave_group reaping : "
		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
		ibd_async_reap_group(state, mce, mgid, jstate);
	}
}
3076 
/*
 * Find the broadcast address as defined by IPoIB; implicitly
 * determines the IBA scope, mtu, tclass etc of the link the
 * interface is going to be a member of.
 *
 * Returns IBT_SUCCESS with state->id_mcinfo, id_mgid, id_scope and
 * id_mtu filled in; IBT_FAILURE if the broadcast group cannot be
 * found (or created, when ibd_create_broadcast_group is set).
 */
static ibt_status_t
ibd_find_bgroup(ibd_state_t *state)
{
	ibt_mcg_attr_t mcg_attr;
	uint_t numg;
	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
	    IB_MC_SCOPE_GLOBAL };
	int i, mcgmtu;
	boolean_t found = B_FALSE;
	int ret;
	ibt_mcg_info_t mcg_info;

	state->id_bgroup_created = B_FALSE;

query_bcast_grp:
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
	mcg_attr.mc_pkey = state->id_pkey;
	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;

	/* Try each scope in turn until a broadcast group is found */
	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
		state->id_scope = mcg_attr.mc_scope = scopes[i];

		/*
		 * Look for the IPoIB broadcast group.
		 */
		state->id_mgid.gid_prefix =
		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
		    ((uint64_t)state->id_scope << 48) |
		    ((uint32_t)(state->id_pkey << 16)));
		mcg_attr.mc_mgid = state->id_mgid;
		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
			found = B_TRUE;
			break;
		}
	}

	if (!found) {
		if (ibd_create_broadcast_group) {
			/*
			 * If we created the broadcast group, but failed to
			 * find it, we can't do anything except leave the
			 * one we created and return failure.
			 */
			if (state->id_bgroup_created) {
				ibd_print_warn(state, "IPoIB broadcast group "
				    "absent. Unable to query after create.");
				goto find_bgroup_fail;
			}

			/*
			 * Create the ipoib broadcast group if it didn't exist
			 */
			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
			mcg_attr.mc_pkey = state->id_pkey;
			mcg_attr.mc_flow = 0;
			mcg_attr.mc_sl = 0;
			mcg_attr.mc_tclass = 0;
			state->id_mgid.gid_prefix =
			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
			    ((uint32_t)(state->id_pkey << 16)));
			mcg_attr.mc_mgid = state->id_mgid;

			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
				ibd_print_warn(state, "IPoIB broadcast group "
				    "absent, create failed: ret = %d\n", ret);
				state->id_bgroup_created = B_FALSE;
				return (IBT_FAILURE);
			}
			state->id_bgroup_created = B_TRUE;
			/* Re-run the query so id_mcinfo gets populated */
			goto query_bcast_grp;
		} else {
			ibd_print_warn(state, "IPoIB broadcast group absent");
			return (IBT_FAILURE);
		}
	}

	/*
	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
	 */
	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
	if (state->id_mtu < mcgmtu) {
		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
		    "greater than port's maximum MTU %d", mcgmtu,
		    state->id_mtu);
		ibt_free_mcg_info(state->id_mcinfo, 1);
		goto find_bgroup_fail;
	}
	state->id_mtu = mcgmtu;

	return (IBT_SUCCESS);

find_bgroup_fail:
	/* Undo our own create so a stale group is not left behind */
	if (state->id_bgroup_created) {
		(void) ibt_leave_mcg(state->id_sgid,
		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
		    IB_MC_JSTATE_FULL);
	}

	return (IBT_FAILURE);
}
3189 
3190 static int
3191 ibd_alloc_tx_copybufs(ibd_state_t *state)
3192 {
3193 	ibt_mr_attr_t mem_attr;
3194 
3195 	/*
3196 	 * Allocate one big chunk for all regular tx copy bufs
3197 	 */
3198 	state->id_tx_buf_sz = state->id_mtu;
3199 	if (state->id_lso_policy && state->id_lso_capable &&
3200 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3201 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3202 	}
3203 
3204 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3205 	    state->id_tx_buf_sz, KM_SLEEP);
3206 
3207 	/*
3208 	 * Do one memory registration on the entire txbuf area
3209 	 */
3210 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3211 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3212 	mem_attr.mr_as = NULL;
3213 	mem_attr.mr_flags = IBT_MR_SLEEP;
3214 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3215 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3216 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3217 		kmem_free(state->id_tx_bufs,
3218 		    state->id_num_swqe * state->id_tx_buf_sz);
3219 		state->id_tx_bufs = NULL;
3220 		return (DDI_FAILURE);
3221 	}
3222 
3223 	return (DDI_SUCCESS);
3224 }
3225 
3226 static int
3227 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3228 {
3229 	ibt_mr_attr_t mem_attr;
3230 	ibd_lsobuf_t *buflist;
3231 	ibd_lsobuf_t *lbufp;
3232 	ibd_lsobuf_t *tail;
3233 	ibd_lsobkt_t *bktp;
3234 	uint8_t *membase;
3235 	uint8_t *memp;
3236 	uint_t memsz;
3237 	int i;
3238 
3239 	/*
3240 	 * Allocate the lso bucket
3241 	 */
3242 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3243 
3244 	/*
3245 	 * Allocate the entire lso memory and register it
3246 	 */
3247 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3248 	membase = kmem_zalloc(memsz, KM_SLEEP);
3249 
3250 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3251 	mem_attr.mr_len = memsz;
3252 	mem_attr.mr_as = NULL;
3253 	mem_attr.mr_flags = IBT_MR_SLEEP;
3254 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3255 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3256 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3257 		kmem_free(membase, memsz);
3258 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3259 		return (DDI_FAILURE);
3260 	}
3261 
3262 	/*
3263 	 * Now allocate the buflist.  Note that the elements in the buflist and
3264 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3265 	 * can always derive the address of a buflist entry from the address of
3266 	 * an lso buffer.
3267 	 */
3268 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3269 	    KM_SLEEP);
3270 
3271 	/*
3272 	 * Set up the lso buf chain
3273 	 */
3274 	memp = membase;
3275 	lbufp = buflist;
3276 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3277 		lbufp->lb_isfree = 1;
3278 		lbufp->lb_buf = memp;
3279 		lbufp->lb_next = lbufp + 1;
3280 
3281 		tail = lbufp;
3282 
3283 		memp += IBD_LSO_BUFSZ;
3284 		lbufp++;
3285 	}
3286 	tail->lb_next = NULL;
3287 
3288 	/*
3289 	 * Set up the LSO buffer information in ibd state
3290 	 */
3291 	bktp->bkt_bufl = buflist;
3292 	bktp->bkt_free_head = buflist;
3293 	bktp->bkt_mem = membase;
3294 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3295 	bktp->bkt_nfree = bktp->bkt_nelem;
3296 
3297 	state->id_lso = bktp;
3298 
3299 	return (DDI_SUCCESS);
3300 }
3301 
/*
 * Statically allocate Tx buffer list(s).
 *
 * Allocates the Tx copybufs (and, when LSO is enabled, the LSO bucket),
 * then allocates id_num_swqe send wqes and links them onto the
 * id_tx_list free list. Returns DDI_SUCCESS or DDI_FAILURE; on failure
 * everything allocated so far is torn down via ibd_fini_txlist().
 */
static int
ibd_init_txlist(ibd_state_t *state)
{
	ibd_swqe_t *swqe;
	ibt_lkey_t lkey;
	int i;

	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/* LSO buffers are optional; fall back to non-LSO on failure */
	if (state->id_lso_policy && state->id_lso_capable) {
		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
			state->id_lso_policy = B_FALSE;
	}

	/*
	 * Allocate and setup the swqe list
	 */
	lkey = state->id_tx_mr_desc.md_lkey;
	for (i = 0; i < state->id_num_swqe; i++) {
		if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
			DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
			ibd_fini_txlist(state);
			return (DDI_FAILURE);
		}

		/* add to list (append at the tail) */
		state->id_tx_list.dl_cnt++;
		if (state->id_tx_list.dl_head == NULL) {
			swqe->swqe_prev = NULL;
			swqe->swqe_next = NULL;
			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
		} else {
			swqe->swqe_prev = state->id_tx_list.dl_tail;
			swqe->swqe_next = NULL;
			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
		}
	}

	return (DDI_SUCCESS);
}
3348 
/*
 * Carve 'req_sz' bytes worth of LSO buffers out of the free list and
 * describe them in the caller-supplied sgl. On success, *nds_p is set
 * to the number of sgl entries filled and 0 is returned; -1 is
 * returned if not enough free LSO buffers are available.
 */
static int
ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
    uint32_t *nds_p)
{
	ibd_lsobkt_t *bktp;
	ibd_lsobuf_t *lbufp;
	ibd_lsobuf_t *nextp;
	ibt_lkey_t lso_lkey;
	uint_t frag_sz;
	uint_t num_needed;
	int i;

	ASSERT(sgl_p != NULL);
	ASSERT(nds_p != NULL);
	ASSERT(req_sz != 0);

	/*
	 * Determine how many bufs we'd need for the size requested
	 */
	num_needed = req_sz / IBD_LSO_BUFSZ;
	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
		num_needed++;

	mutex_enter(&state->id_lso_lock);

	/*
	 * If we don't have enough lso bufs, return failure
	 */
	ASSERT(state->id_lso != NULL);
	bktp = state->id_lso;
	if (bktp->bkt_nfree < num_needed) {
		mutex_exit(&state->id_lso_lock);
		return (-1);
	}

	/*
	 * Pick the first 'num_needed' bufs from the free list
	 */
	lso_lkey = bktp->bkt_mr_desc.md_lkey;
	lbufp = bktp->bkt_free_head;
	for (i = 0; i < num_needed; i++) {
		ASSERT(lbufp->lb_isfree != 0);
		ASSERT(lbufp->lb_buf != NULL);

		nextp = lbufp->lb_next;

		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
		sgl_p[i].ds_key = lso_lkey;
		sgl_p[i].ds_len = IBD_LSO_BUFSZ;

		/* Mark the buffer in-use and unlink it from the chain */
		lbufp->lb_isfree = 0;
		lbufp->lb_next = NULL;

		lbufp = nextp;
	}
	bktp->bkt_free_head = lbufp;

	/*
	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
	 * to adjust the last sgl entry's length. Since we know we need atleast
	 * one, the i-1 use below is ok.
	 */
	if (frag_sz) {
		sgl_p[i-1].ds_len = frag_sz;
	}

	/*
	 * Update nfree count and return
	 */
	bktp->bkt_nfree -= num_needed;

	mutex_exit(&state->id_lso_lock);

	*nds_p = num_needed;

	return (0);
}
3426 
3427 static void
3428 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3429 {
3430 	ibd_lsobkt_t *bktp;
3431 	ibd_lsobuf_t *lbufp;
3432 	uint8_t *lso_mem_end;
3433 	uint_t ndx;
3434 	int i;
3435 
3436 	mutex_enter(&state->id_lso_lock);
3437 
3438 	bktp = state->id_lso;
3439 	ASSERT(bktp != NULL);
3440 
3441 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3442 	for (i = 0; i < nds; i++) {
3443 		uint8_t *va;
3444 
3445 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3446 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3447 
3448 		/*
3449 		 * Figure out the buflist element this sgl buffer corresponds
3450 		 * to and put it back at the head
3451 		 */
3452 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3453 		lbufp = bktp->bkt_bufl + ndx;
3454 
3455 		ASSERT(lbufp->lb_isfree == 0);
3456 		ASSERT(lbufp->lb_buf == va);
3457 
3458 		lbufp->lb_isfree = 1;
3459 		lbufp->lb_next = bktp->bkt_free_head;
3460 		bktp->bkt_free_head = lbufp;
3461 	}
3462 	bktp->bkt_nfree += nds;
3463 
3464 	mutex_exit(&state->id_lso_lock);
3465 }
3466 
3467 static void
3468 ibd_free_tx_copybufs(ibd_state_t *state)
3469 {
3470 	/*
3471 	 * Unregister txbuf mr
3472 	 */
3473 	if (ibt_deregister_mr(state->id_hca_hdl,
3474 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3475 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3476 	}
3477 	state->id_tx_mr_hdl = NULL;
3478 
3479 	/*
3480 	 * Free txbuf memory
3481 	 */
3482 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3483 	state->id_tx_bufs = NULL;
3484 }
3485 
3486 static void
3487 ibd_free_tx_lsobufs(ibd_state_t *state)
3488 {
3489 	ibd_lsobkt_t *bktp;
3490 
3491 	mutex_enter(&state->id_lso_lock);
3492 
3493 	if ((bktp = state->id_lso) == NULL) {
3494 		mutex_exit(&state->id_lso_lock);
3495 		return;
3496 	}
3497 
3498 	/*
3499 	 * First, free the buflist
3500 	 */
3501 	ASSERT(bktp->bkt_bufl != NULL);
3502 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3503 
3504 	/*
3505 	 * Unregister the LSO memory and free it
3506 	 */
3507 	ASSERT(bktp->bkt_mr_hdl != NULL);
3508 	if (ibt_deregister_mr(state->id_hca_hdl,
3509 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3510 		DPRINT(10,
3511 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3512 	}
3513 	ASSERT(bktp->bkt_mem);
3514 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3515 
3516 	/*
3517 	 * Finally free the bucket
3518 	 */
3519 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3520 	state->id_lso = NULL;
3521 
3522 	mutex_exit(&state->id_lso_lock);
3523 }
3524 
3525 /*
3526  * Free the statically allocated Tx buffer list.
3527  */
3528 static void
3529 ibd_fini_txlist(ibd_state_t *state)
3530 {
3531 	ibd_swqe_t *node;
3532 
3533 	/*
3534 	 * Free the allocated swqes
3535 	 */
3536 	mutex_enter(&state->id_tx_list.dl_mutex);
3537 	while (state->id_tx_list.dl_head != NULL) {
3538 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3539 		state->id_tx_list.dl_head = node->swqe_next;
3540 		ASSERT(state->id_tx_list.dl_cnt > 0);
3541 		state->id_tx_list.dl_cnt--;
3542 		ibd_free_swqe(state, node);
3543 	}
3544 	mutex_exit(&state->id_tx_list.dl_mutex);
3545 
3546 	ibd_free_tx_lsobufs(state);
3547 	ibd_free_tx_copybufs(state);
3548 }
3549 
3550 /*
3551  * Allocate a single send wqe and register it so it is almost
3552  * ready to be posted to the hardware.
3553  */
3554 static int
3555 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
3556 {
3557 	ibd_swqe_t *swqe;
3558 
3559 	swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
3560 	*wqe = swqe;
3561 
3562 	swqe->swqe_type = IBD_WQE_SEND;
3563 	swqe->swqe_next = NULL;
3564 	swqe->swqe_prev = NULL;
3565 	swqe->swqe_im_mblk = NULL;
3566 
3567 	swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3568 	    (state->id_tx_bufs + ndx * state->id_tx_buf_sz);
3569 	swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3570 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3571 
3572 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3573 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3574 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3575 
3576 	/* These are set in send */
3577 	swqe->w_swr.wr_nds = 0;
3578 	swqe->w_swr.wr_sgl = NULL;
3579 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3580 
3581 	return (DDI_SUCCESS);
3582 }
3583 
/*
 * Free an allocated send wqe.
 *
 * The copybuf memory behind the swqe belongs to the shared Tx chunk
 * (freed separately by ibd_free_tx_copybufs()), so only the wqe
 * structure itself is released here. The 'state' argument is unused.
 */
/*ARGSUSED*/
static void
ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
{
	kmem_free(swqe, sizeof (ibd_swqe_t));
}
3593 
/*
 * Post a rwqe to the hardware and add it to the Rx list. The
 * "recycle" parameter indicates whether an old rwqe is being
 * recycled, or this is a new one.
 *
 * Only one thread at a time posts to the channel: if another thread
 * holds id_rx_busy, the rwqe is chained onto the id_rx_head soft
 * queue and the busy thread will post it before clearing the flag.
 * Returns DDI_SUCCESS, or DDI_FAILURE if ibt_post_recv() fails.
 */
static int
ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
{
	ibt_status_t ibt_status;

	/* A brand-new rwqe must first be appended to the Rx list */
	if (recycle == B_FALSE) {
		mutex_enter(&state->id_rx_list.dl_mutex);
		if (state->id_rx_list.dl_head == NULL) {
			rwqe->rwqe_prev = NULL;
			rwqe->rwqe_next = NULL;
			state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
		} else {
			rwqe->rwqe_prev = state->id_rx_list.dl_tail;
			rwqe->rwqe_next = NULL;
			state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
		}
		mutex_exit(&state->id_rx_list.dl_mutex);
	}

	mutex_enter(&state->id_rxpost_lock);
	if (state->id_rx_busy) {
		/* Another thread is posting; hand the rwqe off to it */
		rwqe->w_post_link = NULL;
		if (state->id_rx_head)
			*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
		else
			state->id_rx_head = rwqe;
		state->id_rx_tailp = &(rwqe->w_post_link);
	} else {
		/* We become the poster; drain our rwqe plus any handoffs */
		state->id_rx_busy = 1;
		do {
			mutex_exit(&state->id_rxpost_lock);

			/*
			 * Here we should add dl_cnt before post recv, because
			 * we would have to make sure dl_cnt is updated before
			 * the corresponding ibd_process_rx() is called.
			 */
			atomic_add_32(&state->id_rx_list.dl_cnt, 1);

			ibt_status = ibt_post_recv(state->id_chnl_hdl,
			    &rwqe->w_rwr, 1, NULL);
			if (ibt_status != IBT_SUCCESS) {
				/*
				 * NOTE: returns with id_rx_busy still set;
				 * the soft queue (if any) is left pending.
				 */
				(void) atomic_add_32_nv(
				    &state->id_rx_list.dl_cnt, -1);
				ibd_print_warn(state, "ibd_post_recv: "
				    "posting failed, ret=%d", ibt_status);
				return (DDI_FAILURE);
			}

			mutex_enter(&state->id_rxpost_lock);
			rwqe = state->id_rx_head;
			if (rwqe) {
				state->id_rx_head =
				    (ibd_rwqe_t *)(rwqe->w_post_link);
			}
		} while (rwqe);
		state->id_rx_busy = 0;
	}
	mutex_exit(&state->id_rxpost_lock);

	return (DDI_SUCCESS);
}
3663 
3664 /*
3665  * Allocate the statically allocated Rx buffer list.
3666  */
3667 static int
3668 ibd_init_rxlist(ibd_state_t *state)
3669 {
3670 	ibd_rwqe_t *rwqe;
3671 	int i;
3672 
3673 	for (i = 0; i < state->id_num_rwqe; i++) {
3674 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3675 			ibd_fini_rxlist(state);
3676 			return (DDI_FAILURE);
3677 		}
3678 
3679 		if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
3680 			ibd_free_rwqe(state, rwqe);
3681 			ibd_fini_rxlist(state);
3682 			return (DDI_FAILURE);
3683 		}
3684 	}
3685 
3686 	return (DDI_SUCCESS);
3687 }
3688 
3689 /*
3690  * Free the statically allocated Rx buffer list.
3691  *
3692  */
3693 static void
3694 ibd_fini_rxlist(ibd_state_t *state)
3695 {
3696 	ibd_rwqe_t *node;
3697 
3698 	mutex_enter(&state->id_rx_list.dl_mutex);
3699 	while (state->id_rx_list.dl_head != NULL) {
3700 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3701 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3702 		ASSERT(state->id_rx_list.dl_cnt > 0);
3703 		state->id_rx_list.dl_cnt--;
3704 
3705 		ibd_free_rwqe(state, node);
3706 	}
3707 	mutex_exit(&state->id_rx_list.dl_mutex);
3708 }
3709 
/*
 * Allocate a single recv wqe and register it so it is almost
 * ready to be posted to the hardware.
 *
 * Allocates the rwqe structure, a copy buffer sized for an
 * MTU-plus-GRH packet, an esballoc'd mblk wrapping that buffer (so
 * received data can be passed up without a copy), and a memory
 * registration for the buffer. Uses KM_NOSLEEP/IBT_MR_NOSLEEP
 * throughout and unwinds fully on any failure, returning
 * DDI_FAILURE; DDI_SUCCESS otherwise with *wqe set.
 */
static int
ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
{
	ibt_mr_attr_t mem_attr;
	ibd_rwqe_t *rwqe;

	if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
		return (DDI_FAILURE);
	}
	*wqe = rwqe;
	rwqe->rwqe_type = IBD_WQE_RECV;
	rwqe->w_state = state;
	rwqe->rwqe_next = NULL;
	rwqe->rwqe_prev = NULL;
	/* w_freeing_wqe tells ibd_freemsg_cb() not to recycle this rwqe */
	rwqe->w_freeing_wqe = B_FALSE;
	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;

	/* Copy buffer holds the GRH followed by up to one MTU of payload */
	rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
	    IPOIB_GRH_SIZE, KM_NOSLEEP);
	if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
		kmem_free(rwqe, sizeof (ibd_rwqe_t));
		return (DDI_FAILURE);
	}

	/* Wrap the buffer in an mblk; freemsg() will call ibd_freemsg_cb */
	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
	    NULL) {
		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
		    state->id_mtu + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
		kmem_free(rwqe, sizeof (ibd_rwqe_t));
		return (DDI_FAILURE);
	}

	/* Register the buffer so the HCA can DMA received data into it */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
	    IBT_SUCCESS) {
		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
		/* Prevent the freemsg callback from trying to recycle */
		rwqe->w_freeing_wqe = B_TRUE;
		freemsg(rwqe->rwqe_im_mblk);
		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
		    state->id_mtu + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
		kmem_free(rwqe, sizeof (ibd_rwqe_t));
		return (DDI_FAILURE);
	}

	/* Fill in the single-entry sgl and the recv work request */
	rwqe->rwqe_copybuf.ic_sgl.ds_va =
	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
	rwqe->rwqe_copybuf.ic_sgl.ds_key =
	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
	rwqe->w_rwr.wr_nds = 1;
	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;

	return (DDI_SUCCESS);
}
3780 
/*
 * Free an allocated recv wqe.
 */
static void
ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	/*
	 * Deregister the buffer first; if this fails we bail out and
	 * (deliberately) leak the rwqe rather than free memory that
	 * may still be registered with the HCA.
	 */
	if (ibt_deregister_mr(state->id_hca_hdl,
	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
		DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
		return;
	}

	/*
	 * Indicate to the callback function that this rwqe/mblk
	 * should not be recycled. The freemsg() will invoke
	 * ibd_freemsg_cb().
	 */
	if (rwqe->rwqe_im_mblk != NULL) {
		rwqe->w_freeing_wqe = B_TRUE;
		freemsg(rwqe->rwqe_im_mblk);
	}
	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE);
	rwqe->rwqe_copybuf.ic_bufaddr = NULL;
	kmem_free(rwqe, sizeof (ibd_rwqe_t));
}
3807 
3808 /*
3809  * Delete the rwqe being freed from the rx list.
3810  */
3811 static void
3812 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3813 {
3814 	mutex_enter(&state->id_rx_list.dl_mutex);
3815 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3816 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3817 	else
3818 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3819 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3820 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3821 	else
3822 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3823 	mutex_exit(&state->id_rx_list.dl_mutex);
3824 }
3825 
3826 /*
3827  * IBA Rx/Tx completion queue handler. Guaranteed to be single
3828  * threaded and nonreentrant for this CQ. When using combined CQ,
3829  * this handles Tx and Rx completions. With separate CQs, this handles
3830  * only Rx completions.
3831  */
3832 /* ARGSUSED */
3833 static void
3834 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3835 {
3836 	ibd_state_t *state = (ibd_state_t *)arg;
3837 
3838 	atomic_add_64(&state->id_num_intrs, 1);
3839 
3840 	if (ibd_rx_softintr == 1)
3841 		ddi_trigger_softintr(state->id_rx);
3842 	else
3843 		(void) ibd_intr((char *)state);
3844 }
3845 
3846 /*
3847  * Separate CQ handler for Tx completions, when the Tx CQ is in
3848  * interrupt driven mode.
3849  */
3850 /* ARGSUSED */
3851 static void
3852 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3853 {
3854 	ibd_state_t *state = (ibd_state_t *)arg;
3855 
3856 	atomic_add_64(&state->id_num_intrs, 1);
3857 
3858 	if (ibd_tx_softintr == 1)
3859 		ddi_trigger_softintr(state->id_tx);
3860 	else
3861 		(void) ibd_tx_recycle((char *)state);
3862 }
3863 
/*
 * Multicast group create/delete trap handler. These will be delivered
 * on a kernel thread (handling can thus block) and can be invoked
 * concurrently. The handler can be invoked anytime after it is
 * registered and before ibt_detach().
 */
/* ARGSUSED */
static void
ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
    ibt_subnet_event_t *event)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ibd_req_t *req;

	/*
	 * The trap handler will get invoked once for every event for
	 * every port. The input "gid" is the GID0 of the port the
	 * trap came in on; we just need to act on traps that came
	 * to our port, meaning the port on which the ipoib interface
	 * resides. Since ipoib uses GID0 of the port, we just match
	 * the gids to check whether we need to handle the trap.
	 */
	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
		return;

	DPRINT(10, "ibd_notices_handler : %d\n", code);

	switch (code) {
		case IBT_SM_EVENT_UNAVAILABLE:
			/*
			 * If we are in promiscuous mode or have
			 * sendnonmembers, we need to print a warning
			 * message right now. Else, just store the
			 * information, print when we enter promiscuous
			 * mode or attempt nonmember send. We might
			 * also want to stop caching sendnonmember.
			 */
			ibd_print_warn(state, "IBA multicast support "
			    "degraded due to unavailability of multicast "
			    "traps");
			break;
		case IBT_SM_EVENT_AVAILABLE:
			/*
			 * If we printed a warning message above or
			 * while trying to nonmember send or get into
			 * promiscuous mode, print an okay message.
			 */
			ibd_print_warn(state, "IBA multicast support "
			    "restored due to availability of multicast "
			    "traps");
			break;
		case IBT_SM_EVENT_MCG_CREATED:
		case IBT_SM_EVENT_MCG_DELETED:
			/*
			 * Common processing of creation/deletion traps.
			 * First check if the instance is being
			 * [de]initialized; back off then, without doing
			 * anything more, since we are not sure if the
			 * async thread is around, or whether we might
			 * be racing with the detach code in ibd_m_stop()
			 * that scans the mcg list.
			 */
			if (!ibd_async_safe(state))
				return;

			/*
			 * Hand the rest of the (blocking) work to the
			 * async thread; the event code rides along in
			 * rq_ptr for ibd_async_trap() to recover.
			 */
			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
			req->rq_gid = event->sm_notice_gid;
			req->rq_ptr = (void *)code;
			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
			break;
	}
}
3936 
/*
 * Async-thread side of MCG created/deleted trap handling, queued by
 * ibd_snet_notices_handler(). Drops any cached sendonly-nonmember
 * (and, once promiscuous setup has completed, nonmember) membership
 * for the trap's mgid, and rejoins as nonmember when in promiscuous
 * mode.
 */
static void
ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
{
	ib_gid_t mgid = req->rq_gid;
	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;

	DPRINT(10, "ibd_async_trap : %d\n", code);

	/*
	 * Atomically search the nonmember and sendonlymember lists and
	 * delete.
	 */
	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);

	if (state->id_prom_op == IBD_OP_COMPLETED) {
		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);

		/*
		 * If in promiscuous mode, try to join/attach to the new
		 * mcg. Given the unreliable out-of-order mode of trap
		 * delivery, we can never be sure whether it is a problem
		 * if the join fails. Thus, we warn the admin of a failure
		 * if this was a creation trap. Note that the trap might
		 * actually be reporting a long past event, and the mcg
		 * might already have been deleted, thus we might be warning
		 * in vain.
		 */
		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
			ibd_print_warn(state, "IBA promiscuous mode missed "
			    "new multicast gid %016llx:%016llx",
			    (u_longlong_t)mgid.gid_prefix,
			    (u_longlong_t)mgid.gid_guid);
	}

	/*
	 * Free the request slot allocated by the subnet event thread.
	 */
	ibd_async_done(state);
}
3977 
3978 /*
3979  * GLDv3 entry point to get capabilities.
3980  */
3981 static boolean_t
3982 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3983 {
3984 	ibd_state_t *state = arg;
3985 
3986 	switch (cap) {
3987 	case MAC_CAPAB_HCKSUM: {
3988 		uint32_t *txflags = cap_data;
3989 
3990 		/*
3991 		 * We either do full checksum or not do it at all
3992 		 */
3993 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
3994 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
3995 		else
3996 			return (B_FALSE);
3997 		break;
3998 	}
3999 
4000 	case MAC_CAPAB_LSO: {
4001 		mac_capab_lso_t *cap_lso = cap_data;
4002 
4003 		/*
4004 		 * In addition to the capability and policy, since LSO
4005 		 * relies on hw checksum, we'll not enable LSO if we
4006 		 * don't have hw checksum.  Of course, if the HCA doesn't
4007 		 * provide the reserved lkey capability, enabling LSO will
4008 		 * actually affect performance adversely, so we'll disable
4009 		 * LSO even for that case.
4010 		 */
4011 		if (!state->id_lso_policy || !state->id_lso_capable)
4012 			return (B_FALSE);
4013 
4014 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4015 			return (B_FALSE);
4016 
4017 		if (state->id_hca_res_lkey_capab == 0) {
4018 			ibd_print_warn(state, "no reserved-lkey capability, "
4019 			    "disabling LSO");
4020 			return (B_FALSE);
4021 		}
4022 
4023 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4024 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4025 		break;
4026 	}
4027 
4028 	default:
4029 		return (B_FALSE);
4030 	}
4031 
4032 	return (B_TRUE);
4033 }
4034 
/*
 * Query the HCA for details of the port this instance is bound to:
 * link state, pkey index, MTU, source GID and link speed. Fails if
 * the port query fails, the link is not active, or the configured
 * pkey cannot be resolved to a pkey index. id_link_mutex is held
 * across the checks so the recorded details stay consistent with the
 * recorded link state.
 */
static int
ibd_get_port_details(ibd_state_t *state)
{
	ibt_hca_portinfo_t *port_infop;
	ibt_status_t ret;
	uint_t psize, port_infosz;

	mutex_enter(&state->id_link_mutex);

	/*
	 * Query for port information
	 */
	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &port_infop, &psize, &port_infosz);
	if ((ret != IBT_SUCCESS) || (psize != 1)) {
		mutex_exit(&state->id_link_mutex);
		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
		    "failed, ret=%d", ret);
		return (DDI_FAILURE);
	}

	/*
	 * If the link already went down by the time we get here,
	 * give up
	 */
	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
		mutex_exit(&state->id_link_mutex);
		ibt_free_portinfo(port_infop, port_infosz);
		DPRINT(10, "ibd_get_port_details: port is not active");
		return (DDI_FAILURE);
	}

	/*
	 * If the link is active, verify the pkey
	 */
	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
		mutex_exit(&state->id_link_mutex);
		ibt_free_portinfo(port_infop, port_infosz);
		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
		    "failed, ret=%d", ret);
		return (DDI_FAILURE);
	}

	/* p_mtu is an IB MTU enum; 128 << enum gives the size in bytes */
	state->id_mtu = (128 << port_infop->p_mtu);
	state->id_sgid = *port_infop->p_sgid_tbl;
	state->id_link_state = LINK_STATE_UP;

	mutex_exit(&state->id_link_mutex);
	ibt_free_portinfo(port_infop, port_infosz);

	/*
	 * Now that the port is active, record the port speed
	 */
	state->id_link_speed = ibd_get_portspeed(state);

	return (DDI_SUCCESS);
}
4093 
/*
 * Allocate the Rx and Tx completion queues (or a single combined CQ
 * when ibd_separate_cqs is not set), along with the work-completion
 * arrays used when polling them. If the HCA cannot support CQs as
 * large as requested, id_num_rwqe/id_num_swqe are scaled down to fit
 * and a warning is printed.
 */
static int
ibd_alloc_cqs(ibd_state_t *state)
{
	ibt_hca_attr_t hca_attrs;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint32_t real_size;

	/* Querying an attached HCA is not expected to fail */
	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
	ASSERT(ret == IBT_SUCCESS);

	/*
	 * Allocate Rx/combined CQ:
	 * Theoretically, there is no point in having more than #rwqe
	 * plus #swqe cqe's, except that the CQ will be signalled for
	 * overflow when the last wqe completes, if none of the previous
	 * cqe's have been polled. Thus, we allocate just a few less wqe's
	 * to make sure such overflow does not occur.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;

	if (ibd_separate_cqs == 1) {
		/*
		 * Allocate Receive CQ.
		 */
		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
			cq_attr.cq_size = state->id_num_rwqe + 1;
		} else {
			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
			state->id_num_rwqe = cq_attr.cq_size - 1;
		}

		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
			    "failed, ret=%d\n", ret);
			return (DDI_FAILURE);
		}

		/* Interrupt moderation failure is non-fatal */
		if ((ret = ibt_modify_cq(state->id_rcq_hdl,
		    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
			    "moderation failed, ret=%d\n", ret);
		}

		state->id_rxwcs_size = state->id_num_rwqe + 1;
		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
		    state->id_rxwcs_size, KM_SLEEP);

		/*
		 * Allocate Send CQ.
		 */
		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
			cq_attr.cq_size = state->id_num_swqe + 1;
		} else {
			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
			state->id_num_swqe = cq_attr.cq_size - 1;
		}

		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
		    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
			    "failed, ret=%d\n", ret);
			kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
			    state->id_rxwcs_size);
			(void) ibt_free_cq(state->id_rcq_hdl);
			return (DDI_FAILURE);
		}
		if ((ret = ibt_modify_cq(state->id_scq_hdl,
		    IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
			    "moderation failed, ret=%d\n", ret);
		}

		state->id_txwcs_size = state->id_num_swqe + 1;
		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
		    state->id_txwcs_size, KM_SLEEP);
	} else {
		/*
		 * Allocate combined Send/Receive CQ.
		 */
		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
		    state->id_num_swqe + 1)) {
			cq_attr.cq_size = state->id_num_rwqe +
			    state->id_num_swqe + 1;
		} else {
			/*
			 * Scale both rwqe and swqe counts down
			 * proportionally to fit the HCA's limit.
			 */
			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
			    state->id_num_rwqe) / (state->id_num_rwqe +
			    state->id_num_swqe);
			state->id_num_swqe = cq_attr.cq_size - 1 -
			    state->id_num_rwqe;
		}

		state->id_rxwcs_size = cq_attr.cq_size;
		state->id_txwcs_size = state->id_rxwcs_size;

		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
			    "failed, ret=%d\n", ret);
			return (DDI_FAILURE);
		}
		/* Tx shares the Rx CQ and wc array in combined mode */
		state->id_scq_hdl = state->id_rcq_hdl;
		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
		    state->id_rxwcs_size, KM_SLEEP);
		state->id_txwcs = state->id_rxwcs;
	}

	/*
	 * Print message in case we could not allocate as many wqe's
	 * as was requested.
	 */
	if (state->id_num_rwqe != IBD_NUM_RWQE) {
		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
	}
	if (state->id_num_swqe != IBD_NUM_SWQE) {
		ibd_print_warn(state, "Setting #swqe = %d instead of default "
		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
	}

	return (DDI_SUCCESS);
}
4219 
/*
 * Allocate the UD channel (QP) used for all send/receive traffic,
 * sized from the configured swqe/rwqe counts and bound to the Tx/Rx
 * CQs allocated earlier; then query it to record the assigned QPN,
 * which forms part of this interface's IPoIB hardware address.
 */
static int
ibd_setup_ud_channel(ibd_state_t *state)
{
	ibt_ud_chan_alloc_args_t ud_alloc_attr;
	ibt_ud_chan_query_attr_t ud_chan_attr;
	ibt_status_t ret;

	/* Opt into reserved-lkey and LSO only when capable/allowed */
	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
	if (state->id_hca_res_lkey_capab)
		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
	if (state->id_lso_policy && state->id_lso_capable)
		ud_alloc_attr.ud_flags |= IBT_USES_LSO;

	ud_alloc_attr.ud_hca_port_num	= state->id_port;
	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
	ud_alloc_attr.ud_clone_chan	= NULL;

	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
		    "failed, ret=%d\n", ret);
		return (DDI_FAILURE);
	}

	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
	    &ud_chan_attr)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
		    "failed, ret=%d\n", ret);
		(void) ibt_free_channel(state->id_chnl_hdl);
		return (DDI_FAILURE);
	}

	state->id_qpnum = ud_chan_attr.ud_qpn;

	return (DDI_SUCCESS);
}
4264 
/*
 * Unwind whatever ibd_m_start() managed to set up, using the
 * IBD_DRV_* progress bits in id_mac_state to decide which steps need
 * undoing; each undone step clears its bit. Called both on the
 * ibd_m_start() failure path and from ibd_m_stop(). Returns
 * DDI_FAILURE only when receive buffers could not be reclaimed from
 * the network layer within the timeout.
 */
static int
ibd_undo_m_start(ibd_state_t *state)
{
	uint32_t progress = state->id_mac_state;
	uint_t attempts;
	ibt_status_t ret;
	ib_gid_t mgid;
	ibd_mce_t *mce;
	uint8_t jstate;

	/*
	 * Before we try to stop/undo whatever we did in ibd_m_start(),
	 * we need to mark the link state as unknown to prevent nw
	 * layer from using this instance for any new transfers.
	 */
	if (progress & IBD_DRV_PORT_DETAILS_OBTAINED) {
		state->id_link_state = LINK_STATE_UNKNOWN;
		mac_link_update(state->id_mh, state->id_link_state);

		state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
	}

	if (progress & IBD_DRV_STARTED) {
		state->id_mac_state &= (~IBD_DRV_STARTED);
	}

	/*
	 * First, stop receive interrupts; this stops the driver from
	 * handing up buffers to higher layers.  Wait for receive buffers
	 * to be returned and give up after 5 seconds.
	 */
	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {

		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);

		/* 50 iterations x 100ms delay = 5 second limit */
		attempts = 50;
		while (state->id_rx_list.dl_bufs_outstanding > 0) {
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are pending bufs with the network
				 * layer and we have no choice but to wait
				 * for them to be done with. Reap all the
				 * Tx/Rx completions that were posted since
				 * we turned off the notification and
				 * return failure.
				 */
				DPRINT(2, "ibd_undo_m_start: "
				    "reclaiming failed");
				ibd_poll_compq(state, state->id_rcq_hdl);
				ibt_set_cq_handler(state->id_rcq_hdl,
				    ibd_rcq_handler, state);
				return (DDI_FAILURE);
			}
		}
		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
	}

	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
		/* Deregister, then wait for any in-flight trap handling */
		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);

		mutex_enter(&state->id_trap_lock);
		state->id_trap_stop = B_TRUE;
		while (state->id_trap_inprog > 0)
			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
		mutex_exit(&state->id_trap_lock);

		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
	}

	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
		/*
		 * Flushing the channel ensures that all pending WQE's
		 * are marked with flush_error and handed to the CQ. It
		 * does not guarantee the invocation of the CQ handler.
		 * This call is guaranteed to return successfully for
		 * UD QPNs.
		 */
		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "undo_m_start: flush_channel "
			    "failed, ret=%d", ret);
		}

		/*
		 * Turn off Tx interrupts and poll. By the time the polling
		 * returns an empty indicator, we are sure we have seen all
		 * pending Tx callbacks. Note that after the call to
		 * ibt_set_cq_handler() returns, the old handler is
		 * guaranteed not to be invoked anymore.
		 */
		if (ibd_separate_cqs == 1) {
			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
		}
		ibd_poll_compq(state, state->id_scq_hdl);

		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
	}

	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
		/*
		 * No new async requests will be posted since the device
		 * link state has been marked as unknown; completion handlers
		 * have been turned off, so Tx handler will not cause any
		 * more IBD_ASYNC_REAP requests.
		 *
		 * Queue a request for the async thread to exit, which will
		 * be serviced after any pending ones. This can take a while,
		 * specially if the SM is unreachable, since IBMF will slowly
		 * timeout each SM request issued by the async thread.  Reap
		 * the thread before continuing on, we do not want it to be
		 * lingering in modunloaded code (or we could move the reap
		 * to ibd_detach(), provided we keep track of the current
		 * id_async_thrid somewhere safe).
		 */
		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
		thread_join(state->id_async_thrid);

		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
	}

	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
		/*
		 * Drop all residual full/non membership. This includes full
		 * membership to the broadcast group, and any nonmembership
		 * acquired during transmits. We do this after the Tx completion
		 * handlers are done, since those might result in some late
		 * leaves; this also eliminates a potential race with that
		 * path wrt the mc full list insert/delete. Trap handling
		 * has also been suppressed at this point. Thus, no locks
		 * are required while traversing the mc full list.
		 */
		DPRINT(2, "ibd_undo_m_start: clear full cache entries");
		mce = list_head(&state->id_mc_full);
		while (mce != NULL) {
			/* Save what we need before ibd_leave_group frees mce */
			mgid = mce->mc_info.mc_adds_vect.av_dgid;
			jstate = mce->mc_jstate;
			mce = list_next(&state->id_mc_full, mce);
			ibd_leave_group(state, mgid, jstate);
		}
		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
	}

	if (progress & IBD_DRV_RXLIST_ALLOCD) {
		ibd_fini_rxlist(state);
		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
	}

	if (progress & IBD_DRV_TXLIST_ALLOCD) {
		ibd_fini_txlist(state);
		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
	}

	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "undo_m_start: free_channel "
			    "failed, ret=%d", ret);
		}

		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
	}

	if (progress & IBD_DRV_CQS_ALLOCD) {
		/* With combined CQs, scq/txwcs alias rcq/rxwcs */
		if (ibd_separate_cqs == 1) {
			kmem_free(state->id_txwcs,
			    sizeof (ibt_wc_t) * state->id_txwcs_size);
			if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
			    IBT_SUCCESS) {
				DPRINT(10, "undo_m_start: free_cq(scq) "
				    "failed, ret=%d", ret);
			}
		}

		kmem_free(state->id_rxwcs,
		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
			DPRINT(10, "undo_m_start: free_cq(rcq) failed, "
			    "ret=%d", ret);
		}

		state->id_txwcs = NULL;
		state->id_rxwcs = NULL;
		state->id_scq_hdl = NULL;
		state->id_rcq_hdl = NULL;

		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
	}

	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
		mod_hash_destroy_hash(state->id_ah_active_hash);
		ibd_acache_fini(state);

		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
	}

	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
		/*
		 * If we'd created the ipoib broadcast group and had
		 * successfully joined it, leave it now
		 */
		if (state->id_bgroup_created) {
			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
			jstate = IB_MC_JSTATE_FULL;
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, jstate);
		}
		ibt_free_mcg_info(state->id_mcinfo, 1);

		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
	}

	return (DDI_SUCCESS);
}
4479 
/*
 * GLDv3 entry point to start hardware.
 *
 * Runs the bringup sequence: obtain port details, find/join the IPoIB
 * broadcast group, initialize caches, allocate CQs, set up the UD
 * channel and the Tx/Rx buffer lists, start the async thread, enable
 * completion and trap handlers, and finally report link state to
 * GLDv3. Each completed step sets an IBD_DRV_* bit in id_mac_state so
 * ibd_undo_m_start() can unwind a partial bringup on failure.
 */
/*ARGSUSED*/
static int
ibd_m_start(void *arg)
{
	ibd_state_t *state = arg;
	kthread_t *kht;
	int err;
	ibt_status_t ret;

	/* Starting an already-started instance is a no-op */
	if (state->id_mac_state & IBD_DRV_STARTED)
		return (DDI_SUCCESS);

	/*
	 * Get port details; if we fail here, very likely the port
	 * state is inactive or the pkey can't be verified
	 */
	if (ibd_get_port_details(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibd_get_port_details() failed");
		return (EAGAIN);
	}
	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;

	/*
	 * Find the IPoIB broadcast group
	 */
	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibd_find_bgroup() failed");
		err = ENOENT;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;

	/*
	 * Initialize per-interface caches and lists; if we fail here,
	 * it is most likely due to a lack of resources
	 */
	if (ibd_acache_init(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibd_acache_init() failed");
		err = ENOMEM;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;

	/*
	 * Allocate send and receive completion queues
	 */
	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibd_alloc_cqs() failed");
		err = ENOMEM;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;

	/*
	 * Setup a UD channel
	 */
	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
		err = ENOMEM;
		DPRINT(10, "ibd_m_start: ibd_setup_ud_channel() failed");
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;

	/*
	 * Allocate and initialize the tx buffer list
	 */
	if (ibd_init_txlist(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibd_init_txlist() failed");
		err = ENOMEM;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;

	/*
	 * If we have separate cqs, create the send cq handler here
	 */
	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
		if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
		    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(scq) "
			    "failed, ret=%d", ret);
			err = EINVAL;
			goto m_start_fail;
		}
		state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
	}

	/*
	 * Allocate and initialize the rx buffer list
	 */
	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibd_init_rxlist() failed");
		err = ENOMEM;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;

	/*
	 * Join IPoIB broadcast group
	 */
	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
		DPRINT(10, "ibd_m_start: ibd_join_group() failed");
		err = EINVAL;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;

	/*
	 * Create the async thread; thread_create never fails.
	 */
	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
	    TS_RUN, minclsyspri);
	state->id_async_thrid = kht->t_did;
	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;

	/*
	 * When we did mac_register() in ibd_attach(), we didn't register
	 * the real macaddr and we didn't have the true port mtu. Now that
	 * we're almost ready, set the local mac address and broadcast
	 * addresses and update gldv3 about the real values of these
	 * parameters.
	 */
	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);

	mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);

	/*
	 * Setup the receive cq handler
	 */
	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) "
		    "failed, ret=%d", ret);
		err = EINVAL;
		goto m_start_fail;
	}
	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;

	/*
	 * Setup the subnet notices handler after we've initialized the acache/
	 * mcache and started the async thread, both of which are required for
	 * the trap handler to function properly.
	 *
	 * Now that the async thread has been started (and we've already done
	 * a mac_register() during attach so mac_tx_update() can be called
	 * if necessary without any problem), we can enable the trap handler
	 * to queue requests to the async thread.
	 */
	ibt_register_subnet_notices(state->id_ibt_hdl,
	    ibd_snet_notices_handler, state);
	mutex_enter(&state->id_trap_lock);
	state->id_trap_stop = B_FALSE;
	mutex_exit(&state->id_trap_lock);
	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;

	/*
	 * Indicate link status to GLDv3 and higher layers. By default,
	 * we assume we are in up state (which must have been true at
	 * least at the time the broadcast mcg's were probed); if there
	 * were any up/down transitions till the time we come here, the
	 * async handler will have updated last known state, which we
	 * use to tell GLDv3. The async handler will not send any
	 * notifications to GLDv3 till we reach here in the initialization
	 * sequence.
	 */
	state->id_mac_state |= IBD_DRV_STARTED;
	mac_link_update(state->id_mh, state->id_link_state);

	return (DDI_SUCCESS);

m_start_fail:
	/*
	 * If we ran into a problem during ibd_m_start() and ran into
	 * some other problem during undoing our partial work, we can't
	 * do anything about it.  Ignore any errors we might get from
	 * ibd_undo_m_start() and just return the original error we got.
	 */
	(void) ibd_undo_m_start(state);
	return (err);
}
4669 
4670 /*
4671  * GLDv3 entry point to stop hardware from receiving packets.
4672  */
4673 /*ARGSUSED*/
4674 static void
4675 ibd_m_stop(void *arg)
4676 {
4677 	ibd_state_t *state = arg;
4678 
4679 	/*
4680 	 * Since ibd_m_stop() doesn't expect any return, we cannot
4681 	 * fail even if we run into some problem with ibd_undo_m_start().
4682 	 * The best we can do is to leave it in a good state, so
4683 	 * perhaps a future unplumb will succeed.
4684 	 */
4685 	(void) ibd_undo_m_start(state);
4686 }
4687 
4688 /*
4689  * GLDv3 entry point to modify device's mac address. We do not
4690  * allow address modifications.
4691  */
4692 static int
4693 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4694 {
4695 	ibd_state_t *state = arg;
4696 
4697 	/*
4698 	 * Don't bother even comparing the macaddr if we haven't
4699 	 * completed ibd_m_start().
4700 	 */
4701 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4702 		return (0);
4703 
4704 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4705 		return (0);
4706 	else
4707 		return (EINVAL);
4708 }
4709 
4710 /*
4711  * The blocking part of the IBA join/leave operations are done out
4712  * of here on the async thread.
4713  */
4714 static void
4715 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4716 {
4717 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4718 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4719 
4720 	if (op == IBD_ASYNC_JOIN) {
4721 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4722 			ibd_print_warn(state, "Joint multicast group failed :"
4723 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4724 		}
4725 	} else {
4726 		/*
4727 		 * Here, we must search for the proper mcg_info and
4728 		 * use that to leave the group.
4729 		 */
4730 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4731 	}
4732 }
4733 
/*
 * GLDv3 entry point for multicast enable/disable requests.
 *
 * Validates the address as an IBA multicast gid, then queues the
 * actual (blocking) join/leave to the async thread and returns
 * immediately.
 */
static int
ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ipoib_mac_t maddr, *mcast;
	ib_gid_t mgid;
	ibd_req_t *req;

	/*
	 * If we haven't completed ibd_m_start(), async thread wouldn't
	 * have been started and id_bcaddr wouldn't be set, so there's
	 * no point in continuing.
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	/*
	 * The incoming multicast address might not be aligned properly
	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
	 * it to look like one though, to get the offsets of the mc gid,
	 * since we know we are not going to dereference any values with
	 * the ipoib_mac_t pointer.
	 */
	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
	mcast = &maddr;

	/*
	 * Check validity of MCG address. We could additionally check
	 * that a enable/disable is not being issued on the "broadcast"
	 * mcg, but since this operation is only invokable by privileged
	 * programs anyway, we allow the flexibility to those dlpi apps.
	 * Note that we do not validate the "scope" of the IBA mcg.
	 */
	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
		return (EINVAL);

	/*
	 * fill in multicast pkey and scope
	 */
	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);

	/*
	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
	 * nothing (i.e. we stay JOINed to the broadcast group done in
	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
	 * requires to be joined to broadcast groups at all times.
	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
	 * depends on this.
	 */
	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		return (0);

	/* Convert to host-order gid and hand the work to the async thread */
	ibd_n2h_gid(mcast, &mgid);
	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL)
		return (ENOMEM);

	req->rq_gid = mgid;

	if (add) {
		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
		    mgid.gid_prefix, mgid.gid_guid);
		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
	} else {
		DPRINT(1, "ibd_m_multicst : unset_multicast : "
		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
	}
	return (0);
}
4809 
4810 /*
4811  * The blocking part of the IBA promiscuous operations are done
4812  * out of here on the async thread. The dlpireq parameter indicates
4813  * whether this invocation is due to a dlpi request or due to
4814  * a port up/down event.
4815  */
4816 static void
4817 ibd_async_unsetprom(ibd_state_t *state)
4818 {
4819 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4820 	ib_gid_t mgid;
4821 
4822 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4823 
4824 	while (mce != NULL) {
4825 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4826 		mce = list_next(&state->id_mc_non, mce);
4827 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4828 	}
4829 	state->id_prom_op = IBD_OP_NOTSTARTED;
4830 }
4831 
/*
 * The blocking part of enabling IBA promiscuous mode is done out of
 * here on the async thread: query the fabric for all active mcg's
 * matching our pkey/scope/qkey/mtu and join each as a NonMember so
 * their traffic is received. The outcome (IBD_OP_COMPLETED or
 * IBD_OP_ERRORED) is recorded in state->id_prom_op.
 */
static void
ibd_async_setprom(ibd_state_t *state)
{
	ibt_mcg_attr_t mcg_attr;
	ibt_mcg_info_t *mcg_info;
	ib_gid_t mgid;
	uint_t numg;
	int i;
	char ret = IBD_OP_COMPLETED;

	DPRINT(2, "ibd_async_setprom : async_set_promisc");

	/*
	 * Obtain all active MC groups on the IB fabric with
	 * specified criteria (scope + Pkey + Qkey + mtu).
	 */
	bzero(&mcg_attr, sizeof (mcg_attr));
	mcg_attr.mc_pkey = state->id_pkey;
	mcg_attr.mc_scope = state->id_scope;
	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
	    IBT_SUCCESS) {
		ibd_print_warn(state, "Could not get list of IBA multicast "
		    "groups");
		ret = IBD_OP_ERRORED;
		goto done;
	}

	/*
	 * Iterate over the returned mcg's and join as NonMember
	 * to the IP mcg's.
	 */
	for (i = 0; i < numg; i++) {
		/*
		 * Do a NonMember JOIN on the MC group; a failure on one
		 * group is logged but does not abort the remainder.
		 */
		mgid = mcg_info[i].mc_adds_vect.av_dgid;
		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
			ibd_print_warn(state, "IBA promiscuous mode missed "
			    "multicast gid %016llx:%016llx",
			    (u_longlong_t)mgid.gid_prefix,
			    (u_longlong_t)mgid.gid_guid);
	}

	ibt_free_mcg_info(mcg_info, numg);
	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
done:
	state->id_prom_op = ret;
}
4889 
4890 /*
4891  * GLDv3 entry point for multicast promiscuous enable/disable requests.
4892  * GLDv3 assumes phys state receives more packets than multi state,
4893  * which is not true for IPoIB. Thus, treat the multi and phys
4894  * promiscuous states the same way to work with GLDv3's assumption.
4895  */
4896 static int
4897 ibd_m_promisc(void *arg, boolean_t on)
4898 {
4899 	ibd_state_t *state = (ibd_state_t *)arg;
4900 	ibd_req_t *req;
4901 
4902 	/*
4903 	 * Async thread wouldn't have been started if we haven't
4904 	 * passed ibd_m_start()
4905 	 */
4906 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4907 		return (0);
4908 
4909 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4910 	if (req == NULL)
4911 		return (ENOMEM);
4912 	if (on) {
4913 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
4914 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
4915 	} else {
4916 		DPRINT(1, "ibd_m_promisc : unset_promisc");
4917 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
4918 	}
4919 
4920 	return (0);
4921 }
4922 
/*
 * GLDv3 entry point for gathering statistics.
 *
 * Maps the requested MAC stat onto the corresponding id_* counter.
 * Returns ENOTSUP for statistics this driver does not maintain
 * (including MAC_STAT_NORCVBUF, which deliberately falls through
 * to the default case).
 */
static int
ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	switch (stat) {
	case MAC_STAT_IFSPEED:
		*val = state->id_link_speed;
		break;
	case MAC_STAT_MULTIRCV:
		*val = state->id_multi_rcv;
		break;
	case MAC_STAT_BRDCSTRCV:
		*val = state->id_brd_rcv;
		break;
	case MAC_STAT_MULTIXMT:
		*val = state->id_multi_xmt;
		break;
	case MAC_STAT_BRDCSTXMT:
		*val = state->id_brd_xmt;
		break;
	case MAC_STAT_RBYTES:
		*val = state->id_rcv_bytes;
		break;
	case MAC_STAT_IPACKETS:
		*val = state->id_rcv_pkt;
		break;
	case MAC_STAT_OBYTES:
		*val = state->id_xmt_bytes;
		break;
	case MAC_STAT_OPACKETS:
		*val = state->id_xmt_pkt;
		break;
	case MAC_STAT_OERRORS:
		*val = state->id_ah_error;	/* failed AH translation */
		break;
	case MAC_STAT_IERRORS:
		/* no input-error counter is maintained */
		*val = 0;
		break;
	case MAC_STAT_NOXMTBUF:
		*val = state->id_tx_short;
		break;
	case MAC_STAT_NORCVBUF:
	default:
		return (ENOTSUP);
	}

	return (0);
}
4975 
4976 static void
4977 ibd_async_txsched(ibd_state_t *state)
4978 {
4979 	ibd_req_t *req;
4980 	int ret;
4981 
4982 	if (ibd_txcomp_poll)
4983 		ibd_poll_compq(state, state->id_scq_hdl);
4984 
4985 	ret = ibd_resume_transmission(state);
4986 	if (ret && ibd_txcomp_poll) {
4987 		if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP))
4988 			ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
4989 		else {
4990 			ibd_print_warn(state, "ibd_async_txsched: "
4991 			    "no memory, can't schedule work slot");
4992 		}
4993 	}
4994 }
4995 
/*
 * Check whether the resource we were short on (send wqes or lso
 * buffers) has risen back above its free-count threshold; if so,
 * clear the shortage flag and notify GLDv3 that transmits may
 * resume.
 *
 * Returns 0 if transmission was resumed, -1 if the shortage
 * persists (or no shortage was flagged).
 */
static int
ibd_resume_transmission(ibd_state_t *state)
{
	int flag;
	int met_thresh = 0;
	int ret = -1;

	mutex_enter(&state->id_sched_lock);
	if (state->id_sched_needed & IBD_RSRC_SWQE) {
		met_thresh = (state->id_tx_list.dl_cnt >
		    IBD_FREE_SWQES_THRESH);
		flag = IBD_RSRC_SWQE;
	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
		ASSERT(state->id_lso != NULL);
		met_thresh = (state->id_lso->bkt_nfree >
		    IBD_FREE_LSOS_THRESH);
		flag = IBD_RSRC_LSOBUF;
	}
	/*
	 * 'flag' is only consumed when met_thresh was set in one of
	 * the branches above, so it is never read uninitialized.
	 */
	if (met_thresh) {
		state->id_sched_needed &= ~flag;
		ret = 0;
	}
	mutex_exit(&state->id_sched_lock);

	/* Kick GLDv3 outside the lock */
	if (ret == 0)
		mac_tx_update(state->id_mh);

	return (ret);
}
5025 
5026 /*
5027  * Release the send wqe back into free list.
5028  */
5029 static void
5030 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
5031 {
5032 	/*
5033 	 * Add back on Tx list for reuse.
5034 	 */
5035 	swqe->swqe_next = NULL;
5036 	mutex_enter(&state->id_tx_list.dl_mutex);
5037 	if (state->id_tx_list.dl_pending_sends) {
5038 		state->id_tx_list.dl_pending_sends = B_FALSE;
5039 	}
5040 	if (state->id_tx_list.dl_head == NULL) {
5041 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
5042 	} else {
5043 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
5044 	}
5045 	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
5046 	state->id_tx_list.dl_cnt++;
5047 	mutex_exit(&state->id_tx_list.dl_mutex);
5048 }
5049 
/*
 * Acquire a send wqe from free list.
 * Returns error number (0 or ENOENT) and, through 'swqe', the wqe
 * pointer (NULL on failure).
 */
static int
ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
{
	int rc = 0;
	ibd_swqe_t *wqe;

	/*
	 * Check and reclaim some of the completed Tx requests.
	 * If someone else is already in this code and pulling Tx
	 * completions, no need to poll, since the current lock holder
	 * will do the work anyway. Normally, we poll for completions
	 * every few Tx attempts, but if we are short on Tx descriptors,
	 * we always try to poll.
	 */
	if ((ibd_txcomp_poll == 1) &&
	    (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
		ibd_poll_compq(state, state->id_scq_hdl);
	}

	/*
	 * Grab required transmit wqes.
	 */
	mutex_enter(&state->id_tx_list.dl_mutex);
	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
	if (wqe != NULL) {
		/* Unlink from the head of the free list */
		state->id_tx_list.dl_cnt -= 1;
		state->id_tx_list.dl_head = wqe->swqe_next;
		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
			state->id_tx_list.dl_tail = NULL;
	} else {
		/*
		 * If we did not find the number we were looking for, flag
		 * no resource (the release path checks dl_pending_sends)
		 * and bump the tx-short statistic.
		 */
		rc = ENOENT;
		state->id_tx_list.dl_pending_sends = B_TRUE;
		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
		atomic_add_64(&state->id_tx_short, 1);
	}
	mutex_exit(&state->id_tx_list.dl_mutex);
	*swqe = wqe;

	return (rc);
}
5098 
/*
 * Prepare the work request in 'node' for an LSO send: record the ud
 * destination and mss, compute the combined IPoIB + IP + TCP header
 * size, and make that header available in one contiguous buffer —
 * either in-place in the first mblk, or in a freshly allocated copy
 * (freed later by ibd_free_lsohdr()).
 *
 * Returns 0 on success, -1 if the header copy could not be allocated.
 */
static int
ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
    ibt_ud_dest_hdl_t ud_dest)
{
	mblk_t	*nmp;
	int iph_len, tcph_len;
	ibt_wr_lso_t *lso;
	uintptr_t ip_start, tcp_start;
	uint8_t *dst;
	uint_t pending, mblen;

	/*
	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
	 * we need to adjust it here for lso.
	 */
	lso = &(node->w_swr.wr.ud_lso);
	lso->lso_ud_dest = ud_dest;
	lso->lso_mss = mss;

	/*
	 * Calculate the LSO header size and set it in the UD LSO structure.
	 * Note that the only assumption we make is that each of the IPoIB,
	 * IP and TCP headers will be contained in a single mblk fragment;
	 * together, the headers may span multiple mblk fragments.
	 */
	nmp = mp;
	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
		/* IP header starts in the next fragment */
		ip_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (ip_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;

	}
	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);

	tcp_start = ip_start + iph_len;
	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
		/* TCP header starts in the next fragment */
		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;

	/*
	 * If the lso header fits entirely within a single mblk fragment,
	 * we'll avoid an additional copy of the lso header here and just
	 * pass the b_rptr of the mblk directly.
	 *
	 * If this isn't true, we'd have to allocate for it explicitly.
	 */
	if (lso->lso_hdr_sz <= MBLKL(mp)) {
		lso->lso_hdr = mp->b_rptr;
	} else {
		/* On work completion, remember to free this allocated hdr */
		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
		if (lso->lso_hdr == NULL) {
			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
			    "sz = %d", lso->lso_hdr_sz);
			lso->lso_hdr_sz = 0;
			lso->lso_mss = 0;
			return (-1);
		}
	}

	/*
	 * Copy in the lso header only if we need to
	 */
	if (lso->lso_hdr != mp->b_rptr) {
		dst = lso->lso_hdr;
		pending = lso->lso_hdr_sz;

		/* Gather header bytes across however many fragments needed */
		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
			mblen = MBLKL(nmp);
			if (pending > mblen) {
				bcopy(nmp->b_rptr, dst, mblen);
				dst += mblen;
				pending -= mblen;
			} else {
				bcopy(nmp->b_rptr, dst, pending);
				break;
			}
		}
	}

	return (0);
}
5186 
5187 static void
5188 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5189 {
5190 	ibt_wr_lso_t *lso;
5191 
5192 	if ((!node) || (!mp))
5193 		return;
5194 
5195 	/*
5196 	 * Free any header space that we might've allocated if we
5197 	 * did an LSO
5198 	 */
5199 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5200 		lso = &(node->w_swr.wr.ud_lso);
5201 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5202 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5203 			lso->lso_hdr = NULL;
5204 			lso->lso_hdr_sz = 0;
5205 		}
5206 	}
5207 }
5208 
/*
 * Enqueue 'node' on the pending-send chain and, unless another thread
 * is already dispatching (id_tx_busy), drain the chain by posting the
 * wrs to the channel in batches of up to IBD_MAX_POST_MULTIPLE.
 * After this returns the caller must not touch 'node'.
 */
static void
ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
{
	uint_t		i;
	uint_t		num_posted;
	uint_t		n_wrs;
	ibt_status_t	ibt_status;
	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
	ibd_swqe_t	*elem;
	ibd_swqe_t	*nodes[IBD_MAX_POST_MULTIPLE];

	node->swqe_next = NULL;

	mutex_enter(&state->id_txpost_lock);

	/*
	 * Enqueue the new node in chain of wqes to send
	 */
	if (state->id_tx_head) {
		*(state->id_tx_tailp) = (ibd_wqe_t *)node;
	} else {
		state->id_tx_head = node;
	}
	state->id_tx_tailp = &(node->swqe_next);

	/*
	 * If someone else is helping out with the sends,
	 * just go back
	 */
	if (state->id_tx_busy) {
		mutex_exit(&state->id_txpost_lock);
		return;
	}

	/*
	 * Otherwise, mark the flag to indicate that we'll be
	 * doing the dispatch of what's there in the wqe chain
	 */
	state->id_tx_busy = 1;

	while (state->id_tx_head) {
		/*
		 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
		 * at a time if possible, and keep posting them.
		 */
		for (n_wrs = 0, elem = state->id_tx_head;
		    (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {

			nodes[n_wrs] = elem;
			wrs[n_wrs] = elem->w_swr;
		}
		state->id_tx_head = elem;

		/*
		 * Release the txpost lock before posting the
		 * send request to the hca; if the posting fails
		 * for some reason, we'll never receive completion
		 * intimation, so we'll need to cleanup.
		 */
		mutex_exit(&state->id_txpost_lock);

		ASSERT(n_wrs != 0);

		/*
		 * If posting fails for some reason, we'll never receive
		 * completion intimation, so we'll need to cleanup. But
		 * we need to make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * therefore the first 'num_posted' entries don't need
		 * cleanup here.
		 */
		num_posted = 0;
		ibt_status = ibt_post_send(state->id_chnl_hdl,
		    wrs, n_wrs, &num_posted);
		if (ibt_status != IBT_SUCCESS) {

			ibd_print_warn(state, "ibd_post_send: "
			    "posting multiple wrs failed: "
			    "requested=%d, done=%d, ret=%d",
			    n_wrs, num_posted, ibt_status);

			for (i = num_posted; i < n_wrs; i++)
				ibd_tx_cleanup(state, nodes[i]);
		}

		/*
		 * Grab the mutex before we go and check the tx Q again
		 */
		mutex_enter(&state->id_txpost_lock);
	}

	/* Chain drained; let the next caller take over dispatching */
	state->id_tx_busy = 0;
	mutex_exit(&state->id_txpost_lock);
}
5305 
/*
 * Build the scatter/gather list for the send wr in 'node' from the
 * message 'mp', skipping the first 'lsohdr_sz' bytes (those travel
 * separately as the LSO header). Three strategies, tried in order:
 * map the mblk fragments directly via the reserved lkey, copy into
 * the wqe's pre-mapped tx buffer, or copy into a set of pre-mapped
 * LSO buffers.
 *
 * Returns 0 on success, -1 if lso buffers could not be acquired.
 */
static int
ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
    uint_t lsohdr_sz)
{
	ibt_wr_ds_t *sgl;
	ibt_status_t ibt_status;
	mblk_t *nmp;
	mblk_t *data_mp;
	uchar_t *bufp;
	size_t blksize;
	size_t skip;
	size_t avail;
	uint_t pktsize;
	uint_t frag_len;
	uint_t pending_hdr;
	uint_t hiwm;
	int nmblks;
	int i;

	/*
	 * Let's skip ahead to the data if this is LSO
	 */
	data_mp = mp;
	pending_hdr = 0;
	if (lsohdr_sz) {
		pending_hdr = lsohdr_sz;
		for (nmp = mp; nmp; nmp = nmp->b_cont) {
			frag_len = nmp->b_wptr - nmp->b_rptr;
			if (frag_len > pending_hdr)
				break;
			pending_hdr -= frag_len;
		}
		data_mp = nmp;	/* start of data past lso header */
		ASSERT(data_mp != NULL);
	}

	/*
	 * Calculate the size of message data and number of msg blocks
	 */
	pktsize = 0;
	for (nmblks = 0, nmp = data_mp; nmp != NULL;
	    nmp = nmp->b_cont, nmblks++) {
		pktsize += MBLKL(nmp);
	}
	pktsize -= pending_hdr;

	/*
	 * Translating the virtual address regions into physical regions
	 * for using the Reserved LKey feature results in a wr sgl that
	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
	 * we'll fix a high-water mark (65%) for when we should stop.
	 */
	hiwm = (state->id_max_sqseg * 65) / 100;

	/*
	 * We only do ibt_map_mem_iov() if the pktsize is above the
	 * "copy-threshold", and if the number of mp fragments is less than
	 * the maximum acceptable.
	 */
	if ((state->id_hca_res_lkey_capab) &&
	    (pktsize > IBD_TX_COPY_THRESH) &&
	    (nmblks < hiwm)) {
		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
		ibt_iov_attr_t iov_attr;

		iov_attr.iov_as = NULL;
		iov_attr.iov = iov_arr;
		iov_attr.iov_buf = NULL;
		iov_attr.iov_list_len = nmblks;
		iov_attr.iov_wr_nds = state->id_max_sqseg;
		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
		iov_attr.iov_flags = IBT_IOV_SLEEP;

		/* First fragment may still hold part of the lso header */
		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
			iov_arr[i].iov_len = MBLKL(nmp);
			if (i == 0) {
				iov_arr[i].iov_addr += pending_hdr;
				iov_arr[i].iov_len -= pending_hdr;
			}
		}

		node->w_buftype = IBD_WQE_MAPPED;
		node->w_swr.wr_sgl = node->w_sgl;

		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
		if (ibt_status != IBT_SUCCESS) {
			/* Mapping failed; fall back to the copy path */
			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
			goto ibd_copy_path;
		}

		return (0);
	}

ibd_copy_path:
	if (pktsize <= state->id_tx_buf_sz) {
		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
		node->w_swr.wr_nds = 1;
		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
		node->w_buftype = IBD_WQE_TXBUF;

		/*
		 * Even though this is the copy path for transfers less than
		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
		 * is possible the first data mblk fragment (data_mp) still
		 * contains part of the LSO header that we need to skip.
		 */
		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
			blksize = MBLKL(nmp) - pending_hdr;
			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
			bufp += blksize;
			pending_hdr = 0;
		}

		return (0);
	}

	/*
	 * Copy path for transfers greater than id_tx_buf_sz
	 */
	node->w_swr.wr_sgl = node->w_sgl;
	if (ibd_acquire_lsobufs(state, pktsize,
	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
		return (-1);
	}
	node->w_buftype = IBD_WQE_LSOBUF;

	/*
	 * Copy the larger-than-id_tx_buf_sz packet into a set of
	 * fixed-sized, pre-mapped LSO buffers. Note that we might
	 * need to skip part of the LSO header in the first fragment
	 * as before.
	 */
	nmp = data_mp;
	skip = pending_hdr;
	for (i = 0; i < node->w_swr.wr_nds; i++) {
		sgl = node->w_swr.wr_sgl + i;
		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
		avail = IBD_LSO_BUFSZ;
		while (nmp && avail) {
			blksize = MBLKL(nmp) - skip;
			if (blksize > avail) {
				/* Fragment spills into the next lso buffer */
				bcopy(nmp->b_rptr + skip, bufp, avail);
				skip += avail;
				avail = 0;
			} else {
				bcopy(nmp->b_rptr + skip, bufp, blksize);
				skip = 0;
				avail -= blksize;
				bufp += blksize;
				nmp = nmp->b_cont;
			}
		}
	}

	return (0);
}
5467 
5468 /*
5469  * Schedule a completion queue polling to reap the resource we're
5470  * short on.  If we implement the change to reap tx completions
5471  * in a separate thread, we'll need to wake up that thread here.
5472  */
5473 static int
5474 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5475 {
5476 	ibd_req_t *req;
5477 
5478 	mutex_enter(&state->id_sched_lock);
5479 	state->id_sched_needed |= resource_type;
5480 	mutex_exit(&state->id_sched_lock);
5481 
5482 	/*
5483 	 * If we are asked to queue a work entry, we need to do it
5484 	 */
5485 	if (q_flag) {
5486 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5487 		if (req == NULL)
5488 			return (-1);
5489 
5490 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5491 	}
5492 
5493 	return (0);
5494 }
5495 
5496 /*
5497  * The passed in packet has this format:
5498  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5499  */
5500 static boolean_t
5501 ibd_send(ibd_state_t *state, mblk_t *mp)
5502 {
5503 	ibd_ace_t *ace;
5504 	ibd_swqe_t *node;
5505 	ipoib_mac_t *dest;
5506 	ib_header_info_t *ipibp;
5507 	ip6_t *ip6h;
5508 	uint_t pktsize;
5509 	uint32_t mss;
5510 	uint32_t hckflags;
5511 	uint32_t lsoflags = 0;
5512 	uint_t lsohdr_sz = 0;
5513 	int ret, len;
5514 	boolean_t dofree = B_FALSE;
5515 	boolean_t rc;
5516 
5517 	/*
5518 	 * If we aren't done with the device initialization and start,
5519 	 * we shouldn't be here.
5520 	 */
5521 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5522 		return (B_FALSE);
5523 
5524 	node = NULL;
5525 	if (ibd_acquire_swqe(state, &node) != 0) {
5526 		/*
5527 		 * If we don't have an swqe available, schedule a transmit
5528 		 * completion queue cleanup and hold off on sending more
5529 		 * more packets until we have some free swqes
5530 		 */
5531 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
5532 			return (B_FALSE);
5533 
5534 		/*
5535 		 * If a poll cannot be scheduled, we have no choice but
5536 		 * to drop this packet
5537 		 */
5538 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5539 		return (B_TRUE);
5540 	}
5541 
5542 	/*
5543 	 * Initialize the commonly used fields in swqe to NULL to protect
5544 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5545 	 * failure.
5546 	 */
5547 	node->swqe_im_mblk = NULL;
5548 	node->w_swr.wr_nds = 0;
5549 	node->w_swr.wr_sgl = NULL;
5550 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5551 
5552 	/*
5553 	 * Obtain an address handle for the destination.
5554 	 */
5555 	ipibp = (ib_header_info_t *)mp->b_rptr;
5556 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5557 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5558 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5559 
5560 	pktsize = msgsize(mp);
5561 
5562 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5563 	atomic_inc_64(&state->id_xmt_pkt);
5564 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5565 		atomic_inc_64(&state->id_brd_xmt);
5566 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5567 		atomic_inc_64(&state->id_multi_xmt);
5568 
5569 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5570 		node->w_ahandle = ace;
5571 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5572 	} else {
5573 		DPRINT(5,
5574 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5575 		    ((ret == EFAULT) ? "failed" : "queued"),
5576 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5577 		    htonl(dest->ipoib_gidpref[1]),
5578 		    htonl(dest->ipoib_gidsuff[0]),
5579 		    htonl(dest->ipoib_gidsuff[1]));
5580 		node->w_ahandle = NULL;
5581 
5582 		/*
5583 		 * for the poll mode, it is probably some cqe pending in the
5584 		 * cq. So ibd has to poll cq here, otherwise acache probably
5585 		 * may not be recycled.
5586 		 */
5587 		if (ibd_txcomp_poll == 1)
5588 			ibd_poll_compq(state, state->id_scq_hdl);
5589 
5590 		/*
5591 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5592 		 * can not find a path for the specific dest address. We
5593 		 * should get rid of this kind of packet.  We also should get
5594 		 * rid of the packet if we cannot schedule a poll via the
5595 		 * async thread.  For the normal case, ibd will return the
5596 		 * packet to upper layer and wait for AH creating.
5597 		 *
5598 		 * Note that we always queue a work slot entry for the async
5599 		 * thread when we fail AH lookup (even in intr mode); this is
5600 		 * due to the convoluted way the code currently looks for AH.
5601 		 */
5602 		if (ret == EFAULT) {
5603 			dofree = B_TRUE;
5604 			rc = B_TRUE;
5605 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5606 			dofree = B_TRUE;
5607 			rc = B_TRUE;
5608 		} else {
5609 			dofree = B_FALSE;
5610 			rc = B_FALSE;
5611 		}
5612 		goto ibd_send_fail;
5613 	}
5614 
5615 	/*
5616 	 * For ND6 packets, padding is at the front of the source lladdr.
5617 	 * Insert the padding at front.
5618 	 */
5619 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
5620 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5621 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5622 			    sizeof (ib_header_info_t))) {
5623 				DPRINT(10, "ibd_send: pullupmsg failure ");
5624 				dofree = B_TRUE;
5625 				rc = B_TRUE;
5626 				goto ibd_send_fail;
5627 			}
5628 			ipibp = (ib_header_info_t *)mp->b_rptr;
5629 		}
5630 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5631 		    sizeof (ib_header_info_t));
5632 		len = ntohs(ip6h->ip6_plen);
5633 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5634 			mblk_t	*pad;
5635 
5636 			pad = allocb(4, 0);
5637 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5638 			linkb(mp, pad);
5639 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5640 			    IPV6_HDR_LEN + len + 4) {
5641 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5642 				    IPV6_HDR_LEN + len + 4)) {
5643 					DPRINT(10, "ibd_send: pullupmsg "
5644 					    "failure ");
5645 					dofree = B_TRUE;
5646 					rc = B_TRUE;
5647 					goto ibd_send_fail;
5648 				}
5649 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5650 				    sizeof (ib_header_info_t));
5651 			}
5652 
5653 			/* LINTED: E_CONSTANT_CONDITION */
5654 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5655 		}
5656 	}
5657 
5658 	mp->b_rptr += sizeof (ib_addrs_t);
5659 
5660 	/*
5661 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5662 	 * ud destination, the opcode and the LSO header information to the
5663 	 * work request.
5664 	 */
5665 	lso_info_get(mp, &mss, &lsoflags);
5666 	if ((lsoflags & HW_LSO) != HW_LSO) {
5667 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5668 		lsohdr_sz = 0;
5669 	} else {
5670 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5671 			/*
5672 			 * The routine can only fail if there's no memory; we
5673 			 * can only drop the packet if this happens
5674 			 */
5675 			ibd_print_warn(state,
5676 			    "ibd_send: no memory, lso posting failed");
5677 			dofree = B_TRUE;
5678 			rc = B_TRUE;
5679 			goto ibd_send_fail;
5680 		}
5681 
5682 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5683 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5684 	}
5685 
5686 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5687 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5688 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5689 	else
5690 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5691 
5692 	/*
5693 	 * Prepare the sgl for posting; the routine can only fail if there's
5694 	 * no lso buf available for posting. If this is the case, we should
5695 	 * probably resched for lso bufs to become available and then try again.
5696 	 */
5697 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5698 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5699 			dofree = B_TRUE;
5700 			rc = B_TRUE;
5701 		} else {
5702 			dofree = B_FALSE;
5703 			rc = B_FALSE;
5704 		}
5705 		goto ibd_send_fail;
5706 	}
5707 	node->swqe_im_mblk = mp;
5708 
5709 	/*
5710 	 * Queue the wqe to hardware; since we can now simply queue a
5711 	 * post instead of doing it serially, we cannot assume anything
5712 	 * about the 'node' after ibd_post_send() returns.
5713 	 */
5714 	ibd_post_send(state, node);
5715 
5716 	return (B_TRUE);
5717 
5718 ibd_send_fail:
5719 	if (node && mp)
5720 		ibd_free_lsohdr(node, mp);
5721 
5722 	if (dofree)
5723 		freemsg(mp);
5724 
5725 	if (node != NULL)
5726 		ibd_tx_cleanup(state, node);
5727 
5728 	return (rc);
5729 }
5730 
5731 /*
5732  * GLDv3 entry point for transmitting datagram.
5733  */
5734 static mblk_t *
5735 ibd_m_tx(void *arg, mblk_t *mp)
5736 {
5737 	ibd_state_t *state = (ibd_state_t *)arg;
5738 	mblk_t *next;
5739 
5740 	if (state->id_link_state != LINK_STATE_UP) {
5741 		freemsgchain(mp);
5742 		mp = NULL;
5743 	}
5744 
5745 	while (mp != NULL) {
5746 		next = mp->b_next;
5747 		mp->b_next = NULL;
5748 		if (ibd_send(state, mp) == B_FALSE) {
5749 			/* Send fail */
5750 			mp->b_next = next;
5751 			break;
5752 		}
5753 		mp = next;
5754 	}
5755 
5756 	return (mp);
5757 }
5758 
5759 /*
5760  * this handles Tx and Rx completions. With separate CQs, this handles
5761  * only Rx completions.
5762  */
5763 static uint_t
5764 ibd_intr(char *arg)
5765 {
5766 	ibd_state_t *state = (ibd_state_t *)arg;
5767 
5768 	ibd_poll_compq(state, state->id_rcq_hdl);
5769 
5770 	return (DDI_INTR_CLAIMED);
5771 }
5772 
5773 /*
5774  * Poll and drain the cq
5775  */
5776 static uint_t
5777 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
5778     uint_t numwcs)
5779 {
5780 	ibd_wqe_t *wqe;
5781 	ibt_wc_t *wc;
5782 	uint_t total_polled = 0;
5783 	uint_t num_polled;
5784 	int i;
5785 
5786 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5787 		total_polled += num_polled;
5788 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5789 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5790 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5791 			    (wqe->w_type == IBD_WQE_RECV));
5792 			if (wc->wc_status != IBT_WC_SUCCESS) {
5793 				/*
5794 				 * Channel being torn down.
5795 				 */
5796 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5797 					DPRINT(5, "ibd_drain_cq: flush error");
5798 					/*
5799 					 * Only invoke the Tx handler to
5800 					 * release possibly held resources
5801 					 * like AH refcount etc. Can not
5802 					 * invoke Rx handler because it might
5803 					 * try adding buffers to the Rx pool
5804 					 * when we are trying to deinitialize.
5805 					 */
5806 					if (wqe->w_type == IBD_WQE_RECV) {
5807 						continue;
5808 					} else {
5809 						DPRINT(10, "ibd_drain_cq: Bad "
5810 						    "status %d", wc->wc_status);
5811 					}
5812 				}
5813 			}
5814 			if (wqe->w_type == IBD_WQE_SEND) {
5815 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
5816 			} else {
5817 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5818 			}
5819 		}
5820 	}
5821 
5822 	return (total_polled);
5823 }
5824 
5825 /*
5826  * Common code for interrupt handling as well as for polling
5827  * for all completed wqe's while detaching.
5828  */
5829 static void
5830 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5831 {
5832 	ibt_wc_t *wcs;
5833 	uint_t numwcs;
5834 	int flag, redo_flag;
5835 	int redo = 1;
5836 	uint_t num_polled = 0;
5837 
5838 	if (ibd_separate_cqs == 1) {
5839 		if (cq_hdl == state->id_rcq_hdl) {
5840 			flag = IBD_RX_CQ_POLLING;
5841 			redo_flag = IBD_REDO_RX_CQ_POLLING;
5842 		} else {
5843 			flag = IBD_TX_CQ_POLLING;
5844 			redo_flag = IBD_REDO_TX_CQ_POLLING;
5845 		}
5846 	} else {
5847 		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
5848 		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
5849 	}
5850 
5851 	mutex_enter(&state->id_cq_poll_lock);
5852 	if (state->id_cq_poll_busy & flag) {
5853 		state->id_cq_poll_busy |= redo_flag;
5854 		mutex_exit(&state->id_cq_poll_lock);
5855 		return;
5856 	}
5857 	state->id_cq_poll_busy |= flag;
5858 	mutex_exit(&state->id_cq_poll_lock);
5859 
5860 	/*
5861 	 * In some cases (eg detaching), this code can be invoked on
5862 	 * any cpu after disabling cq notification (thus no concurrency
5863 	 * exists). Apart from that, the following applies normally:
5864 	 * The receive completion handling is always on the Rx interrupt
5865 	 * cpu. Transmit completion handling could be from any cpu if
5866 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5867 	 * is interrupt driven. Combined completion handling is always
5868 	 * on the interrupt cpu. Thus, lock accordingly and use the
5869 	 * proper completion array.
5870 	 */
5871 	if (ibd_separate_cqs == 1) {
5872 		if (cq_hdl == state->id_rcq_hdl) {
5873 			wcs = state->id_rxwcs;
5874 			numwcs = state->id_rxwcs_size;
5875 		} else {
5876 			wcs = state->id_txwcs;
5877 			numwcs = state->id_txwcs_size;
5878 		}
5879 	} else {
5880 		wcs = state->id_rxwcs;
5881 		numwcs = state->id_rxwcs_size;
5882 	}
5883 
5884 	/*
5885 	 * Poll and drain the CQ
5886 	 */
5887 	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5888 
5889 	/*
5890 	 * Enable CQ notifications and redrain the cq to catch any
5891 	 * completions we might have missed after the ibd_drain_cq()
5892 	 * above and before the ibt_enable_cq_notify() that follows.
5893 	 * Finally, service any new requests to poll the cq that
5894 	 * could've come in after the ibt_enable_cq_notify().
5895 	 */
5896 	do {
5897 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
5898 		    IBT_SUCCESS) {
5899 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
5900 		}
5901 
5902 		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5903 
5904 		mutex_enter(&state->id_cq_poll_lock);
5905 		if (state->id_cq_poll_busy & redo_flag)
5906 			state->id_cq_poll_busy &= ~redo_flag;
5907 		else {
5908 			state->id_cq_poll_busy &= ~flag;
5909 			redo = 0;
5910 		}
5911 		mutex_exit(&state->id_cq_poll_lock);
5912 
5913 	} while (redo);
5914 
5915 	/*
5916 	 * If we polled the receive cq and found anything, we need to flush
5917 	 * it out to the nw layer here.
5918 	 */
5919 	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
5920 		ibd_flush_rx(state, NULL);
5921 	}
5922 }
5923 
5924 /*
5925  * Unmap the memory area associated with a given swqe.
5926  */
5927 static void
5928 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
5929 {
5930 	ibt_status_t stat;
5931 
5932 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
5933 
5934 	if (swqe->w_mi_hdl) {
5935 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
5936 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
5937 			DPRINT(10,
5938 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
5939 		}
5940 		swqe->w_mi_hdl = NULL;
5941 	}
5942 	swqe->w_swr.wr_nds = 0;
5943 }
5944 
5945 /*
5946  * Common code that deals with clean ups after a successful or
5947  * erroneous transmission attempt.
5948  */
5949 static void
5950 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
5951 {
5952 	ibd_ace_t *ace = swqe->w_ahandle;
5953 
5954 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
5955 
5956 	/*
5957 	 * If this was a dynamic mapping in ibd_send(), we need to
5958 	 * unmap here. If this was an lso buffer we'd used for sending,
5959 	 * we need to release the lso buf to the pool, since the resource
5960 	 * is scarce. However, if this was simply a normal send using
5961 	 * the copybuf (present in each swqe), we don't need to release it.
5962 	 */
5963 	if (swqe->swqe_im_mblk != NULL) {
5964 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
5965 			ibd_unmap_mem(state, swqe);
5966 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
5967 			ibd_release_lsobufs(state,
5968 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
5969 		}
5970 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
5971 		freemsg(swqe->swqe_im_mblk);
5972 		swqe->swqe_im_mblk = NULL;
5973 	}
5974 
5975 	/*
5976 	 * Drop the reference count on the AH; it can be reused
5977 	 * now for a different destination if there are no more
5978 	 * posted sends that will use it. This can be eliminated
5979 	 * if we can always associate each Tx buffer with an AH.
5980 	 * The ace can be null if we are cleaning up from the
5981 	 * ibd_send() error path.
5982 	 */
5983 	if (ace != NULL) {
5984 		/*
5985 		 * The recycling logic can be eliminated from here
5986 		 * and put into the async thread if we create another
5987 		 * list to hold ACE's for unjoined mcg's.
5988 		 */
5989 		if (DEC_REF_DO_CYCLE(ace)) {
5990 			ibd_mce_t *mce;
5991 
5992 			/*
5993 			 * Check with the lock taken: we decremented
5994 			 * reference count without the lock, and some
5995 			 * transmitter might alreay have bumped the
5996 			 * reference count (possible in case of multicast
5997 			 * disable when we leave the AH on the active
5998 			 * list). If not still 0, get out, leaving the
5999 			 * recycle bit intact.
6000 			 *
6001 			 * Atomically transition the AH from active
6002 			 * to free list, and queue a work request to
6003 			 * leave the group and destroy the mce. No
6004 			 * transmitter can be looking at the AH or
6005 			 * the MCE in between, since we have the
6006 			 * ac_mutex lock. In the SendOnly reap case,
6007 			 * it is not neccesary to hold the ac_mutex
6008 			 * and recheck the ref count (since the AH was
6009 			 * taken off the active list), we just do it
6010 			 * to have uniform processing with the Full
6011 			 * reap case.
6012 			 */
6013 			mutex_enter(&state->id_ac_mutex);
6014 			mce = ace->ac_mce;
6015 			if (GET_REF_CYCLE(ace) == 0) {
6016 				CLEAR_REFCYCLE(ace);
6017 				/*
6018 				 * Identify the case of fullmember reap as
6019 				 * opposed to mcg trap reap. Also, port up
6020 				 * might set ac_mce to NULL to indicate Tx
6021 				 * cleanup should do no more than put the
6022 				 * AH in the free list (see ibd_async_link).
6023 				 */
6024 				if (mce != NULL) {
6025 					ace->ac_mce = NULL;
6026 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6027 					/*
6028 					 * mc_req was initialized at mce
6029 					 * creation time.
6030 					 */
6031 					ibd_queue_work_slot(state,
6032 					    &mce->mc_req, IBD_ASYNC_REAP);
6033 				}
6034 				IBD_ACACHE_INSERT_FREE(state, ace);
6035 			}
6036 			mutex_exit(&state->id_ac_mutex);
6037 		}
6038 	}
6039 
6040 	/*
6041 	 * Release the send wqe for reuse.
6042 	 */
6043 	ibd_release_swqe(state, swqe);
6044 }
6045 
6046 /*
6047  * Hand off the processed rx mp chain to mac_rx()
6048  */
6049 static void
6050 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
6051 {
6052 	if (mpc == NULL) {
6053 		mutex_enter(&state->id_rx_lock);
6054 
6055 		mpc = state->id_rx_mp;
6056 
6057 		state->id_rx_mp = NULL;
6058 		state->id_rx_mp_tail = NULL;
6059 		state->id_rx_mp_len = 0;
6060 
6061 		mutex_exit(&state->id_rx_lock);
6062 	}
6063 
6064 	if (mpc) {
6065 		mac_rx(state->id_mh, state->id_rh, mpc);
6066 	}
6067 }
6068 
6069 /*
6070  * Processing to be done after receipt of a packet; hand off to GLD
6071  * in the format expected by GLD.  The received packet has this
6072  * format: 2b sap :: 00 :: data.
6073  */
6074 static void
6075 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6076 {
6077 	ib_header_info_t *phdr;
6078 	mblk_t *mp;
6079 	mblk_t *mpc = NULL;
6080 	ipoib_hdr_t *ipibp;
6081 	ipha_t *iphap;
6082 	ip6_t *ip6h;
6083 	int rxcnt, len;
6084 
6085 	/*
6086 	 * Track number handed to upper layer, and number still
6087 	 * available to receive packets.
6088 	 */
6089 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
6090 	ASSERT(rxcnt >= 0);
6091 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
6092 
6093 	/*
6094 	 * Adjust write pointer depending on how much data came in.
6095 	 */
6096 	mp = rwqe->rwqe_im_mblk;
6097 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
6098 
6099 	/*
6100 	 * Make sure this is NULL or we're in trouble.
6101 	 */
6102 	if (mp->b_next != NULL) {
6103 		ibd_print_warn(state,
6104 		    "ibd_process_rx: got duplicate mp from rcq?");
6105 		mp->b_next = NULL;
6106 	}
6107 
6108 	/*
6109 	 * the IB link will deliver one of the IB link layer
6110 	 * headers called, the Global Routing Header (GRH).
6111 	 * ibd driver uses the information in GRH to build the
6112 	 * Header_info structure and pass it with the datagram up
6113 	 * to GLDv3.
6114 	 * If the GRH is not valid, indicate to GLDv3 by setting
6115 	 * the VerTcFlow field to 0.
6116 	 */
6117 	phdr = (ib_header_info_t *)mp->b_rptr;
6118 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6119 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6120 
6121 		/* if it is loop back packet, just drop it. */
6122 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6123 		    IPOIB_ADDRL) == 0) {
6124 			freemsg(mp);
6125 			return;
6126 		}
6127 
6128 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6129 		    sizeof (ipoib_mac_t));
6130 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6131 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6132 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6133 		} else {
6134 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6135 		}
6136 	} else {
6137 		/*
6138 		 * It can not be a IBA multicast packet. Must have been
6139 		 * unicast for us. Just copy the interface address to dst.
6140 		 */
6141 		phdr->ib_grh.ipoib_vertcflow = 0;
6142 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6143 		    sizeof (ipoib_mac_t));
6144 	}
6145 
6146 	/*
6147 	 * For ND6 packets, padding is at the front of the source/target
6148 	 * lladdr. However the inet6 layer is not aware of it, hence remove
6149 	 * the padding from such packets.
6150 	 */
6151 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6152 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
6153 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
6154 			if (!pullupmsg(mp, IPV6_HDR_LEN +
6155 			    sizeof (ipoib_hdr_t))) {
6156 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
6157 				freemsg(mp);
6158 				return;
6159 			}
6160 			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
6161 			    sizeof (ipoib_pgrh_t));
6162 		}
6163 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6164 		len = ntohs(ip6h->ip6_plen);
6165 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6166 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
6167 			    IPV6_HDR_LEN + len) {
6168 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
6169 				    IPV6_HDR_LEN + len)) {
6170 					DPRINT(10, "ibd_process_rx: pullupmsg"
6171 					    " failed");
6172 					freemsg(mp);
6173 					return;
6174 				}
6175 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6176 				    sizeof (ipoib_pgrh_t) +
6177 				    sizeof (ipoib_hdr_t));
6178 			}
6179 			/* LINTED: E_CONSTANT_CONDITION */
6180 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6181 		}
6182 	}
6183 
6184 	/*
6185 	 * Update statistics
6186 	 */
6187 	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
6188 	atomic_inc_64(&state->id_rcv_pkt);
6189 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6190 		atomic_inc_64(&state->id_brd_rcv);
6191 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6192 		atomic_inc_64(&state->id_multi_rcv);
6193 
6194 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6195 	/*
6196 	 * Set receive checksum status in mp
6197 	 * Hardware checksumming can be considered valid only if:
6198 	 * 1. CQE.IP_OK bit is set
6199 	 * 2. CQE.CKSUM = 0xffff
6200 	 * 3. IPv6 routing header is not present in the packet
6201 	 * 4. If there are no IP_OPTIONS in the IP HEADER
6202 	 */
6203 
6204 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6205 	    (wc->wc_cksum == 0xFFFF) &&
6206 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6207 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6208 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6209 	}
6210 
6211 	/*
6212 	 * Add this mp to the list of processed mp's to send to
6213 	 * the nw layer
6214 	 */
6215 	mutex_enter(&state->id_rx_lock);
6216 	if (state->id_rx_mp) {
6217 		ASSERT(state->id_rx_mp_tail != NULL);
6218 		state->id_rx_mp_tail->b_next = mp;
6219 	} else {
6220 		ASSERT(state->id_rx_mp_tail == NULL);
6221 		state->id_rx_mp = mp;
6222 	}
6223 
6224 	state->id_rx_mp_tail = mp;
6225 	state->id_rx_mp_len++;
6226 
6227 	if (state->id_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
6228 		mpc = state->id_rx_mp;
6229 
6230 		state->id_rx_mp = NULL;
6231 		state->id_rx_mp_tail = NULL;
6232 		state->id_rx_mp_len = 0;
6233 	}
6234 
6235 	mutex_exit(&state->id_rx_lock);
6236 
6237 	if (mpc) {
6238 		ibd_flush_rx(state, mpc);
6239 	}
6240 }
6241 
6242 /*
6243  * Callback code invoked from STREAMs when the receive data buffer is
6244  * free for recycling.
6245  */
6246 static void
6247 ibd_freemsg_cb(char *arg)
6248 {
6249 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6250 	ibd_state_t *state = rwqe->w_state;
6251 
6252 	/*
6253 	 * If the wqe is being destructed, do not attempt recycling.
6254 	 */
6255 	if (rwqe->w_freeing_wqe == B_TRUE) {
6256 		DPRINT(6, "ibd_freemsg: wqe being freed");
6257 		return;
6258 	} else {
6259 		/*
6260 		 * Upper layer has released held mblk, so we have
6261 		 * no more use for keeping the old pointer in
6262 		 * our rwqe.
6263 		 */
6264 		rwqe->rwqe_im_mblk = NULL;
6265 	}
6266 
6267 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6268 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6269 	if (rwqe->rwqe_im_mblk == NULL) {
6270 		ibd_delete_rwqe(state, rwqe);
6271 		ibd_free_rwqe(state, rwqe);
6272 		DPRINT(6, "ibd_freemsg: desballoc failed");
6273 		return;
6274 	}
6275 
6276 	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
6277 		ibd_delete_rwqe(state, rwqe);
6278 		ibd_free_rwqe(state, rwqe);
6279 		return;
6280 	}
6281 
6282 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
6283 }
6284 
6285 static uint_t
6286 ibd_tx_recycle(char *arg)
6287 {
6288 	ibd_state_t *state = (ibd_state_t *)arg;
6289 
6290 	/*
6291 	 * Poll for completed entries
6292 	 */
6293 	ibd_poll_compq(state, state->id_scq_hdl);
6294 
6295 	/*
6296 	 * Resume any blocked transmissions if possible
6297 	 */
6298 	(void) ibd_resume_transmission(state);
6299 
6300 	return (DDI_INTR_CLAIMED);
6301 }
6302 
6303 #ifdef IBD_LOGGING
6304 static void
6305 ibd_log_init(void)
6306 {
6307 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
6308 	ibd_lbuf_ndx = 0;
6309 
6310 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
6311 }
6312 
6313 static void
6314 ibd_log_fini(void)
6315 {
6316 	if (ibd_lbuf)
6317 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
6318 	ibd_lbuf_ndx = 0;
6319 	ibd_lbuf = NULL;
6320 
6321 	mutex_destroy(&ibd_lbuf_lock);
6322 }
6323 
/*
 * Append a printf-style message to the circular in-memory debug log.
 * Only the index reservation is done under ibd_lbuf_lock; the actual
 * copy happens after the lock is dropped (see note below).
 */
static void
ibd_log(const char *fmt, ...)
{
	va_list	ap;
	uint32_t off;
	uint32_t msglen;
	char tmpbuf[IBD_DMAX_LINE];

	/* Logging not initialized (or already torn down) */
	if (ibd_lbuf == NULL)
		return;

	/* Format into a local buffer first, clamped to one line */
	va_start(ap, fmt);
	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
	va_end(ap);

	if (msglen >= IBD_DMAX_LINE)
		msglen = IBD_DMAX_LINE - 1;

	mutex_enter(&ibd_lbuf_lock);

	off = ibd_lbuf_ndx;		/* current msg should go here */
	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';

	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */

	/* Wrap early: 2*IBD_DMAX_LINE of headroom keeps writes in bounds */
	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		ibd_lbuf_ndx = 0;

	mutex_exit(&ibd_lbuf_lock);

	/*
	 * NOTE(review): the copy is done outside the lock on the theory
	 * that reserved regions never overlap; a concurrent logger can
	 * still rewrite this region's final byte as '\n' (the separator
	 * fixup above).  Presumably an accepted debug-only race --
	 * confirm before relying on log contents being byte-exact.
	 */
	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
}
6358 #endif
6359