xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision fcfc878b917d4c5ca9d1d067c725cc529a551ac3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * An implementation of the IPoIB standard based on PSARC 2001/289.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/sysmacros.h>	/* for offsetof */
44 #include <sys/disp.h>		/* for async thread pri */
45 #include <sys/atomic.h>		/* for atomic_add*() */
46 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
47 #include <netinet/in.h>		/* for netinet/ip.h below */
48 #include <netinet/ip.h>		/* for struct ip */
49 #include <netinet/udp.h>	/* for struct udphdr */
50 #include <inet/common.h>	/* for inet/ip.h below */
51 #include <inet/ip.h>		/* for ipha_t */
52 #include <inet/ip6.h>		/* for ip6_t */
53 #include <inet/tcp.h>		/* for tcph_t */
54 #include <netinet/icmp6.h>	/* for icmp6_t */
55 #include <sys/callb.h>
56 #include <sys/modhash.h>
57 
58 #include <sys/ib/clients/ibd/ibd.h>
59 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
60 #include <sys/note.h>
61 #include <sys/multidata.h>
62 
63 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
64 
65 #include <sys/priv_names.h>
66 #include <sys/dls.h>
67 #include <sys/dld_ioc.h>
68 #include <sys/policy.h>
69 #include <sys/ibpart.h>
70 #include <sys/file.h>
71 
72 /*
73  * The write-up below includes details on the following:
74  * 1. The dladm administrative model.
75  * 2. Late HCA initialization feature.
76  * 3. Brussels support and its implications to the current architecture.
77  *
78  * 1. The dladm administrative model.
79  * ------------------------------------------
80  * With the dladm model, ibnex will create one ibd instance per port. These
81  * instances will be created independent of the port state.
82  *
83  * The ibd driver is two faceted: One side of it working as the port driver and
84  * the other as the partition object driver.
85  *
86  * The port instance is a child of the HCA, and will have an entry in the devfs.
87  * A DDI attach only happens for the port driver, and its attach is
88  * handled in ibd_port_attach(). Similary, a DDI detach for the port driver is
89  * handled in ibd_port_unattach().
90  *
91  * The partition object is only a registrant to the mac layer via mac_register()
92  * and does not have an entry in the device tree. There is no DDI softstate
93  * managed by the DDI framework for the partition objects. However, the state is
94  * managed inside the ibd driver, and every partition object hangs off the
95  * "ibd_objlist_head".
96  *
97  * The partition object first comes into existence when a user runs the
98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
99  * point of the partition object. The partition object goes away with the
100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101  * point of the partition object.
102  *
103  * The create-part and delete-part subcommands result in dld ioctls that end up
104  * calling ibd_create_parition() and ibd_delete_partition respectively.
105  * There ioctls are registered with the dld layer in _init() via a call to
106  * dld_ioc_register().
107  *
108  * The port instance by itself cannot be plumbed. It is only the partition
109  * objects that can be plumbed and they alone participate in I/O and not the
110  * port driver.
111  *
112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
113  * display useful information. The info entry point for ibd is
114  * ibd_get_partition_info().
115  *
116  * 2. Late HCA initialization feature.
117  * ------------------------------------
118  * As mentioned in section 1, the user creates the partition objects via
119  * dladm(1M). It is possible that:
120  * a) The physical port itself is down and the SM cannot be reached.
121  * b) The PKEY specified by the used has not been created in the SM yet.
122  * c) An IPoIB broadcast group for the specified PKEY is not present.
123  *
124  * In all of the above cases, complete initialization of the partition object is
125  * not possible. However, the new model allows the creation of partition
126  * objects even in such cases but will defer the initialization for later.
127  * When such a partition object is plumbed, the link state will be displayed as
128  * "down".
129  * The driver, at this point, is listening to events that herald the
130  * availability of resources -
131  * i)   LINK_UP when the link becomes available
132  * ii)  PORT_CHANGE when the PKEY has been created
133  * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134  * created
135  * via ibd_async_handler() for events i) and ii), and via
136  * ibd_snet_notices_handler() for iii.
137  * The driver handles these events (as and when they arrive) and completes the
138  * initialization of the partition object and transitions it to a usable state.
139  *
140  * 3. Brussels support and its implications to the current architecture.
141  * ---------------------------------------------------------------------
142  * The brussels support introduces two new interfaces to the ibd driver -
143  * ibd_m_getprop() and ibd_m_setprop().
144  * These interfaces allow setting and retrieval of certain properties.
145  * Some of them are public properties while most other are private properties
146  * meant to be used by developers. Tuning the latter kind can cause
147  * performance issues and should not be used without understanding the
148  * implications. All properties are specific to an instance of either the
149  * partition object or the port driver.
150  *
151  * The public properties are : mtu and linkmode.
152  * mtu is a read-only property.
153  * linkmode can take two values - UD and CM.
154  *
155  * Changing the linkmode requires some bookkeeping in the driver. The
156  * capabilities need to be re-reported to the mac layer. This is done by
157  * calling mac_capab_update().  The maxsdu is updated by calling
158  * mac_maxsdu_update().
159  * The private properties retain their values across the change of linkmode.
160  * NOTE:
161  * - The port driver does not support any property apart from mtu.
162  * - All other properties are only meant for the partition object.
163  * - The properties cannot be set when an instance is plumbed. The
164  * instance has to be unplumbed to effect any setting.
165  */
166 
167 /*
168  * Driver wide tunables
169  *
170  * ibd_tx_softintr
171  * ibd_rx_softintr
172  *     The softintr mechanism allows ibd to avoid event queue overflows if
173  *     the receive/completion handlers are to be expensive. These are enabled
174  *     by default.
175  *
176  * ibd_log_sz
177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
179  *
180  */
181 uint_t ibd_rx_softintr = 1;
182 uint_t ibd_tx_softintr = 1;
183 
184 #ifdef IBD_LOGGING
185 uint_t ibd_log_sz = 0x20000;
186 #endif
187 
188 #ifdef IBD_LOGGING
189 #define	IBD_LOG_SZ			ibd_log_sz
190 #endif
191 
192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 #define	IBD_RX_POST_CNT			8
194 
195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
196 #define	IBD_LOG_RX_POST			4
197 
198 /* Minimum number of receive work requests driver needs to always have */
199 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
200 
201 /*
202  * LSO parameters
203  */
204 #define	IBD_LSO_MAXLEN			65536
205 #define	IBD_LSO_BUFSZ			8192
206 
207 /*
208  * Async operation states
209  */
210 #define	IBD_OP_NOTSTARTED		0
211 #define	IBD_OP_ONGOING			1
212 #define	IBD_OP_COMPLETED		2
213 #define	IBD_OP_ERRORED			3
214 #define	IBD_OP_ROUTERED			4
215 
216 /*
217  * Start/stop in-progress flags; note that restart must always remain
218  * the OR of start and stop flag values.
219  */
220 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
221 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
222 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
223 #define	IBD_DRV_DELETE_IN_PROGRESS	IBD_DRV_RESTART_IN_PROGRESS
224 
225 /*
226  * Miscellaneous constants
227  */
228 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
229 #define	IBD_DEF_MAX_SDU			2044
230 #define	IBD_DEF_MAX_MTU			(IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
231 #define	IBD_DEF_RC_MAX_SDU		65520
232 #define	IBD_DEF_RC_MAX_MTU		(IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
233 #define	IBD_DEFAULT_QKEY		0xB1B
234 #ifdef IBD_LOGGING
235 #define	IBD_DMAX_LINE			100
236 #endif
237 
238 /*
239  * Enumerations for link states
240  */
241 typedef enum {
242 	IBD_LINK_DOWN,
243 	IBD_LINK_UP,
244 	IBD_LINK_UP_ABSENT
245 } ibd_link_op_t;
246 
247 /*
248  * Driver State Pointer
249  */
250 void *ibd_list;
251 
252 /*
253  * Driver Global Data
254  */
255 ibd_global_state_t ibd_gstate;
256 
257 /*
258  * Partition object list
259  */
260 ibd_state_t	*ibd_objlist_head = NULL;
261 kmutex_t	ibd_objlist_lock;
262 
263 int ibd_rc_conn_timeout = 60 * 10;	/* 10 minutes */
264 
265 /*
266  * Logging
267  */
268 #ifdef IBD_LOGGING
269 kmutex_t ibd_lbuf_lock;
270 uint8_t *ibd_lbuf;
271 uint32_t ibd_lbuf_ndx;
272 #endif
273 
274 /*
275  * Required system entry points
276  */
277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
279 
280 /*
281  * Required driver entry points for GLDv3
282  */
283 static int ibd_m_stat(void *, uint_t, uint64_t *);
284 static int ibd_m_start(void *);
285 static void ibd_m_stop(void *);
286 static int ibd_m_promisc(void *, boolean_t);
287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
288 static int ibd_m_unicst(void *, const uint8_t *);
289 static mblk_t *ibd_m_tx(void *, mblk_t *);
290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
291 
292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
293     const void *);
294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
296     mac_prop_info_handle_t);
297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
298     const void *);
299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
300 
301 /*
302  * Private driver entry points for GLDv3
303  */
304 
305 /*
306  * Initialization
307  */
308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
309 static int ibd_init_txlist(ibd_state_t *);
310 static int ibd_init_rxlist(ibd_state_t *);
311 static int ibd_acache_init(ibd_state_t *);
312 #ifdef IBD_LOGGING
313 static void ibd_log_init(void);
314 #endif
315 
316 /*
317  * Termination/cleanup
318  */
319 static void ibd_state_fini(ibd_state_t *);
320 static void ibd_fini_txlist(ibd_state_t *);
321 static void ibd_fini_rxlist(ibd_state_t *);
322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
324 static void ibd_acache_fini(ibd_state_t *);
325 #ifdef IBD_LOGGING
326 static void ibd_log_fini(void);
327 #endif
328 
329 /*
330  * Allocation/acquire/map routines
331  */
332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
337     uint32_t *);
338 
339 /*
340  * Free/release/unmap routines
341  */
342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
343 static void ibd_free_tx_copybufs(ibd_state_t *);
344 static void ibd_free_rx_copybufs(ibd_state_t *);
345 static void ibd_free_rx_rsrcs(ibd_state_t *);
346 static void ibd_free_tx_lsobufs(ibd_state_t *);
347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
350 
351 /*
352  * Handlers/callback routines
353  */
354 static uint_t ibd_intr(caddr_t);
355 static uint_t ibd_tx_recycle(caddr_t);
356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
362 static void ibd_freemsg_cb(char *);
363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
364     ibt_async_event_t *);
365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
366     ibt_async_event_t *);
367 static void ibd_snet_notices_handler(void *, ib_gid_t,
368     ibt_subnet_event_code_t, ibt_subnet_event_t *);
369 
370 /*
371  * Send/receive routines
372  */
373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
377 
378 /*
379  * Threads
380  */
381 static void ibd_async_work(ibd_state_t *);
382 
383 /*
384  * Async tasks
385  */
386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
388 static void ibd_async_setprom(ibd_state_t *);
389 static void ibd_async_unsetprom(ibd_state_t *);
390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
392 static void ibd_async_txsched(ibd_state_t *);
393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
394 
395 /*
396  * Async task helpers
397  */
398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
402     ipoib_mac_t *, ipoib_mac_t *);
403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
409 static uint64_t ibd_get_portspeed(ibd_state_t *);
410 static boolean_t ibd_async_safe(ibd_state_t *);
411 static void ibd_async_done(ibd_state_t *);
412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
416 
417 /*
418  * Helpers for attach/start routines
419  */
420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
421 static int ibd_record_capab(ibd_state_t *);
422 static int ibd_get_port_details(ibd_state_t *);
423 static int ibd_alloc_cqs(ibd_state_t *);
424 static int ibd_setup_ud_channel(ibd_state_t *);
425 static int ibd_start(ibd_state_t *);
426 static int ibd_undo_start(ibd_state_t *, link_state_t);
427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
430 static void ibd_part_unattach(ibd_state_t *state);
431 static int ibd_port_attach(dev_info_t *);
432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
434 static int ibd_part_busy(ibd_state_t *);
435 
436 /*
437  * Miscellaneous helpers
438  */
439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 static void ibd_resume_transmission(ibd_state_t *);
441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
443 static void *list_get_head(list_t *);
444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
446 
447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
449 
450 #ifdef IBD_LOGGING
451 static void ibd_log(const char *, ...);
452 #endif
453 
454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
455     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
456 
457 /* Module Driver Info */
458 static struct modldrv ibd_modldrv = {
459 	&mod_driverops,			/* This one is a driver */
460 	"InfiniBand GLDv3 Driver",	/* short description */
461 	&ibd_dev_ops			/* driver specific ops */
462 };
463 
464 /* Module Linkage */
465 static struct modlinkage ibd_modlinkage = {
466 	MODREV_1, (void *)&ibd_modldrv, NULL
467 };
468 
469 /*
470  * Module (static) info passed to IBTL during ibt_attach
471  */
472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
473 	IBTI_V_CURR,
474 	IBT_NETWORK,
475 	ibd_async_handler,
476 	NULL,
477 	"IBPART"
478 };
479 
480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
481 	IBTI_V_CURR,
482 	IBT_NETWORK,
483 	ibdpd_async_handler,
484 	NULL,
485 	"IPIB"
486 };
487 
488 /*
489  * GLDv3 entry points
490  */
491 #define	IBD_M_CALLBACK_FLAGS	\
492 	(MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
493 
494 static mac_callbacks_t ibd_m_callbacks = {
495 	IBD_M_CALLBACK_FLAGS,
496 	ibd_m_stat,
497 	ibd_m_start,
498 	ibd_m_stop,
499 	ibd_m_promisc,
500 	ibd_m_multicst,
501 	ibd_m_unicst,
502 	ibd_m_tx,
503 	NULL,
504 	NULL,
505 	ibd_m_getcapab,
506 	NULL,
507 	NULL,
508 	ibd_m_setprop,
509 	ibd_m_getprop,
510 	ibd_m_propinfo
511 };
512 
513 /* Private properties */
514 char *ibd_priv_props[] = {
515 	"_ibd_broadcast_group",
516 	"_ibd_coalesce_completions",
517 	"_ibd_create_broadcast_group",
518 	"_ibd_hash_size",
519 	"_ibd_lso_enable",
520 	"_ibd_num_ah",
521 	"_ibd_num_lso_bufs",
522 	"_ibd_rc_enable_srq",
523 	"_ibd_rc_num_rwqe",
524 	"_ibd_rc_num_srq",
525 	"_ibd_rc_num_swqe",
526 	"_ibd_rc_rx_comp_count",
527 	"_ibd_rc_rx_comp_usec",
528 	"_ibd_rc_rx_copy_thresh",
529 	"_ibd_rc_rx_rwqe_thresh",
530 	"_ibd_rc_tx_comp_count",
531 	"_ibd_rc_tx_comp_usec",
532 	"_ibd_rc_tx_copy_thresh",
533 	"_ibd_ud_num_rwqe",
534 	"_ibd_ud_num_swqe",
535 	"_ibd_ud_rx_comp_count",
536 	"_ibd_ud_rx_comp_usec",
537 	"_ibd_ud_tx_comp_count",
538 	"_ibd_ud_tx_comp_usec",
539 	"_ibd_ud_tx_copy_thresh",
540 	NULL
541 };
542 
543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
546 
547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
548 	{IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
549 	    ibd_create_partition, secpolicy_dl_config},
550 	{IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
551 	    ibd_delete_partition, secpolicy_dl_config},
552 	{IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
553 	    ibd_get_partition_info, NULL}
554 };
555 
556 /*
557  * Fill/clear <scope> and <p_key> in multicast/broadcast address
558  */
559 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
560 {							\
561 	*(uint32_t *)((char *)(maddr) + 4) |=		\
562 	    htonl((uint32_t)(scope) << 16);		\
563 	*(uint32_t *)((char *)(maddr) + 8) |=		\
564 	    htonl((uint32_t)(pkey) << 16);		\
565 }
566 
567 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
568 {							\
569 	*(uint32_t *)((char *)(maddr) + 4) &=		\
570 	    htonl(~((uint32_t)0xF << 16));		\
571 	*(uint32_t *)((char *)(maddr) + 8) &=		\
572 	    htonl(~((uint32_t)0xFFFF << 16));		\
573 }
574 
575 /*
576  * Rudimentary debugging support
577  */
578 #ifdef DEBUG
579 int ibd_debuglevel = 100;
580 void
581 debug_print(int l, char *fmt, ...)
582 {
583 	va_list ap;
584 
585 	if (l < ibd_debuglevel)
586 		return;
587 	va_start(ap, fmt);
588 	vcmn_err(CE_CONT, fmt, ap);
589 	va_end(ap);
590 }
591 #endif
592 
593 /*
594  * Common routine to print warning messages; adds in hca guid, port number
595  * and pkey to be able to identify the IBA interface.
596  */
597 void
598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
599 {
600 	ib_guid_t hca_guid;
601 	char ibd_print_buf[MAXNAMELEN + 256];
602 	int len;
603 	va_list ap;
604 	char part_name[MAXNAMELEN];
605 	datalink_id_t linkid = state->id_plinkid;
606 
607 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
608 	    0, "hca-guid", 0);
609 	(void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
610 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
611 	    "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
612 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
613 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
614 	    part_name);
615 	va_start(ap, fmt);
616 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
617 	    fmt, ap);
618 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
619 	va_end(ap);
620 }
621 
622 /*
623  * Warlock directives
624  */
625 
626 /*
627  * id_lso_lock
628  *
629  * state->id_lso->bkt_nfree may be accessed without a lock to
630  * determine the threshold at which we have to ask the nw layer
631  * to resume transmission (see ibd_resume_transmission()).
632  */
633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
634     ibd_state_t::id_lso))
635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
638 
639 /*
640  * id_scq_poll_lock
641  */
642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
643     ibd_state_t::id_scq_poll_busy))
644 
645 /*
646  * id_txpost_lock
647  */
648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
649     ibd_state_t::id_tx_head))
650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
651     ibd_state_t::id_tx_busy))
652 
653 /*
654  * id_acache_req_lock
655  */
656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
657     ibd_state_t::id_acache_req_cv))
658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
659     ibd_state_t::id_req_list))
660 _NOTE(SCHEME_PROTECTS_DATA("atomic",
661     ibd_acache_s::ac_ref))
662 
663 /*
664  * id_ac_mutex
665  *
666  * This mutex is actually supposed to protect id_ah_op as well,
667  * but this path of the code isn't clean (see update of id_ah_op
668  * in ibd_async_acache(), immediately after the call to
669  * ibd_async_mcache()). For now, we'll skip this check by
670  * declaring that id_ah_op is protected by some internal scheme
671  * that warlock isn't aware of.
672  */
673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
674     ibd_state_t::id_ah_active))
675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
676     ibd_state_t::id_ah_free))
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
678     ibd_state_t::id_ah_addr))
679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
680     ibd_state_t::id_ah_op))
681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
682     ibd_state_t::id_ah_error))
683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
684     ibd_state_t::id_ac_hot_ace))
685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
686 
687 /*
688  * id_mc_mutex
689  */
690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
691     ibd_state_t::id_mc_full))
692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
693     ibd_state_t::id_mc_non))
694 
695 /*
696  * id_trap_lock
697  */
698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
699     ibd_state_t::id_trap_cv))
700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
701     ibd_state_t::id_trap_stop))
702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
703     ibd_state_t::id_trap_inprog))
704 
705 /*
706  * id_prom_op
707  */
708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
709     ibd_state_t::id_prom_op))
710 
711 /*
712  * id_sched_lock
713  */
714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
715     ibd_state_t::id_sched_needed))
716 
717 /*
718  * id_link_mutex
719  */
720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
721     ibd_state_t::id_link_state))
722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
724     ibd_state_t::id_link_speed))
725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
726 
727 /*
728  * id_tx_list.dl_mutex
729  */
730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
731     ibd_state_t::id_tx_list.dl_head))
732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
733     ibd_state_t::id_tx_list.dl_pending_sends))
734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
735     ibd_state_t::id_tx_list.dl_cnt))
736 
737 /*
738  * id_rx_list.dl_mutex
739  */
740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
741     ibd_state_t::id_rx_list.dl_bufs_outstanding))
742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
743     ibd_state_t::id_rx_list.dl_cnt))
744 
745 /*
746  * rc_timeout_lock
747  */
748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
749     ibd_state_t::rc_timeout_start))
750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
751     ibd_state_t::rc_timeout))
752 
753 
754 /*
755  * Items protected by atomic updates
756  */
757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
758     ibd_state_s::id_brd_rcv
759     ibd_state_s::id_brd_xmt
760     ibd_state_s::id_multi_rcv
761     ibd_state_s::id_multi_xmt
762     ibd_state_s::id_num_intrs
763     ibd_state_s::id_rcv_bytes
764     ibd_state_s::id_rcv_pkt
765     ibd_state_s::id_rx_post_queue_index
766     ibd_state_s::id_tx_short
767     ibd_state_s::id_xmt_bytes
768     ibd_state_s::id_xmt_pkt
769     ibd_state_s::rc_rcv_trans_byte
770     ibd_state_s::rc_rcv_trans_pkt
771     ibd_state_s::rc_rcv_copy_byte
772     ibd_state_s::rc_rcv_copy_pkt
773     ibd_state_s::rc_xmt_bytes
774     ibd_state_s::rc_xmt_small_pkt
775     ibd_state_s::rc_xmt_fragmented_pkt
776     ibd_state_s::rc_xmt_map_fail_pkt
777     ibd_state_s::rc_xmt_map_succ_pkt
778     ibd_rc_chan_s::rcq_invoking))
779 
780 /*
781  * Non-mutex protection schemes for data elements. Almost all of
782  * these are non-shared items.
783  */
784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
785     callb_cpr
786     ib_gid_s
787     ib_header_info
788     ibd_acache_rq
789     ibd_acache_s::ac_mce
790     ibd_acache_s::ac_chan
791     ibd_mcache::mc_fullreap
792     ibd_mcache::mc_jstate
793     ibd_mcache::mc_req
794     ibd_rwqe_s
795     ibd_swqe_s
796     ibd_wqe_s
797     ibt_wr_ds_s::ds_va
798     ibt_wr_lso_s
799     ipoib_mac::ipoib_qpn
800     mac_capab_lso_s
801     msgb::b_next
802     msgb::b_cont
803     msgb::b_rptr
804     msgb::b_wptr
805     ibd_state_s::id_bgroup_created
806     ibd_state_s::id_mac_state
807     ibd_state_s::id_mtu
808     ibd_state_s::id_ud_num_rwqe
809     ibd_state_s::id_ud_num_swqe
810     ibd_state_s::id_qpnum
811     ibd_state_s::id_rcq_hdl
812     ibd_state_s::id_rx_buf_sz
813     ibd_state_s::id_rx_bufs
814     ibd_state_s::id_rx_mr_hdl
815     ibd_state_s::id_rx_wqes
816     ibd_state_s::id_rxwcs
817     ibd_state_s::id_rxwcs_size
818     ibd_state_s::id_rx_nqueues
819     ibd_state_s::id_rx_queues
820     ibd_state_s::id_scope
821     ibd_state_s::id_scq_hdl
822     ibd_state_s::id_tx_buf_sz
823     ibd_state_s::id_tx_bufs
824     ibd_state_s::id_tx_mr_hdl
825     ibd_state_s::id_tx_rel_list.dl_cnt
826     ibd_state_s::id_tx_wqes
827     ibd_state_s::id_txwcs
828     ibd_state_s::id_txwcs_size
829     ibd_state_s::rc_listen_hdl
830     ibd_state_s::rc_listen_hdl_OFED_interop
831     ibd_state_s::rc_srq_size
832     ibd_state_s::rc_srq_rwqes
833     ibd_state_s::rc_srq_rx_bufs
834     ibd_state_s::rc_srq_rx_mr_hdl
835     ibd_state_s::rc_tx_largebuf_desc_base
836     ibd_state_s::rc_tx_mr_bufs
837     ibd_state_s::rc_tx_mr_hdl
838     ipha_s
839     icmph_s
840     ibt_path_info_s::pi_sid
841     ibd_rc_chan_s::ace
842     ibd_rc_chan_s::chan_hdl
843     ibd_rc_chan_s::state
844     ibd_rc_chan_s::chan_state
845     ibd_rc_chan_s::is_tx_chan
846     ibd_rc_chan_s::rcq_hdl
847     ibd_rc_chan_s::rcq_size
848     ibd_rc_chan_s::scq_hdl
849     ibd_rc_chan_s::scq_size
850     ibd_rc_chan_s::rx_bufs
851     ibd_rc_chan_s::rx_mr_hdl
852     ibd_rc_chan_s::rx_rwqes
853     ibd_rc_chan_s::tx_wqes
854     ibd_rc_chan_s::tx_mr_bufs
855     ibd_rc_chan_s::tx_mr_hdl
856     ibd_rc_chan_s::tx_rel_list.dl_cnt
857     ibd_rc_chan_s::is_used
858     ibd_rc_tx_largebuf_s::lb_buf
859     ibd_rc_msg_hello_s
860     ibt_cm_return_args_s))
861 
862 /*
863  * ibd_rc_chan_s::next is protected by two mutexes:
864  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
865  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
866  */
867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
868     ibd_rc_chan_s::next))
869 
870 /*
871  * ibd_state_s.rc_tx_large_bufs_lock
872  */
873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
874     ibd_state_s::rc_tx_largebuf_free_head))
875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
876     ibd_state_s::rc_tx_largebuf_nfree))
877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
878     ibd_rc_tx_largebuf_s::lb_next))
879 
880 /*
881  * ibd_acache_s.tx_too_big_mutex
882  */
883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
884     ibd_acache_s::tx_too_big_ongoing))
885 
886 /*
887  * tx_wqe_list.dl_mutex
888  */
889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
890     ibd_rc_chan_s::tx_wqe_list.dl_head))
891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
892     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
894     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
895 
896 /*
897  * ibd_state_s.rc_ace_recycle_lock
898  */
899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
900     ibd_state_s::rc_ace_recycle))
901 
902 /*
903  * rc_srq_rwqe_list.dl_mutex
904  */
905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
906     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
908     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
909 
910 /*
911  * Non-mutex protection schemes for data elements. They are counters
912  * for problem diagnosis. Don't need be protected.
913  */
914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
915     ibd_state_s::rc_rcv_alloc_fail
916     ibd_state_s::rc_rcq_err
917     ibd_state_s::rc_ace_not_found
918     ibd_state_s::rc_xmt_drop_too_long_pkt
919     ibd_state_s::rc_xmt_icmp_too_long_pkt
920     ibd_state_s::rc_xmt_reenter_too_long_pkt
921     ibd_state_s::rc_swqe_short
922     ibd_state_s::rc_swqe_mac_update
923     ibd_state_s::rc_xmt_buf_short
924     ibd_state_s::rc_xmt_buf_mac_update
925     ibd_state_s::rc_scq_no_swqe
926     ibd_state_s::rc_scq_no_largebuf
927     ibd_state_s::rc_conn_succ
928     ibd_state_s::rc_conn_fail
929     ibd_state_s::rc_null_conn
930     ibd_state_s::rc_no_estab_conn
931     ibd_state_s::rc_act_close
932     ibd_state_s::rc_pas_close
933     ibd_state_s::rc_delay_ace_recycle
934     ibd_state_s::rc_act_close_simultaneous
935     ibd_state_s::rc_act_close_not_clean
936     ibd_state_s::rc_pas_close_rcq_invoking
937     ibd_state_s::rc_reset_cnt
938     ibd_state_s::rc_timeout_act
939     ibd_state_s::rc_timeout_pas
940     ibd_state_s::rc_stop_connect))
941 
942 #ifdef DEBUG
943 /*
944  * Non-mutex protection schemes for data elements. They are counters
945  * for problem diagnosis. Don't need be protected.
946  */
947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
948     ibd_state_s::rc_rwqe_short
949     ibd_rc_stat_s::rc_rcv_trans_byte
950     ibd_rc_stat_s::rc_rcv_trans_pkt
951     ibd_rc_stat_s::rc_rcv_copy_byte
952     ibd_rc_stat_s::rc_rcv_copy_pkt
953     ibd_rc_stat_s::rc_rcv_alloc_fail
954     ibd_rc_stat_s::rc_rcq_err
955     ibd_rc_stat_s::rc_rwqe_short
956     ibd_rc_stat_s::rc_xmt_bytes
957     ibd_rc_stat_s::rc_xmt_small_pkt
958     ibd_rc_stat_s::rc_xmt_fragmented_pkt
959     ibd_rc_stat_s::rc_xmt_map_fail_pkt
960     ibd_rc_stat_s::rc_xmt_map_succ_pkt
961     ibd_rc_stat_s::rc_ace_not_found
962     ibd_rc_stat_s::rc_scq_no_swqe
963     ibd_rc_stat_s::rc_scq_no_largebuf
964     ibd_rc_stat_s::rc_swqe_short
965     ibd_rc_stat_s::rc_swqe_mac_update
966     ibd_rc_stat_s::rc_xmt_buf_short
967     ibd_rc_stat_s::rc_xmt_buf_mac_update
968     ibd_rc_stat_s::rc_conn_succ
969     ibd_rc_stat_s::rc_conn_fail
970     ibd_rc_stat_s::rc_null_conn
971     ibd_rc_stat_s::rc_no_estab_conn
972     ibd_rc_stat_s::rc_act_close
973     ibd_rc_stat_s::rc_pas_close
974     ibd_rc_stat_s::rc_delay_ace_recycle
975     ibd_rc_stat_s::rc_act_close_simultaneous
976     ibd_rc_stat_s::rc_reset_cnt
977     ibd_rc_stat_s::rc_timeout_act
978     ibd_rc_stat_s::rc_timeout_pas))
979 #endif
980 
981 int
982 _init()
983 {
984 	int status;
985 
986 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
987 	    PAGESIZE), 0);
988 	if (status != 0) {
989 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
990 		return (status);
991 	}
992 
993 	mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
994 
995 	mac_init_ops(&ibd_dev_ops, "ibp");
996 	status = mod_install(&ibd_modlinkage);
997 	if (status != 0) {
998 		DPRINT(10, "_init:failed in mod_install()");
999 		ddi_soft_state_fini(&ibd_list);
1000 		mac_fini_ops(&ibd_dev_ops);
1001 		return (status);
1002 	}
1003 
1004 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1005 	mutex_enter(&ibd_gstate.ig_mutex);
1006 	ibd_gstate.ig_ibt_hdl = NULL;
1007 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1008 	ibd_gstate.ig_service_list = NULL;
1009 	mutex_exit(&ibd_gstate.ig_mutex);
1010 
1011 	if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1012 	    DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1013 		return (EIO);
1014 	}
1015 
1016 	ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1017 
1018 #ifdef IBD_LOGGING
1019 	ibd_log_init();
1020 #endif
1021 	return (0);
1022 }
1023 
1024 int
1025 _info(struct modinfo *modinfop)
1026 {
1027 	return (mod_info(&ibd_modlinkage, modinfop));
1028 }
1029 
1030 int
1031 _fini()
1032 {
1033 	int status;
1034 
1035 	status = mod_remove(&ibd_modlinkage);
1036 	if (status != 0)
1037 		return (status);
1038 
1039 	ibt_unregister_part_attr_cb();
1040 
1041 	mac_fini_ops(&ibd_dev_ops);
1042 	mutex_destroy(&ibd_objlist_lock);
1043 	ddi_soft_state_fini(&ibd_list);
1044 	mutex_destroy(&ibd_gstate.ig_mutex);
1045 #ifdef IBD_LOGGING
1046 	ibd_log_fini();
1047 #endif
1048 	return (0);
1049 }
1050 
1051 /*
1052  * Convert the GID part of the mac address from network byte order
1053  * to host order.
1054  */
1055 static void
1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1057 {
1058 	ib_sn_prefix_t nbopref;
1059 	ib_guid_t nboguid;
1060 
1061 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1062 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1063 	dgid->gid_prefix = b2h64(nbopref);
1064 	dgid->gid_guid = b2h64(nboguid);
1065 }
1066 
1067 /*
1068  * Create the IPoIB address in network byte order from host order inputs.
1069  */
1070 static void
1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1072     ib_guid_t guid)
1073 {
1074 	ib_sn_prefix_t nbopref;
1075 	ib_guid_t nboguid;
1076 
1077 	mac->ipoib_qpn = htonl(qpn);
1078 	nbopref = h2b64(prefix);
1079 	nboguid = h2b64(guid);
1080 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1081 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1082 }
1083 
1084 /*
1085  * Send to the appropriate all-routers group when the IBA multicast group
1086  * does not exist, based on whether the target group is v4 or v6.
1087  */
1088 static boolean_t
1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1090     ipoib_mac_t *rmac)
1091 {
1092 	boolean_t retval = B_TRUE;
1093 	uint32_t adjscope = state->id_scope << 16;
1094 	uint32_t topword;
1095 
1096 	/*
1097 	 * Copy the first 4 bytes in without assuming any alignment of
1098 	 * input mac address; this will have IPoIB signature, flags and
1099 	 * scope bits.
1100 	 */
1101 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1102 	topword = ntohl(topword);
1103 
1104 	/*
1105 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1106 	 */
1107 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1108 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1109 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1110 		    ((uint32_t)(state->id_pkey << 16))),
1111 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1112 	else
1113 		/*
1114 		 * Does not have proper bits in the mgid address.
1115 		 */
1116 		retval = B_FALSE;
1117 
1118 	return (retval);
1119 }
1120 
1121 /*
1122  * Membership states for different mcg's are tracked by two lists:
1123  * the "non" list is used for promiscuous mode, when all mcg traffic
1124  * needs to be inspected. This type of membership is never used for
1125  * transmission, so there can not be an AH in the active list
1126  * corresponding to a member in this list. This list does not need
1127  * any protection, since all operations are performed by the async
1128  * thread.
1129  *
1130  * "Full" and "SendOnly" membership is tracked using a single list,
1131  * the "full" list. This is because this single list can then be
1132  * searched during transmit to a multicast group (if an AH for the
1133  * mcg is not found in the active list), since at least one type
1134  * of membership must be present before initiating the transmit.
1135  * This list is also emptied during driver detach, since sendonly
1136  * membership acquired during transmit is dropped at detach time
1137  * along with ipv4 broadcast full membership. Insert/deletes to
1138  * this list are done only by the async thread, but it is also
1139  * searched in program context (see multicast disable case), thus
1140  * the id_mc_mutex protects the list. The driver detach path also
1141  * deconstructs the "full" list, but it ensures that the async
1142  * thread will not be accessing the list (by blocking out mcg
1143  * trap handling and making sure no more Tx reaping will happen).
1144  *
1145  * Currently, an IBA attach is done in the SendOnly case too,
1146  * although this is not required.
1147  */
1148 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1149 	list_insert_head(&state->id_mc_full, mce)
1150 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1151 	list_insert_head(&state->id_mc_non, mce)
1152 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1153 	ibd_mcache_find(mgid, &state->id_mc_full)
1154 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1155 	ibd_mcache_find(mgid, &state->id_mc_non)
1156 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1157 	list_remove(&state->id_mc_full, mce)
1158 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1159 	list_remove(&state->id_mc_non, mce)
1160 
1161 static void *
1162 list_get_head(list_t *list)
1163 {
1164 	list_node_t *lhead = list_head(list);
1165 
1166 	if (lhead != NULL)
1167 		list_remove(list, lhead);
1168 	return (lhead);
1169 }
1170 
1171 /*
1172  * This is always guaranteed to be able to queue the work.
1173  */
1174 void
1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1176 {
1177 	/* Initialize request */
1178 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1179 	ptr->rq_op = op;
1180 
1181 	/*
1182 	 * Queue provided slot onto request pool.
1183 	 */
1184 	mutex_enter(&state->id_acache_req_lock);
1185 	list_insert_tail(&state->id_req_list, ptr);
1186 
1187 	/* Go, fetch, async thread */
1188 	cv_signal(&state->id_acache_req_cv);
1189 	mutex_exit(&state->id_acache_req_lock);
1190 }
1191 
1192 /*
1193  * Main body of the per interface async thread.
1194  */
1195 static void
1196 ibd_async_work(ibd_state_t *state)
1197 {
1198 	ibd_req_t *ptr;
1199 	callb_cpr_t cprinfo;
1200 
1201 	mutex_enter(&state->id_acache_req_lock);
1202 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1203 	    callb_generic_cpr, "ibd_async_work");
1204 
1205 	for (;;) {
1206 		ptr = list_get_head(&state->id_req_list);
1207 		if (ptr != NULL) {
1208 			mutex_exit(&state->id_acache_req_lock);
1209 
1210 			/*
1211 			 * If we are in late hca initialization mode, do not
1212 			 * process any other async request other than TRAP. TRAP
1213 			 * is used for indicating creation of a broadcast group;
1214 			 * in which case, we need to join/create the group.
1215 			 */
1216 			if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1217 			    (ptr->rq_op != IBD_ASYNC_TRAP)) {
1218 				goto free_req_and_continue;
1219 			}
1220 
1221 			/*
1222 			 * Once we have done the operation, there is no
1223 			 * guarantee the request slot is going to be valid,
1224 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1225 			 * TRAP).
1226 			 *
1227 			 * Perform the request.
1228 			 */
1229 			switch (ptr->rq_op) {
1230 				case IBD_ASYNC_GETAH:
1231 					ibd_async_acache(state, &ptr->rq_mac);
1232 					break;
1233 				case IBD_ASYNC_JOIN:
1234 				case IBD_ASYNC_LEAVE:
1235 					ibd_async_multicast(state,
1236 					    ptr->rq_gid, ptr->rq_op);
1237 					break;
1238 				case IBD_ASYNC_PROMON:
1239 					ibd_async_setprom(state);
1240 					break;
1241 				case IBD_ASYNC_PROMOFF:
1242 					ibd_async_unsetprom(state);
1243 					break;
1244 				case IBD_ASYNC_REAP:
1245 					ibd_async_reap_group(state,
1246 					    ptr->rq_ptr, ptr->rq_gid,
1247 					    IB_MC_JSTATE_FULL);
1248 					/*
1249 					 * the req buf contains in mce
1250 					 * structure, so we do not need
1251 					 * to free it here.
1252 					 */
1253 					ptr = NULL;
1254 					break;
1255 				case IBD_ASYNC_TRAP:
1256 					ibd_async_trap(state, ptr);
1257 					break;
1258 				case IBD_ASYNC_SCHED:
1259 					ibd_async_txsched(state);
1260 					break;
1261 				case IBD_ASYNC_LINK:
1262 					ibd_async_link(state, ptr);
1263 					break;
1264 				case IBD_ASYNC_EXIT:
1265 					mutex_enter(&state->id_acache_req_lock);
1266 #ifndef __lock_lint
1267 					CALLB_CPR_EXIT(&cprinfo);
1268 #else
1269 					mutex_exit(&state->id_acache_req_lock);
1270 #endif
1271 					return;
1272 				case IBD_ASYNC_RC_TOO_BIG:
1273 					ibd_async_rc_process_too_big(state,
1274 					    ptr);
1275 					break;
1276 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1277 					ibd_async_rc_close_act_chan(state, ptr);
1278 					break;
1279 				case IBD_ASYNC_RC_RECYCLE_ACE:
1280 					ibd_async_rc_recycle_ace(state, ptr);
1281 					break;
1282 				case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1283 					(void) ibd_rc_pas_close(ptr->rq_ptr,
1284 					    B_TRUE, B_TRUE);
1285 					break;
1286 			}
1287 free_req_and_continue:
1288 			if (ptr != NULL)
1289 				kmem_cache_free(state->id_req_kmc, ptr);
1290 
1291 			mutex_enter(&state->id_acache_req_lock);
1292 		} else {
1293 #ifndef __lock_lint
1294 			/*
1295 			 * Nothing to do: wait till new request arrives.
1296 			 */
1297 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298 			cv_wait(&state->id_acache_req_cv,
1299 			    &state->id_acache_req_lock);
1300 			CALLB_CPR_SAFE_END(&cprinfo,
1301 			    &state->id_acache_req_lock);
1302 #endif
1303 		}
1304 	}
1305 
1306 	/*NOTREACHED*/
1307 	_NOTE(NOT_REACHED)
1308 }
1309 
1310 /*
1311  * Return when it is safe to queue requests to the async daemon; primarily
1312  * for subnet trap and async event handling. Disallow requests before the
1313  * daemon is created, and when interface deinitilization starts.
1314  */
1315 static boolean_t
1316 ibd_async_safe(ibd_state_t *state)
1317 {
1318 	mutex_enter(&state->id_trap_lock);
1319 	if (state->id_trap_stop) {
1320 		mutex_exit(&state->id_trap_lock);
1321 		return (B_FALSE);
1322 	}
1323 	state->id_trap_inprog++;
1324 	mutex_exit(&state->id_trap_lock);
1325 	return (B_TRUE);
1326 }
1327 
1328 /*
1329  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1330  * trap or event handling to complete to kill the async thread and deconstruct
1331  * the mcg/ace list.
1332  */
1333 static void
1334 ibd_async_done(ibd_state_t *state)
1335 {
1336 	mutex_enter(&state->id_trap_lock);
1337 	if (--state->id_trap_inprog == 0)
1338 		cv_signal(&state->id_trap_cv);
1339 	mutex_exit(&state->id_trap_lock);
1340 }
1341 
1342 /*
1343  * Hash functions:
1344  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1345  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1346  * These operate on mac addresses input into ibd_send, but there is no
1347  * guarantee on the alignment of the ipoib_mac_t structure.
1348  */
1349 /*ARGSUSED*/
1350 static uint_t
1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1352 {
1353 	ulong_t ptraddr = (ulong_t)key;
1354 	uint_t hval;
1355 
1356 	/*
1357 	 * If the input address is 4 byte aligned, we can just dereference
1358 	 * it. This is most common, since IP will send in a 4 byte aligned
1359 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1360 	 * 4 byte aligned too.
1361 	 */
1362 	if ((ptraddr & 3) == 0)
1363 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1364 
1365 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1366 	return (hval);
1367 }
1368 
1369 static int
1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1371 {
1372 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1373 		return (0);
1374 	else
1375 		return (1);
1376 }
1377 
1378 /*
1379  * Initialize all the per interface caches and lists; AH cache,
1380  * MCG list etc.
1381  */
1382 static int
1383 ibd_acache_init(ibd_state_t *state)
1384 {
1385 	ibd_ace_t *ce;
1386 	int i;
1387 
1388 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1389 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1390 	mutex_enter(&state->id_ac_mutex);
1391 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1392 	    offsetof(ibd_ace_t, ac_list));
1393 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1394 	    offsetof(ibd_ace_t, ac_list));
1395 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1396 	    state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1397 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1398 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1399 	    offsetof(ibd_mce_t, mc_list));
1400 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1401 	    offsetof(ibd_mce_t, mc_list));
1402 	state->id_ac_hot_ace = NULL;
1403 
1404 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1405 	    state->id_num_ah, KM_SLEEP);
1406 	for (i = 0; i < state->id_num_ah; i++, ce++) {
1407 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1408 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1409 			mutex_exit(&state->id_ac_mutex);
1410 			ibd_acache_fini(state);
1411 			return (DDI_FAILURE);
1412 		} else {
1413 			CLEAR_REFCYCLE(ce);
1414 			ce->ac_mce = NULL;
1415 			mutex_init(&ce->tx_too_big_mutex, NULL,
1416 			    MUTEX_DRIVER, NULL);
1417 			IBD_ACACHE_INSERT_FREE(state, ce);
1418 		}
1419 	}
1420 	mutex_exit(&state->id_ac_mutex);
1421 	return (DDI_SUCCESS);
1422 }
1423 
1424 static void
1425 ibd_acache_fini(ibd_state_t *state)
1426 {
1427 	ibd_ace_t *ptr;
1428 
1429 	mutex_enter(&state->id_ac_mutex);
1430 
1431 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1432 		ASSERT(GET_REF(ptr) == 0);
1433 		mutex_destroy(&ptr->tx_too_big_mutex);
1434 		(void) ibt_free_ud_dest(ptr->ac_dest);
1435 	}
1436 
1437 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1438 		ASSERT(GET_REF(ptr) == 0);
1439 		mutex_destroy(&ptr->tx_too_big_mutex);
1440 		(void) ibt_free_ud_dest(ptr->ac_dest);
1441 	}
1442 
1443 	list_destroy(&state->id_ah_free);
1444 	list_destroy(&state->id_ah_active);
1445 	list_destroy(&state->id_mc_full);
1446 	list_destroy(&state->id_mc_non);
1447 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1448 	mutex_exit(&state->id_ac_mutex);
1449 	mutex_destroy(&state->id_ac_mutex);
1450 	mutex_destroy(&state->id_mc_mutex);
1451 }
1452 
1453 /*
1454  * Search AH active hash list for a cached path to input destination.
1455  * If we are "just looking", hold == F. When we are in the Tx path,
1456  * we set hold == T to grab a reference on the AH so that it can not
1457  * be recycled to a new destination while the Tx request is posted.
1458  */
1459 ibd_ace_t *
1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1461 {
1462 	ibd_ace_t *ptr;
1463 
1464 	ASSERT(mutex_owned(&state->id_ac_mutex));
1465 
1466 	/*
1467 	 * Do hash search.
1468 	 */
1469 	if (mod_hash_find(state->id_ah_active_hash,
1470 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1471 		if (hold)
1472 			INC_REF(ptr, num);
1473 		return (ptr);
1474 	}
1475 	return (NULL);
1476 }
1477 
1478 /*
1479  * This is called by the tx side; if an initialized AH is found in
1480  * the active list, it is locked down and can be used; if no entry
1481  * is found, an async request is queued to do path resolution.
1482  */
1483 static ibd_ace_t *
1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1485 {
1486 	ibd_ace_t *ptr;
1487 	ibd_req_t *req;
1488 
1489 	/*
1490 	 * Only attempt to print when we can; in the mdt pattr case, the
1491 	 * address is not aligned properly.
1492 	 */
1493 	if (((ulong_t)mac & 3) == 0) {
1494 		DPRINT(4,
1495 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1496 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1497 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1498 		    htonl(mac->ipoib_gidsuff[1]));
1499 	}
1500 
1501 	mutex_enter(&state->id_ac_mutex);
1502 
1503 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1504 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1505 		INC_REF(ptr, numwqe);
1506 		mutex_exit(&state->id_ac_mutex);
1507 		return (ptr);
1508 	}
1509 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1510 		state->id_ac_hot_ace = ptr;
1511 		mutex_exit(&state->id_ac_mutex);
1512 		return (ptr);
1513 	}
1514 
1515 	/*
1516 	 * Implementation of a single outstanding async request; if
1517 	 * the operation is not started yet, queue a request and move
1518 	 * to ongoing state. Remember in id_ah_addr for which address
1519 	 * we are queueing the request, in case we need to flag an error;
1520 	 * Any further requests, for the same or different address, until
1521 	 * the operation completes, is sent back to GLDv3 to be retried.
1522 	 * The async thread will update id_ah_op with an error indication
1523 	 * or will set it to indicate the next look up can start; either
1524 	 * way, it will mac_tx_update() so that all blocked requests come
1525 	 * back here.
1526 	 */
1527 	*err = EAGAIN;
1528 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1529 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1530 		if (req != NULL) {
1531 			/*
1532 			 * We did not even find the entry; queue a request
1533 			 * for it.
1534 			 */
1535 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1536 			state->id_ah_op = IBD_OP_ONGOING;
1537 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1538 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1539 		}
1540 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1541 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1542 		/*
1543 		 * Check the status of the pathrecord lookup request
1544 		 * we had queued before.
1545 		 */
1546 		if (state->id_ah_op == IBD_OP_ERRORED) {
1547 			*err = EFAULT;
1548 			state->id_ah_error++;
1549 		} else {
1550 			/*
1551 			 * IBD_OP_ROUTERED case: We need to send to the
1552 			 * all-router MCG. If we can find the AH for
1553 			 * the mcg, the Tx will be attempted. If we
1554 			 * do not find the AH, we return NORESOURCES
1555 			 * to retry.
1556 			 */
1557 			ipoib_mac_t routermac;
1558 
1559 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1560 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1561 			    numwqe);
1562 		}
1563 		state->id_ah_op = IBD_OP_NOTSTARTED;
1564 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1565 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1566 		/*
1567 		 * This case can happen when we get a higher band
1568 		 * packet. The easiest way is to reset the state machine
1569 		 * to accommodate the higher priority packet.
1570 		 */
1571 		state->id_ah_op = IBD_OP_NOTSTARTED;
1572 	}
1573 	mutex_exit(&state->id_ac_mutex);
1574 
1575 	return (ptr);
1576 }
1577 
1578 /*
1579  * Grab a not-currently-in-use AH/PathRecord from the active
1580  * list to recycle to a new destination. Only the async thread
1581  * executes this code.
1582  */
1583 static ibd_ace_t *
1584 ibd_acache_get_unref(ibd_state_t *state)
1585 {
1586 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1587 	boolean_t try_rc_chan_recycle = B_FALSE;
1588 
1589 	ASSERT(mutex_owned(&state->id_ac_mutex));
1590 
1591 	/*
1592 	 * Do plain linear search.
1593 	 */
1594 	while (ptr != NULL) {
1595 		/*
1596 		 * Note that it is possible that the "cycle" bit
1597 		 * is set on the AH w/o any reference count. The
1598 		 * mcg must have been deleted, and the tx cleanup
1599 		 * just decremented the reference count to 0, but
1600 		 * hasn't gotten around to grabbing the id_ac_mutex
1601 		 * to move the AH into the free list.
1602 		 */
1603 		if (GET_REF(ptr) == 0) {
1604 			if (ptr->ac_chan != NULL) {
1605 				ASSERT(state->id_enable_rc == B_TRUE);
1606 				if (!try_rc_chan_recycle) {
1607 					try_rc_chan_recycle = B_TRUE;
1608 					ibd_rc_signal_ace_recycle(state, ptr);
1609 				}
1610 			} else {
1611 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1612 				break;
1613 			}
1614 		}
1615 		ptr = list_prev(&state->id_ah_active, ptr);
1616 	}
1617 	return (ptr);
1618 }
1619 
1620 /*
1621  * Invoked to clean up AH from active list in case of multicast
1622  * disable and to handle sendonly memberships during mcg traps.
1623  * And for port up processing for multicast and unicast AHs.
1624  * Normally, the AH is taken off the active list, and put into
1625  * the free list to be recycled for a new destination. In case
1626  * Tx requests on the AH have not completed yet, the AH is marked
1627  * for reaping (which will put the AH on the free list) once the Tx's
1628  * complete; in this case, depending on the "force" input, we take
1629  * out the AH from the active list right now, or leave it also for
1630  * the reap operation. Returns TRUE if the AH is taken off the active
1631  * list (and either put into the free list right now, or arranged for
1632  * later), FALSE otherwise.
1633  */
1634 boolean_t
1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1636 {
1637 	ibd_ace_t *acactive;
1638 	boolean_t ret = B_TRUE;
1639 
1640 	ASSERT(mutex_owned(&state->id_ac_mutex));
1641 
1642 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1643 
1644 		/*
1645 		 * Note that the AH might already have the cycle bit set
1646 		 * on it; this might happen if sequences of multicast
1647 		 * enables and disables are coming so fast, that posted
1648 		 * Tx's to the mcg have not completed yet, and the cycle
1649 		 * bit is set successively by each multicast disable.
1650 		 */
1651 		if (SET_CYCLE_IF_REF(acactive)) {
1652 			if (!force) {
1653 				/*
1654 				 * The ace is kept on the active list, further
1655 				 * Tx's can still grab a reference on it; the
1656 				 * ace is reaped when all pending Tx's
1657 				 * referencing the AH complete.
1658 				 */
1659 				ret = B_FALSE;
1660 			} else {
1661 				/*
1662 				 * In the mcg trap case, we always pull the
1663 				 * AH from the active list. And also the port
1664 				 * up multi/unicast case.
1665 				 */
1666 				ASSERT(acactive->ac_chan == NULL);
1667 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1668 				acactive->ac_mce = NULL;
1669 			}
1670 		} else {
1671 			/*
1672 			 * Determined the ref count is 0, thus reclaim
1673 			 * immediately after pulling out the ace from
1674 			 * the active list.
1675 			 */
1676 			ASSERT(acactive->ac_chan == NULL);
1677 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1678 			acactive->ac_mce = NULL;
1679 			IBD_ACACHE_INSERT_FREE(state, acactive);
1680 		}
1681 
1682 	}
1683 	return (ret);
1684 }
1685 
1686 /*
1687  * Helper function for async path record lookup. If we are trying to
1688  * Tx to a MCG, check our membership, possibly trying to join the
1689  * group if required. If that fails, try to send the packet to the
1690  * all router group (indicated by the redirect output), pointing
1691  * the input mac address to the router mcg address.
1692  */
1693 static ibd_mce_t *
1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1695 {
1696 	ib_gid_t mgid;
1697 	ibd_mce_t *mce;
1698 	ipoib_mac_t routermac;
1699 
1700 	*redirect = B_FALSE;
1701 	ibd_n2h_gid(mac, &mgid);
1702 
1703 	/*
1704 	 * Check the FullMember+SendOnlyNonMember list.
1705 	 * Since we are the only one who manipulates the
1706 	 * id_mc_full list, no locks are needed.
1707 	 */
1708 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1709 	if (mce != NULL) {
1710 		DPRINT(4, "ibd_async_mcache : already joined to group");
1711 		return (mce);
1712 	}
1713 
1714 	/*
1715 	 * Not found; try to join(SendOnlyNonMember) and attach.
1716 	 */
1717 	DPRINT(4, "ibd_async_mcache : not joined to group");
1718 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1719 	    NULL) {
1720 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1721 		return (mce);
1722 	}
1723 
1724 	/*
1725 	 * MCGroup not present; try to join the all-router group. If
1726 	 * any of the following steps succeed, we will be redirecting
1727 	 * to the all router group.
1728 	 */
1729 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1730 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1731 		return (NULL);
1732 	*redirect = B_TRUE;
1733 	ibd_n2h_gid(&routermac, &mgid);
1734 	bcopy(&routermac, mac, IPOIB_ADDRL);
1735 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1736 	    mgid.gid_prefix, mgid.gid_guid);
1737 
1738 	/*
1739 	 * Are we already joined to the router group?
1740 	 */
1741 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1742 		DPRINT(4, "ibd_async_mcache : using already joined router"
1743 		    "group\n");
1744 		return (mce);
1745 	}
1746 
1747 	/*
1748 	 * Can we join(SendOnlyNonMember) the router group?
1749 	 */
1750 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1751 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1752 	    NULL) {
1753 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1754 		return (mce);
1755 	}
1756 
1757 	return (NULL);
1758 }
1759 
1760 /*
1761  * Async path record lookup code.
1762  */
1763 static void
1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1765 {
1766 	ibd_ace_t *ce;
1767 	ibd_mce_t *mce = NULL;
1768 	ibt_path_attr_t path_attr;
1769 	ibt_path_info_t path_info;
1770 	ib_gid_t destgid;
1771 	char ret = IBD_OP_NOTSTARTED;
1772 
1773 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1774 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1775 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1776 	    htonl(mac->ipoib_gidsuff[1]));
1777 
1778 	/*
1779 	 * Check whether we are trying to transmit to a MCG.
1780 	 * In that case, we need to make sure we are a member of
1781 	 * the MCG.
1782 	 */
1783 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1784 		boolean_t redirected;
1785 
1786 		/*
1787 		 * If we can not find or join the group or even
1788 		 * redirect, error out.
1789 		 */
1790 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1791 		    NULL) {
1792 			state->id_ah_op = IBD_OP_ERRORED;
1793 			return;
1794 		}
1795 
1796 		/*
1797 		 * If we got redirected, we need to determine whether
1798 		 * the AH for the new mcg is in the cache already, and
1799 		 * not pull it in then; otherwise proceed to get the
1800 		 * path for the new mcg. There is no guarantee that
1801 		 * if the AH is currently in the cache, it will still be
1802 		 * there when we look in ibd_acache_lookup(), but that's
1803 		 * okay, we will come back here.
1804 		 */
1805 		if (redirected) {
1806 			ret = IBD_OP_ROUTERED;
1807 			DPRINT(4, "ibd_async_acache :  redirected to "
1808 			    "%08X:%08X:%08X:%08X:%08X",
1809 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1810 			    htonl(mac->ipoib_gidpref[1]),
1811 			    htonl(mac->ipoib_gidsuff[0]),
1812 			    htonl(mac->ipoib_gidsuff[1]));
1813 
1814 			mutex_enter(&state->id_ac_mutex);
1815 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1816 				state->id_ah_op = IBD_OP_ROUTERED;
1817 				mutex_exit(&state->id_ac_mutex);
1818 				DPRINT(4, "ibd_async_acache : router AH found");
1819 				return;
1820 			}
1821 			mutex_exit(&state->id_ac_mutex);
1822 		}
1823 	}
1824 
1825 	/*
1826 	 * Get an AH from the free list.
1827 	 */
1828 	mutex_enter(&state->id_ac_mutex);
1829 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1830 		/*
1831 		 * No free ones; try to grab an unreferenced active
1832 		 * one. Maybe we need to make the active list LRU,
1833 		 * but that will create more work for Tx callbacks.
1834 		 * Is there a way of not having to pull out the
1835 		 * entry from the active list, but just indicate it
1836 		 * is being recycled? Yes, but that creates one more
1837 		 * check in the fast lookup path.
1838 		 */
1839 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1840 			/*
1841 			 * Pretty serious shortage now.
1842 			 */
1843 			state->id_ah_op = IBD_OP_NOTSTARTED;
1844 			mutex_exit(&state->id_ac_mutex);
1845 			DPRINT(10, "ibd_async_acache : failed to find AH "
1846 			    "slot\n");
1847 			return;
1848 		}
1849 		/*
1850 		 * We could check whether ac_mce points to a SendOnly
1851 		 * member and drop that membership now. Or do it lazily
1852 		 * at detach time.
1853 		 */
1854 		ce->ac_mce = NULL;
1855 	}
1856 	mutex_exit(&state->id_ac_mutex);
1857 	ASSERT(ce->ac_mce == NULL);
1858 
1859 	/*
1860 	 * Update the entry.
1861 	 */
1862 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1863 
1864 	bzero(&path_info, sizeof (path_info));
1865 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1866 	path_attr.pa_sgid = state->id_sgid;
1867 	path_attr.pa_num_dgids = 1;
1868 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1869 	path_attr.pa_dgids = &destgid;
1870 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1871 	path_attr.pa_pkey = state->id_pkey;
1872 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1873 	    &path_info, NULL) != IBT_SUCCESS) {
1874 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1875 		goto error;
1876 	}
1877 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1878 	    ntohl(ce->ac_mac.ipoib_qpn),
1879 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1880 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1881 		goto error;
1882 	}
1883 
1884 	/*
1885 	 * mce is set whenever an AH is being associated with a
1886 	 * MCG; this will come in handy when we leave the MCG. The
1887 	 * lock protects Tx fastpath from scanning the active list.
1888 	 */
1889 	if (mce != NULL)
1890 		ce->ac_mce = mce;
1891 
1892 	/*
1893 	 * initiate a RC mode connection for unicast address
1894 	 */
1895 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1896 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1897 		ASSERT(ce->ac_chan == NULL);
1898 		DPRINT(10, "ibd_async_acache: call "
1899 		    "ibd_rc_try_connect(ace=%p)", ce);
1900 		ibd_rc_try_connect(state, ce, &path_info);
1901 		if (ce->ac_chan == NULL) {
1902 			DPRINT(10, "ibd_async_acache: fail to setup RC"
1903 			    " channel");
1904 			state->rc_conn_fail++;
1905 			goto error;
1906 		}
1907 	}
1908 
1909 	mutex_enter(&state->id_ac_mutex);
1910 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1911 	state->id_ah_op = ret;
1912 	mutex_exit(&state->id_ac_mutex);
1913 	return;
1914 error:
1915 	/*
1916 	 * We might want to drop SendOnly membership here if we
1917 	 * joined above. The lock protects Tx callbacks inserting
1918 	 * into the free list.
1919 	 */
1920 	mutex_enter(&state->id_ac_mutex);
1921 	state->id_ah_op = IBD_OP_ERRORED;
1922 	IBD_ACACHE_INSERT_FREE(state, ce);
1923 	mutex_exit(&state->id_ac_mutex);
1924 }
1925 
1926 /*
1927  * While restoring port's presence on the subnet on a port up, it is possible
1928  * that the port goes down again.
1929  */
1930 static void
1931 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1932 {
1933 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1934 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1935 	    LINK_STATE_UP;
1936 	ibd_mce_t *mce, *pmce;
1937 	ibd_ace_t *ace, *pace;
1938 
1939 	DPRINT(10, "ibd_async_link(): %d", opcode);
1940 
1941 	/*
1942 	 * On a link up, revalidate the link speed/width. No point doing
1943 	 * this on a link down, since we will be unable to do SA operations,
1944 	 * defaulting to the lowest speed. Also notice that we update our
1945 	 * notion of speed before calling mac_link_update(), which will do
1946 	 * necessary higher level notifications for speed changes.
1947 	 */
1948 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1949 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1950 		state->id_link_speed = ibd_get_portspeed(state);
1951 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1952 	}
1953 
1954 	/*
1955 	 * Do all the work required to establish our presence on
1956 	 * the subnet.
1957 	 */
1958 	if (opcode == IBD_LINK_UP_ABSENT) {
1959 		/*
1960 		 * If in promiscuous mode ...
1961 		 */
1962 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1963 			/*
1964 			 * Drop all nonmembership.
1965 			 */
1966 			ibd_async_unsetprom(state);
1967 
1968 			/*
1969 			 * Then, try to regain nonmembership to all mcg's.
1970 			 */
1971 			ibd_async_setprom(state);
1972 
1973 		}
1974 
1975 		/*
1976 		 * Drop all sendonly membership (which also gets rid of the
1977 		 * AHs); try to reacquire all full membership.
1978 		 */
1979 		mce = list_head(&state->id_mc_full);
1980 		while ((pmce = mce) != NULL) {
1981 			mce = list_next(&state->id_mc_full, mce);
1982 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1983 				ibd_leave_group(state,
1984 				    pmce->mc_info.mc_adds_vect.av_dgid,
1985 				    IB_MC_JSTATE_SEND_ONLY_NON);
1986 			else
1987 				ibd_reacquire_group(state, pmce);
1988 		}
1989 
1990 		/*
1991 		 * Recycle all active AHs to free list (and if there are
1992 		 * pending posts, make sure they will go into the free list
1993 		 * once the Tx's complete). Grab the lock to prevent
1994 		 * concurrent Tx's as well as Tx cleanups.
1995 		 */
1996 		mutex_enter(&state->id_ac_mutex);
1997 		ace = list_head(&state->id_ah_active);
1998 		while ((pace = ace) != NULL) {
1999 			boolean_t cycled;
2000 
2001 			ace = list_next(&state->id_ah_active, ace);
2002 			mce = pace->ac_mce;
2003 			if (pace->ac_chan != NULL) {
2004 				ASSERT(mce == NULL);
2005 				ASSERT(state->id_enable_rc == B_TRUE);
2006 				if (pace->ac_chan->chan_state ==
2007 				    IBD_RC_STATE_ACT_ESTAB) {
2008 					INC_REF(pace, 1);
2009 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2010 					pace->ac_chan->chan_state =
2011 					    IBD_RC_STATE_ACT_CLOSING;
2012 					ibd_rc_signal_act_close(state, pace);
2013 				} else {
2014 					state->rc_act_close_simultaneous++;
2015 					DPRINT(40, "ibd_async_link: other "
2016 					    "thread is closing it, ace=%p, "
2017 					    "ac_chan=%p, chan_state=%d",
2018 					    pace, pace->ac_chan,
2019 					    pace->ac_chan->chan_state);
2020 				}
2021 			} else {
2022 				cycled = ibd_acache_recycle(state,
2023 				    &pace->ac_mac, B_TRUE);
2024 			}
2025 			/*
2026 			 * If this is for an mcg, it must be for a fullmember,
2027 			 * since we got rid of send-only members above when
2028 			 * processing the mce list.
2029 			 */
2030 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2031 			    IB_MC_JSTATE_FULL)));
2032 
2033 			/*
2034 			 * Check if the fullmember mce needs to be torn down,
2035 			 * ie whether the DLPI disable has already been done.
2036 			 * If so, do some of the work of tx_cleanup, namely
2037 			 * causing leave (which will fail), detach and
2038 			 * mce-freeing. tx_cleanup will put the AH into free
2039 			 * list. The reason to duplicate some of this
2040 			 * tx_cleanup work is because we want to delete the
2041 			 * AH right now instead of waiting for tx_cleanup, to
2042 			 * force subsequent Tx's to reacquire an AH.
2043 			 */
2044 			if ((mce != NULL) && (mce->mc_fullreap))
2045 				ibd_async_reap_group(state, mce,
2046 				    mce->mc_info.mc_adds_vect.av_dgid,
2047 				    mce->mc_jstate);
2048 		}
2049 		mutex_exit(&state->id_ac_mutex);
2050 	}
2051 
2052 	/*
2053 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2054 	 * (which stops further events from being delivered) before
2055 	 * mac_unregister(). At this point, it is guaranteed that mac_register
2056 	 * has already been done.
2057 	 */
2058 	mutex_enter(&state->id_link_mutex);
2059 	state->id_link_state = lstate;
2060 	mac_link_update(state->id_mh, lstate);
2061 	mutex_exit(&state->id_link_mutex);
2062 
2063 	ibd_async_done(state);
2064 }
2065 
2066 /*
2067  * Check the pkey table to see if we can find the pkey we're looking for.
2068  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2069  * failure.
2070  */
2071 static int
2072 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2073     uint16_t *pkix)
2074 {
2075 	uint16_t ndx;
2076 
2077 	ASSERT(pkix != NULL);
2078 
2079 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2080 		if (pkey_tbl[ndx] == pkey) {
2081 			*pkix = ndx;
2082 			return (0);
2083 		}
2084 	}
2085 	return (-1);
2086 }
2087 
2088 /*
2089  * Late HCA Initialization:
2090  * If plumb had succeeded without the availability of an active port or the
2091  * pkey, and either of their availability is now being indicated via PORT_UP
2092  * or PORT_CHANGE respectively, try a start of the interface.
2093  *
2094  * Normal Operation:
2095  * When the link is notified up, we need to do a few things, based
2096  * on the port's current p_init_type_reply claiming a reinit has been
2097  * done or not. The reinit steps are:
2098  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2099  *    the old Pkey and GID0 are correct.
2100  * 2. Register for mcg traps (already done by ibmf).
2101  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2102  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2103  * 4. Give up all sendonly memberships.
2104  * 5. Acquire all full memberships.
2105  * 6. In promiscuous mode, acquire all non memberships.
2106  * 7. Recycle all AHs to free list.
2107  */
2108 static void
2109 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2110 {
2111 	ibt_hca_portinfo_t *port_infop = NULL;
2112 	ibt_status_t ibt_status;
2113 	uint_t psize, port_infosz;
2114 	ibd_link_op_t opcode;
2115 	ibd_req_t *req;
2116 	link_state_t new_link_state = LINK_STATE_UP;
2117 	uint8_t itreply;
2118 	uint16_t pkix;
2119 	int ret;
2120 
2121 	/*
2122 	 * Let's not race with a plumb or an unplumb; if we detect a
2123 	 * pkey relocation event later on here, we may have to restart.
2124 	 */
2125 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2126 
2127 	mutex_enter(&state->id_link_mutex);
2128 
2129 	/*
2130 	 * If the link state is unknown, a plumb has not yet been attempted
2131 	 * on the interface. Nothing to do.
2132 	 */
2133 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2134 		mutex_exit(&state->id_link_mutex);
2135 		goto link_mod_return;
2136 	}
2137 
2138 	/*
2139 	 * If link state is down because of plumb failure, and we are not in
2140 	 * late HCA init, and we were not successfully plumbed, nothing to do.
2141 	 */
2142 	if ((state->id_link_state == LINK_STATE_DOWN) &&
2143 	    ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2144 	    ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2145 		mutex_exit(&state->id_link_mutex);
2146 		goto link_mod_return;
2147 	}
2148 
2149 	/*
2150 	 * If this routine was called in response to a port down event,
2151 	 * we just need to see if this should be informed.
2152 	 */
2153 	if (code == IBT_ERROR_PORT_DOWN) {
2154 		new_link_state = LINK_STATE_DOWN;
2155 		goto update_link_state;
2156 	}
2157 
2158 	/*
2159 	 * If it's not a port down event we've received, try to get the port
2160 	 * attributes first. If we fail here, the port is as good as down.
2161 	 * Otherwise, if the link went down by the time the handler gets
2162 	 * here, give up - we cannot even validate the pkey/gid since those
2163 	 * are not valid and this is as bad as a port down anyway.
2164 	 */
2165 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2166 	    &port_infop, &psize, &port_infosz);
2167 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2168 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2169 		new_link_state = LINK_STATE_DOWN;
2170 		goto update_link_state;
2171 	}
2172 
2173 	/*
2174 	 * If in the previous attempt, the pkey was not found either due to the
2175 	 * port state being down, or due to it's absence in the pkey table,
2176 	 * look for it now and try to start the interface.
2177 	 */
2178 	if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2179 		mutex_exit(&state->id_link_mutex);
2180 		if ((ret = ibd_start(state)) != 0) {
2181 			DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2182 			    "init, ret=%d", ret);
2183 		}
2184 		ibt_free_portinfo(port_infop, port_infosz);
2185 		goto link_mod_return;
2186 	}
2187 
2188 	/*
2189 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2190 	 * PreserveContentReply are 0, we don't know anything about the
2191 	 * data loaded into the port attributes, so we need to verify
2192 	 * if gid0 and pkey are still valid.
2193 	 */
2194 	itreply = port_infop->p_init_type_reply;
2195 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2196 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2197 		/*
2198 		 * Check to see if the subnet part of GID0 has changed. If
2199 		 * not, check the simple case first to see if the pkey
2200 		 * index is the same as before; finally check to see if the
2201 		 * pkey has been relocated to a different index in the table.
2202 		 */
2203 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2204 		if (bcmp(port_infop->p_sgid_tbl,
2205 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2206 
2207 			new_link_state = LINK_STATE_DOWN;
2208 
2209 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2210 		    state->id_pkey) {
2211 
2212 			new_link_state = LINK_STATE_UP;
2213 
2214 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2215 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2216 
2217 			ibt_free_portinfo(port_infop, port_infosz);
2218 			mutex_exit(&state->id_link_mutex);
2219 
2220 			/*
2221 			 * Currently a restart is required if our pkey has moved
2222 			 * in the pkey table. If we get the ibt_recycle_ud() to
2223 			 * work as documented (expected), we may be able to
2224 			 * avoid a complete restart.  Note that we've already
2225 			 * marked both the start and stop 'in-progress' flags,
2226 			 * so it is ok to go ahead and do this restart.
2227 			 */
2228 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2229 			if ((ret = ibd_start(state)) != 0) {
2230 				DPRINT(10, "ibd_restart: cannot restart, "
2231 				    "ret=%d", ret);
2232 			}
2233 
2234 			goto link_mod_return;
2235 		} else {
2236 			new_link_state = LINK_STATE_DOWN;
2237 		}
2238 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2239 	}
2240 
2241 update_link_state:
2242 	if (port_infop) {
2243 		ibt_free_portinfo(port_infop, port_infosz);
2244 	}
2245 
2246 	/*
2247 	 * If we're reporting a link up, check InitTypeReply to see if
2248 	 * the SM has ensured that the port's presence in mcg, traps,
2249 	 * etc. is intact.
2250 	 */
2251 	if (new_link_state == LINK_STATE_DOWN) {
2252 		opcode = IBD_LINK_DOWN;
2253 	} else {
2254 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2255 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2256 			opcode = IBD_LINK_UP;
2257 		} else {
2258 			opcode = IBD_LINK_UP_ABSENT;
2259 		}
2260 	}
2261 
2262 	/*
2263 	 * If the old state is the same as the new state, and the SM indicated
2264 	 * no change in the port parameters, nothing to do.
2265 	 */
2266 	if ((state->id_link_state == new_link_state) && (opcode !=
2267 	    IBD_LINK_UP_ABSENT)) {
2268 		mutex_exit(&state->id_link_mutex);
2269 		goto link_mod_return;
2270 	}
2271 
2272 	/*
2273 	 * Ok, so there was a link state change; see if it's safe to ask
2274 	 * the async thread to do the work
2275 	 */
2276 	if (!ibd_async_safe(state)) {
2277 		state->id_link_state = new_link_state;
2278 		mutex_exit(&state->id_link_mutex);
2279 		goto link_mod_return;
2280 	}
2281 
2282 	mutex_exit(&state->id_link_mutex);
2283 
2284 	/*
2285 	 * Queue up a request for ibd_async_link() to handle this link
2286 	 * state change event
2287 	 */
2288 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2289 	req->rq_ptr = (void *)opcode;
2290 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2291 
2292 link_mod_return:
2293 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2294 }
2295 
2296 /*
2297  * For the port up/down events, IBTL guarantees there will not be concurrent
2298  * invocations of the handler. IBTL might coalesce link transition events,
2299  * and not invoke the handler for _each_ up/down transition, but it will
2300  * invoke the handler with last known state
2301  */
2302 static void
2303 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2304     ibt_async_code_t code, ibt_async_event_t *event)
2305 {
2306 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2307 
2308 	switch (code) {
2309 	case IBT_ERROR_CATASTROPHIC_CHAN:
2310 		ibd_print_warn(state, "catastrophic channel error");
2311 		break;
2312 	case IBT_ERROR_CQ:
2313 		ibd_print_warn(state, "completion queue error");
2314 		break;
2315 	case IBT_PORT_CHANGE_EVENT:
2316 		/*
2317 		 * Events will be delivered to all instances that have
2318 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2319 		 * Only need to do work for our port; IBTF will deliver
2320 		 * events for other ports on the hca we have ibt_open_hca'ed
2321 		 * too. Note that id_port is initialized in ibd_attach()
2322 		 * before we do an ibt_open_hca() in ibd_attach().
2323 		 */
2324 		ASSERT(state->id_hca_hdl == hca_hdl);
2325 		if (state->id_port != event->ev_port)
2326 			break;
2327 
2328 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2329 		    IBT_PORT_CHANGE_PKEY) {
2330 			ibd_link_mod(state, code);
2331 		}
2332 		break;
2333 	case IBT_ERROR_PORT_DOWN:
2334 	case IBT_CLNT_REREG_EVENT:
2335 	case IBT_EVENT_PORT_UP:
2336 		/*
2337 		 * Events will be delivered to all instances that have
2338 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2339 		 * Only need to do work for our port; IBTF will deliver
2340 		 * events for other ports on the hca we have ibt_open_hca'ed
2341 		 * too. Note that id_port is initialized in ibd_attach()
2342 		 * before we do an ibt_open_hca() in ibd_attach().
2343 		 */
2344 		ASSERT(state->id_hca_hdl == hca_hdl);
2345 		if (state->id_port != event->ev_port)
2346 			break;
2347 
2348 		ibd_link_mod(state, code);
2349 		break;
2350 
2351 	case IBT_HCA_ATTACH_EVENT:
2352 	case IBT_HCA_DETACH_EVENT:
2353 		/*
2354 		 * When a new card is plugged to the system, attach_event is
2355 		 * invoked. Additionally, a cfgadm needs to be run to make the
2356 		 * card known to the system, and an ifconfig needs to be run to
2357 		 * plumb up any ibd interfaces on the card. In the case of card
2358 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2359 		 * unplumb the ibd interfaces on the card; when the card is
2360 		 * actually unplugged, the detach_event is invoked;
2361 		 * additionally, if any ibd instances are still active on the
2362 		 * card (eg there were no associated RCM scripts), driver's
2363 		 * detach routine is invoked.
2364 		 */
2365 		break;
2366 	default:
2367 		break;
2368 	}
2369 }
2370 
2371 static int
2372 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2373 {
2374 	mac_register_t *macp;
2375 	int ret;
2376 
2377 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2378 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2379 		return (DDI_FAILURE);
2380 	}
2381 
2382 	/*
2383 	 * Note that when we register with mac during attach, we don't
2384 	 * have the id_macaddr yet, so we'll simply be registering a
2385 	 * zero macaddr that we'll overwrite later during plumb (in
2386 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2387 	 * update the mac layer with the correct mtu during plumb.
2388 	 */
2389 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2390 	macp->m_driver = state;
2391 	macp->m_dip = dip;
2392 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2393 	macp->m_callbacks = &ibd_m_callbacks;
2394 	macp->m_min_sdu = 0;
2395 	if (state->id_type == IBD_PORT_DRIVER) {
2396 		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2397 	} else if (state->id_enable_rc) {
2398 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2399 	} else {
2400 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2401 	}
2402 	macp->m_priv_props = ibd_priv_props;
2403 
2404 	/*
2405 	 *  Register ourselves with the GLDv3 interface
2406 	 */
2407 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2408 		mac_free(macp);
2409 		DPRINT(10,
2410 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2411 		return (DDI_FAILURE);
2412 	}
2413 
2414 	mac_free(macp);
2415 	return (DDI_SUCCESS);
2416 }
2417 
2418 static int
2419 ibd_record_capab(ibd_state_t *state)
2420 {
2421 	ibt_hca_attr_t hca_attrs;
2422 	ibt_status_t ibt_status;
2423 
2424 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2425 
2426 	/*
2427 	 * Query the HCA and fetch its attributes
2428 	 */
2429 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2430 	ASSERT(ibt_status == IBT_SUCCESS);
2431 
2432 	/*
2433 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2434 	 *    full checksum offload.
2435 	 */
2436 	if (state->id_enable_rc) {
2437 			state->id_hwcksum_capab = 0;
2438 	} else {
2439 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2440 		    == IBT_HCA_CKSUM_FULL) {
2441 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2442 		}
2443 	}
2444 
2445 	/*
2446 	 * 2. Set LSO policy, capability and maximum length
2447 	 */
2448 	if (state->id_enable_rc) {
2449 		state->id_lso_capable = B_FALSE;
2450 		state->id_lso_maxlen = 0;
2451 	} else {
2452 		if (hca_attrs.hca_max_lso_size > 0) {
2453 			state->id_lso_capable = B_TRUE;
2454 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2455 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2456 			else
2457 				state->id_lso_maxlen =
2458 				    hca_attrs.hca_max_lso_size;
2459 		} else {
2460 			state->id_lso_capable = B_FALSE;
2461 			state->id_lso_maxlen = 0;
2462 		}
2463 	}
2464 
2465 	/*
2466 	 * 3. Set Reserved L_Key capability
2467 	 */
2468 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2469 		state->id_hca_res_lkey_capab = 1;
2470 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2471 		state->rc_enable_iov_map = B_TRUE;
2472 	} else {
2473 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2474 		state->rc_enable_iov_map = B_FALSE;
2475 	}
2476 
2477 	/*
2478 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2479 	 *    size information is provided by the hca
2480 	 */
2481 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2482 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2483 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2484 	} else {
2485 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2486 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2487 	}
2488 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2489 		state->id_max_sqseg = IBD_MAX_SQSEG;
2490 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2491 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2492 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2493 	}
2494 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2495 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2496 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2497 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2498 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2499 	}
2500 
2501 	/*
2502 	 * Translating the virtual address regions into physical regions
2503 	 * for using the Reserved LKey feature results in a wr sgl that
2504 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2505 	 * we'll fix a high-water mark (65%) for when we should stop.
2506 	 */
2507 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2508 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2509 
2510 	/*
2511 	 * 5. Set number of recv and send wqes after checking hca maximum
2512 	 *    channel size. Store the max channel size in the state so that it
2513 	 *    can be referred to when the swqe/rwqe change is requested via
2514 	 *    dladm.
2515 	 */
2516 
2517 	state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2518 
2519 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2520 		state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2521 
2522 	state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2523 	    IBD_RWQE_MIN;
2524 
2525 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2526 		state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2527 
2528 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2529 
2530 	return (DDI_SUCCESS);
2531 }
2532 
2533 static int
2534 ibd_part_busy(ibd_state_t *state)
2535 {
2536 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2537 		DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2538 		return (DDI_FAILURE);
2539 	}
2540 
2541 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2542 		DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2543 		return (DDI_FAILURE);
2544 	}
2545 
2546 	/*
2547 	 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2548 	 * connecting to a remote IPoIB port. We can't remove this port.
2549 	 */
2550 	if (state->id_ah_op == IBD_OP_ONGOING) {
2551 		DPRINT(10, "ibd_part_busy: failed: connecting\n");
2552 		return (DDI_FAILURE);
2553 	}
2554 
2555 	return (DDI_SUCCESS);
2556 }
2557 
2558 
2559 static void
2560 ibd_part_unattach(ibd_state_t *state)
2561 {
2562 	uint32_t progress = state->id_mac_state;
2563 	ibt_status_t ret;
2564 
2565 	/* make sure rx resources are freed */
2566 	ibd_free_rx_rsrcs(state);
2567 
2568 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2569 		ASSERT(state->id_enable_rc);
2570 		ibd_rc_fini_srq_list(state);
2571 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2572 	}
2573 
2574 	if (progress & IBD_DRV_MAC_REGISTERED) {
2575 		(void) mac_unregister(state->id_mh);
2576 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2577 	}
2578 
2579 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2580 		/*
2581 		 * No new async requests will be posted since the device
2582 		 * link state has been marked as unknown; completion handlers
2583 		 * have been turned off, so Tx handler will not cause any
2584 		 * more IBD_ASYNC_REAP requests.
2585 		 *
2586 		 * Queue a request for the async thread to exit, which will
2587 		 * be serviced after any pending ones. This can take a while,
2588 		 * specially if the SM is unreachable, since IBMF will slowly
2589 		 * timeout each SM request issued by the async thread.  Reap
2590 		 * the thread before continuing on, we do not want it to be
2591 		 * lingering in modunloaded code.
2592 		 */
2593 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2594 		thread_join(state->id_async_thrid);
2595 
2596 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2597 	}
2598 
2599 	if (progress & IBD_DRV_REQ_LIST_INITED) {
2600 		list_destroy(&state->id_req_list);
2601 		mutex_destroy(&state->id_acache_req_lock);
2602 		cv_destroy(&state->id_acache_req_cv);
2603 		state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2604 	}
2605 
2606 	if (progress & IBD_DRV_PD_ALLOCD) {
2607 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2608 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2609 			ibd_print_warn(state, "failed to free "
2610 			    "protection domain, ret=%d", ret);
2611 		}
2612 		state->id_pd_hdl = NULL;
2613 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2614 	}
2615 
2616 	if (progress & IBD_DRV_HCA_OPENED) {
2617 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2618 		    IBT_SUCCESS) {
2619 			ibd_print_warn(state, "failed to close "
2620 			    "HCA device, ret=%d", ret);
2621 		}
2622 		state->id_hca_hdl = NULL;
2623 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2624 	}
2625 
2626 	mutex_enter(&ibd_gstate.ig_mutex);
2627 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2628 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2629 		    IBT_SUCCESS) {
2630 			ibd_print_warn(state,
2631 			    "ibt_detach() failed, ret=%d", ret);
2632 		}
2633 		state->id_ibt_hdl = NULL;
2634 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2635 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2636 	}
2637 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2638 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2639 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2640 		    IBT_SUCCESS) {
2641 			ibd_print_warn(state, "ibt_detach(): global "
2642 			    "failed, ret=%d", ret);
2643 		}
2644 		ibd_gstate.ig_ibt_hdl = NULL;
2645 	}
2646 	mutex_exit(&ibd_gstate.ig_mutex);
2647 
2648 	if (progress & IBD_DRV_TXINTR_ADDED) {
2649 		ddi_remove_softintr(state->id_tx);
2650 		state->id_tx = NULL;
2651 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2652 	}
2653 
2654 	if (progress & IBD_DRV_RXINTR_ADDED) {
2655 		ddi_remove_softintr(state->id_rx);
2656 		state->id_rx = NULL;
2657 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2658 	}
2659 
2660 #ifdef DEBUG
2661 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2662 		kstat_delete(state->rc_ksp);
2663 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2664 	}
2665 #endif
2666 
2667 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2668 		ibd_state_fini(state);
2669 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2670 	}
2671 }
2672 
2673 int
2674 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2675 {
2676 	ibt_status_t ret;
2677 	int rv;
2678 	kthread_t *kht;
2679 
2680 	/*
2681 	 * Initialize mutexes and condition variables
2682 	 */
2683 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2684 		DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2685 		return (DDI_FAILURE);
2686 	}
2687 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2688 
2689 	/*
2690 	 * Allocate rx,tx softintr
2691 	 */
2692 	if (ibd_rx_softintr == 1) {
2693 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2694 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2695 			DPRINT(10, "ibd_part_attach: failed in "
2696 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2697 			return (DDI_FAILURE);
2698 		}
2699 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2700 	}
2701 	if (ibd_tx_softintr == 1) {
2702 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2703 		    NULL, NULL, ibd_tx_recycle,
2704 		    (caddr_t)state)) != DDI_SUCCESS) {
2705 			DPRINT(10, "ibd_part_attach: failed in "
2706 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2707 			return (DDI_FAILURE);
2708 		}
2709 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2710 	}
2711 
2712 	/*
2713 	 * Attach to IBTL
2714 	 */
2715 	mutex_enter(&ibd_gstate.ig_mutex);
2716 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2717 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2718 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2719 			DPRINT(10, "ibd_part_attach: global: failed in "
2720 			    "ibt_attach(), ret=%d", ret);
2721 			mutex_exit(&ibd_gstate.ig_mutex);
2722 			return (DDI_FAILURE);
2723 		}
2724 	}
2725 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2726 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2727 		DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2728 		    ret);
2729 		mutex_exit(&ibd_gstate.ig_mutex);
2730 		return (DDI_FAILURE);
2731 	}
2732 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2733 	mutex_exit(&ibd_gstate.ig_mutex);
2734 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2735 
2736 	/*
2737 	 * Open the HCA
2738 	 */
2739 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2740 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2741 		DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2742 		    ret);
2743 		return (DDI_FAILURE);
2744 	}
2745 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2746 
2747 #ifdef DEBUG
2748 	/* Initialize Driver Counters for Reliable Connected Mode */
2749 	if (state->id_enable_rc) {
2750 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2751 			DPRINT(10, "ibd_part_attach: failed in "
2752 			    "ibd_rc_init_stats");
2753 			return (DDI_FAILURE);
2754 		}
2755 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2756 	}
2757 #endif
2758 
2759 	/*
2760 	 * Record capabilities
2761 	 */
2762 	(void) ibd_record_capab(state);
2763 
2764 	/*
2765 	 * Allocate a protection domain on the HCA
2766 	 */
2767 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2768 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2769 		DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2770 		    ret);
2771 		return (DDI_FAILURE);
2772 	}
2773 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2774 
2775 
2776 	/*
2777 	 * We need to initialise the req_list that is required for the
2778 	 * operation of the async_thread.
2779 	 */
2780 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2781 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2782 	list_create(&state->id_req_list, sizeof (ibd_req_t),
2783 	    offsetof(ibd_req_t, rq_list));
2784 	state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2785 
2786 	/*
2787 	 * Create the async thread; thread_create never fails.
2788 	 */
2789 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2790 	    TS_RUN, minclsyspri);
2791 	state->id_async_thrid = kht->t_did;
2792 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2793 
2794 	return (DDI_SUCCESS);
2795 }
2796 
2797 /*
2798  * Attach device to the IO framework.
2799  */
2800 static int
2801 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2802 {
2803 	int ret;
2804 
2805 	switch (cmd) {
2806 		case DDI_ATTACH:
2807 			ret = ibd_port_attach(dip);
2808 			break;
2809 		default:
2810 			ret = DDI_FAILURE;
2811 			break;
2812 	}
2813 	return (ret);
2814 }
2815 
2816 /*
2817  * Detach device from the IO framework.
2818  */
2819 static int
2820 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2821 {
2822 	ibd_state_t *state;
2823 	int instance;
2824 
2825 	/*
2826 	 * IBD doesn't support suspend/resume
2827 	 */
2828 	if (cmd != DDI_DETACH)
2829 		return (DDI_FAILURE);
2830 
2831 	/*
2832 	 * Get the instance softstate
2833 	 */
2834 	instance = ddi_get_instance(dip);
2835 	state = ddi_get_soft_state(ibd_list, instance);
2836 
2837 	/*
2838 	 * Release all resources we're holding still.  Note that if we'd
2839 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2840 	 * so far, we should find all the flags we need in id_mac_state.
2841 	 */
2842 	return (ibd_port_unattach(state, dip));
2843 }
2844 
2845 /*
2846  * Pre ibt_attach() driver initialization
2847  */
2848 static int
2849 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2850 {
2851 	char buf[64];
2852 
2853 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2854 	state->id_link_state = LINK_STATE_UNKNOWN;
2855 
2856 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2857 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2858 	state->id_trap_stop = B_TRUE;
2859 	state->id_trap_inprog = 0;
2860 
2861 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2862 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2863 	state->id_dip = dip;
2864 
2865 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2866 
2867 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2868 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2869 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2870 	state->id_tx_busy = 0;
2871 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2872 
2873 	state->id_rx_list.dl_bufs_outstanding = 0;
2874 	state->id_rx_list.dl_cnt = 0;
2875 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2876 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2877 	(void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2878 	    state->id_pkey, state->id_plinkid);
2879 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2880 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2881 
2882 	/* For Reliable Connected Mode */
2883 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2884 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2885 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2886 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2887 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2888 	    MUTEX_DRIVER, NULL);
2889 	mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2890 
2891 	/*
2892 	 * Make the default link mode as RC. If this fails during connection
2893 	 * setup, the link mode is automatically transitioned to UD.
2894 	 * Also set the RC MTU.
2895 	 */
2896 	state->id_enable_rc = IBD_DEF_LINK_MODE;
2897 	state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2898 	state->id_mtu = IBD_DEF_MAX_MTU;
2899 
2900 	/* Iniatialize all tunables to default */
2901 	state->id_lso_policy = IBD_DEF_LSO_POLICY;
2902 	state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2903 	state->id_num_ah = IBD_DEF_NUM_AH;
2904 	state->id_hash_size = IBD_DEF_HASH_SIZE;
2905 	state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2906 	state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2907 	state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2908 	state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2909 	state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2910 	state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2911 	state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2912 	state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2913 	state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2914 	state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2915 	state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2916 	state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2917 	state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2918 	state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2919 	state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2920 	state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2921 	state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2922 	state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2923 	state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2924 	state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2925 
2926 	return (DDI_SUCCESS);
2927 }
2928 
2929 /*
2930  * Post ibt_detach() driver deconstruction
2931  */
2932 static void
2933 ibd_state_fini(ibd_state_t *state)
2934 {
2935 	kmem_cache_destroy(state->id_req_kmc);
2936 
2937 	mutex_destroy(&state->id_rx_list.dl_mutex);
2938 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2939 
2940 	mutex_destroy(&state->id_txpost_lock);
2941 	mutex_destroy(&state->id_tx_list.dl_mutex);
2942 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2943 	mutex_destroy(&state->id_lso_lock);
2944 
2945 	mutex_destroy(&state->id_sched_lock);
2946 	mutex_destroy(&state->id_scq_poll_lock);
2947 	mutex_destroy(&state->id_rcq_poll_lock);
2948 
2949 	cv_destroy(&state->id_trap_cv);
2950 	mutex_destroy(&state->id_trap_lock);
2951 	mutex_destroy(&state->id_link_mutex);
2952 
2953 	/* For Reliable Connected Mode */
2954 	mutex_destroy(&state->rc_timeout_lock);
2955 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2956 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2957 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2958 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2959 	mutex_destroy(&state->rc_rx_lock);
2960 }
2961 
2962 /*
2963  * Fetch link speed from SA for snmp ifspeed reporting.
2964  */
2965 static uint64_t
2966 ibd_get_portspeed(ibd_state_t *state)
2967 {
2968 	int			ret;
2969 	ibt_path_info_t		path;
2970 	ibt_path_attr_t		path_attr;
2971 	uint8_t			num_paths;
2972 	uint64_t		ifspeed;
2973 
2974 	/*
2975 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2976 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2977 	 * 2000000000. Start with that as default.
2978 	 */
2979 	ifspeed = 2000000000;
2980 
2981 	bzero(&path_attr, sizeof (path_attr));
2982 
2983 	/*
2984 	 * Get the port speed from Loopback path information.
2985 	 */
2986 	path_attr.pa_dgids = &state->id_sgid;
2987 	path_attr.pa_num_dgids = 1;
2988 	path_attr.pa_sgid = state->id_sgid;
2989 
2990 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2991 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2992 		goto earlydone;
2993 
2994 	if (num_paths < 1)
2995 		goto earlydone;
2996 
2997 	/*
2998 	 * In case SA does not return an expected value, report the default
2999 	 * speed as 1X.
3000 	 */
3001 	ret = 1;
3002 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3003 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
3004 			ret = 1;
3005 			break;
3006 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
3007 			ret = 4;
3008 			break;
3009 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
3010 			ret = 12;
3011 			break;
3012 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
3013 			ret = 2;
3014 			break;
3015 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
3016 			ret = 8;
3017 			break;
3018 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
3019 			ret = 16;
3020 			break;
3021 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
3022 			ret = 24;
3023 			break;
3024 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
3025 			ret = 32;
3026 			break;
3027 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
3028 			ret = 48;
3029 			break;
3030 	}
3031 
3032 	ifspeed *= ret;
3033 
3034 earlydone:
3035 	return (ifspeed);
3036 }
3037 
3038 /*
3039  * Search input mcg list (id_mc_full or id_mc_non) for an entry
3040  * representing the input mcg mgid.
3041  */
3042 static ibd_mce_t *
3043 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3044 {
3045 	ibd_mce_t *ptr = list_head(mlist);
3046 
3047 	/*
3048 	 * Do plain linear search.
3049 	 */
3050 	while (ptr != NULL) {
3051 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3052 		    sizeof (ib_gid_t)) == 0)
3053 			return (ptr);
3054 		ptr = list_next(mlist, ptr);
3055 	}
3056 	return (NULL);
3057 }
3058 
3059 /*
3060  * Execute IBA JOIN.
3061  */
3062 static ibt_status_t
3063 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3064 {
3065 	ibt_mcg_attr_t mcg_attr;
3066 
3067 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3068 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3069 	mcg_attr.mc_mgid = mgid;
3070 	mcg_attr.mc_join_state = mce->mc_jstate;
3071 	mcg_attr.mc_scope = state->id_scope;
3072 	mcg_attr.mc_pkey = state->id_pkey;
3073 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3074 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3075 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3076 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3077 	    NULL, NULL));
3078 }
3079 
3080 /*
3081  * This code JOINs the port in the proper way (depending on the join
3082  * state) so that IBA fabric will forward mcg packets to/from the port.
3083  * It also attaches the QPN to the mcg so it can receive those mcg
3084  * packets. This code makes sure not to attach the mcg to the QP if
3085  * that has been previously done due to the mcg being joined with a
3086  * different join state, even though this is not required by SWG_0216,
3087  * refid 3610.
3088  */
3089 static ibd_mce_t *
3090 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3091 {
3092 	ibt_status_t ibt_status;
3093 	ibd_mce_t *mce, *tmce, *omce = NULL;
3094 	boolean_t do_attach = B_TRUE;
3095 
3096 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3097 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3098 
3099 	/*
3100 	 * For enable_multicast Full member joins, we need to do some
3101 	 * extra work. If there is already an mce on the list that
3102 	 * indicates full membership, that means the membership has
3103 	 * not yet been dropped (since the disable_multicast was issued)
3104 	 * because there are pending Tx's to the mcg; in that case, just
3105 	 * mark the mce not to be reaped when the Tx completion queues
3106 	 * an async reap operation.
3107 	 *
3108 	 * If there is already an mce on the list indicating sendonly
3109 	 * membership, try to promote to full membership. Be careful
3110 	 * not to deallocate the old mce, since there might be an AH
3111 	 * pointing to it; instead, update the old mce with new data
3112 	 * that tracks the full membership.
3113 	 */
3114 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3115 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3116 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3117 			ASSERT(omce->mc_fullreap);
3118 			omce->mc_fullreap = B_FALSE;
3119 			return (omce);
3120 		} else {
3121 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3122 		}
3123 	}
3124 
3125 	/*
3126 	 * Allocate the ibd_mce_t to track this JOIN.
3127 	 */
3128 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3129 	mce->mc_fullreap = B_FALSE;
3130 	mce->mc_jstate = jstate;
3131 
3132 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3133 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3134 		    ibt_status);
3135 		kmem_free(mce, sizeof (ibd_mce_t));
3136 		return (NULL);
3137 	}
3138 
3139 	/*
3140 	 * Is an IBA attach required? Not if the interface is already joined
3141 	 * to the mcg in a different appropriate join state.
3142 	 */
3143 	if (jstate == IB_MC_JSTATE_NON) {
3144 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3145 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3146 			do_attach = B_FALSE;
3147 	} else if (jstate == IB_MC_JSTATE_FULL) {
3148 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3149 			do_attach = B_FALSE;
3150 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3151 		do_attach = B_FALSE;
3152 	}
3153 
3154 	if (do_attach) {
3155 		/*
3156 		 * Do the IBA attach.
3157 		 */
3158 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3159 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3160 		    &mce->mc_info)) != IBT_SUCCESS) {
3161 			DPRINT(10, "ibd_join_group : failed qp attachment "
3162 			    "%d\n", ibt_status);
3163 			/*
3164 			 * NOTE that we should probably preserve the join info
3165 			 * in the list and later try to leave again at detach
3166 			 * time.
3167 			 */
3168 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3169 			    state->id_sgid, jstate);
3170 			kmem_free(mce, sizeof (ibd_mce_t));
3171 			return (NULL);
3172 		}
3173 	}
3174 
3175 	/*
3176 	 * Insert the ibd_mce_t in the proper list.
3177 	 */
3178 	if (jstate == IB_MC_JSTATE_NON) {
3179 		IBD_MCACHE_INSERT_NON(state, mce);
3180 	} else {
3181 		/*
3182 		 * Set up the mc_req fields used for reaping the
3183 		 * mcg in case of delayed tx completion (see
3184 		 * ibd_tx_cleanup()). Also done for sendonly join in
3185 		 * case we are promoted to fullmembership later and
3186 		 * keep using the same mce.
3187 		 */
3188 		mce->mc_req.rq_gid = mgid;
3189 		mce->mc_req.rq_ptr = mce;
3190 		/*
3191 		 * Check whether this is the case of trying to join
3192 		 * full member, and we were already joined send only.
3193 		 * We try to drop our SendOnly membership, but it is
3194 		 * possible that the mcg does not exist anymore (and
3195 		 * the subnet trap never reached us), so the leave
3196 		 * operation might fail.
3197 		 */
3198 		if (omce != NULL) {
3199 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3200 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3201 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3202 			bcopy(&mce->mc_info, &omce->mc_info,
3203 			    sizeof (ibt_mcg_info_t));
3204 			kmem_free(mce, sizeof (ibd_mce_t));
3205 			return (omce);
3206 		}
3207 		mutex_enter(&state->id_mc_mutex);
3208 		IBD_MCACHE_INSERT_FULL(state, mce);
3209 		mutex_exit(&state->id_mc_mutex);
3210 	}
3211 
3212 	return (mce);
3213 }
3214 
3215 /*
3216  * Called during port up event handling to attempt to reacquire full
3217  * membership to an mcg. Stripped down version of ibd_join_group().
3218  * Note that it is possible that the mcg might have gone away, and
3219  * gets recreated at this point.
3220  */
3221 static void
3222 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3223 {
3224 	ib_gid_t mgid;
3225 
3226 	/*
3227 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3228 	 * reap/leave is going to try to leave the group. We could prevent
3229 	 * that by adding a boolean flag into ibd_mce_t, if required.
3230 	 */
3231 	if (mce->mc_fullreap)
3232 		return;
3233 
3234 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3235 
3236 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3237 	    mgid.gid_guid);
3238 
3239 	/* While reacquiring, leave and then join the MCG */
3240 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3241 	    mce->mc_jstate);
3242 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3243 		ibd_print_warn(state, "Failure on port up to rejoin "
3244 		    "multicast gid %016llx:%016llx",
3245 		    (u_longlong_t)mgid.gid_prefix,
3246 		    (u_longlong_t)mgid.gid_guid);
3247 }
3248 
3249 /*
3250  * This code handles delayed Tx completion cleanups for mcg's to which
3251  * disable_multicast has been issued, regular mcg related cleanups during
3252  * disable_multicast, disable_promiscuous and mcg traps, as well as
3253  * cleanups during driver detach time. Depending on the join state,
3254  * it deletes the mce from the appropriate list and issues the IBA
3255  * leave/detach; except in the disable_multicast case when the mce
3256  * is left on the active list for a subsequent Tx completion cleanup.
3257  */
3258 static void
3259 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3260     uint8_t jstate)
3261 {
3262 	ibd_mce_t *tmce;
3263 	boolean_t do_detach = B_TRUE;
3264 
3265 	/*
3266 	 * Before detaching, we must check whether the other list
3267 	 * contains the mcg; if we detach blindly, the consumer
3268 	 * who set up the other list will also stop receiving
3269 	 * traffic.
3270 	 */
3271 	if (jstate == IB_MC_JSTATE_FULL) {
3272 		/*
3273 		 * The following check is only relevant while coming
3274 		 * from the Tx completion path in the reap case.
3275 		 */
3276 		if (!mce->mc_fullreap)
3277 			return;
3278 		mutex_enter(&state->id_mc_mutex);
3279 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3280 		mutex_exit(&state->id_mc_mutex);
3281 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3282 			do_detach = B_FALSE;
3283 	} else if (jstate == IB_MC_JSTATE_NON) {
3284 		IBD_MCACHE_PULLOUT_NON(state, mce);
3285 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3286 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3287 			do_detach = B_FALSE;
3288 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3289 		mutex_enter(&state->id_mc_mutex);
3290 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3291 		mutex_exit(&state->id_mc_mutex);
3292 		do_detach = B_FALSE;
3293 	}
3294 
3295 	/*
3296 	 * If we are reacting to a mcg trap and leaving our sendonly or
3297 	 * non membership, the mcg is possibly already gone, so attempting
3298 	 * to leave might fail. On the other hand, we must try to leave
3299 	 * anyway, since this might be a trap from long ago, and we could
3300 	 * have potentially sendonly joined to a recent incarnation of
3301 	 * the mcg and are about to loose track of this information.
3302 	 */
3303 	if (do_detach) {
3304 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3305 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3306 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3307 	}
3308 
3309 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3310 	kmem_free(mce, sizeof (ibd_mce_t));
3311 }
3312 
3313 /*
3314  * Async code executed due to multicast and promiscuous disable requests
3315  * and mcg trap handling; also executed during driver detach. Mostly, a
3316  * leave and detach is done; except for the fullmember case when Tx
3317  * requests are pending, whence arrangements are made for subsequent
3318  * cleanup on Tx completion.
3319  */
3320 static void
3321 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3322 {
3323 	ipoib_mac_t mcmac;
3324 	boolean_t recycled;
3325 	ibd_mce_t *mce;
3326 
3327 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3328 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3329 
3330 	if (jstate == IB_MC_JSTATE_NON) {
3331 		recycled = B_TRUE;
3332 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3333 		/*
3334 		 * In case we are handling a mcg trap, we might not find
3335 		 * the mcg in the non list.
3336 		 */
3337 		if (mce == NULL) {
3338 			return;
3339 		}
3340 	} else {
3341 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3342 
3343 		/*
3344 		 * In case we are handling a mcg trap, make sure the trap
3345 		 * is not arriving late; if we have an mce that indicates
3346 		 * that we are already a fullmember, that would be a clear
3347 		 * indication that the trap arrived late (ie, is for a
3348 		 * previous incarnation of the mcg).
3349 		 */
3350 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3351 			if ((mce == NULL) || (mce->mc_jstate ==
3352 			    IB_MC_JSTATE_FULL)) {
3353 				return;
3354 			}
3355 		} else {
3356 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3357 
3358 			/*
3359 			 * If join group failed, mce will be NULL here.
3360 			 * This is because in GLDv3 driver, set multicast
3361 			 *  will always return success.
3362 			 */
3363 			if (mce == NULL) {
3364 				return;
3365 			}
3366 
3367 			mce->mc_fullreap = B_TRUE;
3368 		}
3369 
3370 		/*
3371 		 * If no pending Tx's remain that reference the AH
3372 		 * for the mcg, recycle it from active to free list.
3373 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3374 		 * so the last completing Tx will cause an async reap
3375 		 * operation to be invoked, at which time we will drop our
3376 		 * membership to the mcg so that the pending Tx's complete
3377 		 * successfully. Refer to comments on "AH and MCE active
3378 		 * list manipulation" at top of this file. The lock protects
3379 		 * against Tx fast path and Tx cleanup code.
3380 		 */
3381 		mutex_enter(&state->id_ac_mutex);
3382 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3383 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3384 		    IB_MC_JSTATE_SEND_ONLY_NON));
3385 		mutex_exit(&state->id_ac_mutex);
3386 	}
3387 
3388 	if (recycled) {
3389 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3390 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3391 		ibd_async_reap_group(state, mce, mgid, jstate);
3392 	}
3393 }
3394 
3395 /*
3396  * Find the broadcast address as defined by IPoIB; implicitly
3397  * determines the IBA scope, mtu, tclass etc of the link the
3398  * interface is going to be a member of.
3399  */
3400 static ibt_status_t
3401 ibd_find_bgroup(ibd_state_t *state)
3402 {
3403 	ibt_mcg_attr_t mcg_attr;
3404 	uint_t numg;
3405 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3406 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3407 	    IB_MC_SCOPE_GLOBAL };
3408 	int i, mcgmtu;
3409 	boolean_t found = B_FALSE;
3410 	int ret;
3411 	ibt_mcg_info_t mcg_info;
3412 
3413 	state->id_bgroup_created = B_FALSE;
3414 	state->id_bgroup_present = B_FALSE;
3415 
3416 query_bcast_grp:
3417 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3418 	mcg_attr.mc_pkey = state->id_pkey;
3419 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3420 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3421 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3422 
3423 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3424 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3425 
3426 		/*
3427 		 * Look for the IPoIB broadcast group.
3428 		 */
3429 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3430 		state->id_mgid.gid_prefix =
3431 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3432 		    ((uint64_t)state->id_scope << 48) |
3433 		    ((uint32_t)(state->id_pkey << 16)));
3434 		mcg_attr.mc_mgid = state->id_mgid;
3435 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3436 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3437 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3438 			found = B_TRUE;
3439 			break;
3440 		}
3441 	}
3442 
3443 	if (!found) {
3444 		if (state->id_create_broadcast_group) {
3445 			/*
3446 			 * If we created the broadcast group, but failed to
3447 			 * find it, we can't do anything except leave the
3448 			 * one we created and return failure.
3449 			 */
3450 			if (state->id_bgroup_created) {
3451 				ibd_print_warn(state, "IPoIB broadcast group "
3452 				    "absent. Unable to query after create.");
3453 				goto find_bgroup_fail;
3454 			}
3455 
3456 			/*
3457 			 * Create the ipoib broadcast group if it didn't exist
3458 			 */
3459 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3460 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3461 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3462 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3463 			mcg_attr.mc_pkey = state->id_pkey;
3464 			mcg_attr.mc_flow = 0;
3465 			mcg_attr.mc_sl = 0;
3466 			mcg_attr.mc_tclass = 0;
3467 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3468 			state->id_mgid.gid_prefix =
3469 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3470 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3471 			    ((uint32_t)(state->id_pkey << 16)));
3472 			mcg_attr.mc_mgid = state->id_mgid;
3473 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3474 
3475 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3476 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3477 				ibd_print_warn(state, "IPoIB broadcast group "
3478 				    "absent, create failed: ret = %d\n", ret);
3479 				state->id_bgroup_created = B_FALSE;
3480 				return (IBT_FAILURE);
3481 			}
3482 			state->id_bgroup_created = B_TRUE;
3483 			goto query_bcast_grp;
3484 		} else {
3485 			ibd_print_warn(state, "IPoIB broadcast group absent");
3486 			return (IBT_FAILURE);
3487 		}
3488 	}
3489 
3490 	/*
3491 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3492 	 */
3493 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3494 	if (state->id_mtu < mcgmtu) {
3495 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3496 		    "greater than port's maximum MTU %d", mcgmtu,
3497 		    state->id_mtu);
3498 		ibt_free_mcg_info(state->id_mcinfo, 1);
3499 		goto find_bgroup_fail;
3500 	}
3501 	state->id_mtu = mcgmtu;
3502 	state->id_bgroup_present = B_TRUE;
3503 
3504 	return (IBT_SUCCESS);
3505 
3506 find_bgroup_fail:
3507 	if (state->id_bgroup_created) {
3508 		(void) ibt_leave_mcg(state->id_sgid,
3509 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3510 		    IB_MC_JSTATE_FULL);
3511 	}
3512 
3513 	return (IBT_FAILURE);
3514 }
3515 
3516 static int
3517 ibd_alloc_tx_copybufs(ibd_state_t *state)
3518 {
3519 	ibt_mr_attr_t mem_attr;
3520 
3521 	/*
3522 	 * Allocate one big chunk for all regular tx copy bufs
3523 	 */
3524 	state->id_tx_buf_sz = state->id_mtu;
3525 	if (state->id_lso_policy && state->id_lso_capable &&
3526 	    (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3527 		state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3528 	}
3529 
3530 	state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3531 	    state->id_tx_buf_sz, KM_SLEEP);
3532 
3533 	state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3534 	    sizeof (ibd_swqe_t), KM_SLEEP);
3535 
3536 	/*
3537 	 * Do one memory registration on the entire txbuf area
3538 	 */
3539 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3540 	mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3541 	mem_attr.mr_as = NULL;
3542 	mem_attr.mr_flags = IBT_MR_SLEEP;
3543 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3544 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3545 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3546 		kmem_free(state->id_tx_wqes,
3547 		    state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3548 		kmem_free(state->id_tx_bufs,
3549 		    state->id_ud_num_swqe * state->id_tx_buf_sz);
3550 		state->id_tx_bufs = NULL;
3551 		return (DDI_FAILURE);
3552 	}
3553 
3554 	return (DDI_SUCCESS);
3555 }
3556 
3557 static int
3558 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3559 {
3560 	ibt_mr_attr_t mem_attr;
3561 	ibd_lsobuf_t *buflist;
3562 	ibd_lsobuf_t *lbufp;
3563 	ibd_lsobuf_t *tail;
3564 	ibd_lsobkt_t *bktp;
3565 	uint8_t *membase;
3566 	uint8_t *memp;
3567 	uint_t memsz;
3568 	int i;
3569 
3570 	/*
3571 	 * Allocate the lso bucket
3572 	 */
3573 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3574 
3575 	/*
3576 	 * Allocate the entire lso memory and register it
3577 	 */
3578 	memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3579 	membase = kmem_zalloc(memsz, KM_SLEEP);
3580 
3581 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3582 	mem_attr.mr_len = memsz;
3583 	mem_attr.mr_as = NULL;
3584 	mem_attr.mr_flags = IBT_MR_SLEEP;
3585 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3586 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3587 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3588 		kmem_free(membase, memsz);
3589 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3590 		return (DDI_FAILURE);
3591 	}
3592 
3593 	mutex_enter(&state->id_lso_lock);
3594 
3595 	/*
3596 	 * Now allocate the buflist.  Note that the elements in the buflist and
3597 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3598 	 * can always derive the address of a buflist entry from the address of
3599 	 * an lso buffer.
3600 	 */
3601 	buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3602 	    KM_SLEEP);
3603 
3604 	/*
3605 	 * Set up the lso buf chain
3606 	 */
3607 	memp = membase;
3608 	lbufp = buflist;
3609 	for (i = 0; i < state->id_num_lso_bufs; i++) {
3610 		lbufp->lb_isfree = 1;
3611 		lbufp->lb_buf = memp;
3612 		lbufp->lb_next = lbufp + 1;
3613 
3614 		tail = lbufp;
3615 
3616 		memp += IBD_LSO_BUFSZ;
3617 		lbufp++;
3618 	}
3619 	tail->lb_next = NULL;
3620 
3621 	/*
3622 	 * Set up the LSO buffer information in ibd state
3623 	 */
3624 	bktp->bkt_bufl = buflist;
3625 	bktp->bkt_free_head = buflist;
3626 	bktp->bkt_mem = membase;
3627 	bktp->bkt_nelem = state->id_num_lso_bufs;
3628 	bktp->bkt_nfree = bktp->bkt_nelem;
3629 
3630 	state->id_lso = bktp;
3631 	mutex_exit(&state->id_lso_lock);
3632 
3633 	return (DDI_SUCCESS);
3634 }
3635 
3636 /*
3637  * Statically allocate Tx buffer list(s).
3638  */
3639 static int
3640 ibd_init_txlist(ibd_state_t *state)
3641 {
3642 	ibd_swqe_t *swqe;
3643 	ibt_lkey_t lkey;
3644 	int i;
3645 	uint_t len;
3646 	uint8_t *bufaddr;
3647 
3648 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3649 		return (DDI_FAILURE);
3650 
3651 	if (state->id_lso_policy && state->id_lso_capable) {
3652 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3653 			state->id_lso_capable = B_FALSE;
3654 	}
3655 
3656 	mutex_enter(&state->id_tx_list.dl_mutex);
3657 	state->id_tx_list.dl_head = NULL;
3658 	state->id_tx_list.dl_pending_sends = B_FALSE;
3659 	state->id_tx_list.dl_cnt = 0;
3660 	mutex_exit(&state->id_tx_list.dl_mutex);
3661 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3662 	state->id_tx_rel_list.dl_head = NULL;
3663 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3664 	state->id_tx_rel_list.dl_cnt = 0;
3665 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3666 
3667 	/*
3668 	 * Allocate and setup the swqe list
3669 	 */
3670 	lkey = state->id_tx_mr_desc.md_lkey;
3671 	bufaddr = state->id_tx_bufs;
3672 	len = state->id_tx_buf_sz;
3673 	swqe = state->id_tx_wqes;
3674 	mutex_enter(&state->id_tx_list.dl_mutex);
3675 	for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3676 		swqe->swqe_next = NULL;
3677 		swqe->swqe_im_mblk = NULL;
3678 
3679 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3680 		    bufaddr;
3681 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3682 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3683 
3684 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3685 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3686 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3687 
3688 		/* These are set in send */
3689 		swqe->w_swr.wr_nds = 0;
3690 		swqe->w_swr.wr_sgl = NULL;
3691 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3692 
3693 		/* add to list */
3694 		state->id_tx_list.dl_cnt++;
3695 		swqe->swqe_next = state->id_tx_list.dl_head;
3696 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3697 	}
3698 	mutex_exit(&state->id_tx_list.dl_mutex);
3699 
3700 	return (DDI_SUCCESS);
3701 }
3702 
3703 static int
3704 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3705     uint32_t *nds_p)
3706 {
3707 	ibd_lsobkt_t *bktp;
3708 	ibd_lsobuf_t *lbufp;
3709 	ibd_lsobuf_t *nextp;
3710 	ibt_lkey_t lso_lkey;
3711 	uint_t frag_sz;
3712 	uint_t num_needed;
3713 	int i;
3714 
3715 	ASSERT(sgl_p != NULL);
3716 	ASSERT(nds_p != NULL);
3717 	ASSERT(req_sz != 0);
3718 
3719 	/*
3720 	 * Determine how many bufs we'd need for the size requested
3721 	 */
3722 	num_needed = req_sz / IBD_LSO_BUFSZ;
3723 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3724 		num_needed++;
3725 
3726 	mutex_enter(&state->id_lso_lock);
3727 
3728 	/*
3729 	 * If we don't have enough lso bufs, return failure
3730 	 */
3731 	ASSERT(state->id_lso != NULL);
3732 	bktp = state->id_lso;
3733 	if (bktp->bkt_nfree < num_needed) {
3734 		mutex_exit(&state->id_lso_lock);
3735 		return (-1);
3736 	}
3737 
3738 	/*
3739 	 * Pick the first 'num_needed' bufs from the free list
3740 	 */
3741 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3742 	lbufp = bktp->bkt_free_head;
3743 	for (i = 0; i < num_needed; i++) {
3744 		ASSERT(lbufp->lb_isfree != 0);
3745 		ASSERT(lbufp->lb_buf != NULL);
3746 
3747 		nextp = lbufp->lb_next;
3748 
3749 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3750 		sgl_p[i].ds_key = lso_lkey;
3751 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3752 
3753 		lbufp->lb_isfree = 0;
3754 		lbufp->lb_next = NULL;
3755 
3756 		lbufp = nextp;
3757 	}
3758 	bktp->bkt_free_head = lbufp;
3759 
3760 	/*
3761 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3762 	 * to adjust the last sgl entry's length. Since we know we need atleast
3763 	 * one, the i-1 use below is ok.
3764 	 */
3765 	if (frag_sz) {
3766 		sgl_p[i-1].ds_len = frag_sz;
3767 	}
3768 
3769 	/*
3770 	 * Update nfree count and return
3771 	 */
3772 	bktp->bkt_nfree -= num_needed;
3773 
3774 	mutex_exit(&state->id_lso_lock);
3775 
3776 	*nds_p = num_needed;
3777 
3778 	return (0);
3779 }
3780 
3781 static void
3782 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3783 {
3784 	ibd_lsobkt_t *bktp;
3785 	ibd_lsobuf_t *lbufp;
3786 	uint8_t *lso_mem_end;
3787 	uint_t ndx;
3788 	int i;
3789 
3790 	mutex_enter(&state->id_lso_lock);
3791 
3792 	bktp = state->id_lso;
3793 	ASSERT(bktp != NULL);
3794 
3795 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3796 	for (i = 0; i < nds; i++) {
3797 		uint8_t *va;
3798 
3799 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3800 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3801 
3802 		/*
3803 		 * Figure out the buflist element this sgl buffer corresponds
3804 		 * to and put it back at the head
3805 		 */
3806 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3807 		lbufp = bktp->bkt_bufl + ndx;
3808 
3809 		ASSERT(lbufp->lb_isfree == 0);
3810 		ASSERT(lbufp->lb_buf == va);
3811 
3812 		lbufp->lb_isfree = 1;
3813 		lbufp->lb_next = bktp->bkt_free_head;
3814 		bktp->bkt_free_head = lbufp;
3815 	}
3816 	bktp->bkt_nfree += nds;
3817 
3818 	mutex_exit(&state->id_lso_lock);
3819 }
3820 
3821 static void
3822 ibd_free_tx_copybufs(ibd_state_t *state)
3823 {
3824 	/*
3825 	 * Unregister txbuf mr
3826 	 */
3827 	if (ibt_deregister_mr(state->id_hca_hdl,
3828 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3829 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3830 	}
3831 	state->id_tx_mr_hdl = NULL;
3832 
3833 	/*
3834 	 * Free txbuf memory
3835 	 */
3836 	kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3837 	    sizeof (ibd_swqe_t));
3838 	kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3839 	    state->id_tx_buf_sz);
3840 	state->id_tx_wqes = NULL;
3841 	state->id_tx_bufs = NULL;
3842 }
3843 
3844 static void
3845 ibd_free_tx_lsobufs(ibd_state_t *state)
3846 {
3847 	ibd_lsobkt_t *bktp;
3848 
3849 	mutex_enter(&state->id_lso_lock);
3850 
3851 	if ((bktp = state->id_lso) == NULL) {
3852 		mutex_exit(&state->id_lso_lock);
3853 		return;
3854 	}
3855 
3856 	/*
3857 	 * First, free the buflist
3858 	 */
3859 	ASSERT(bktp->bkt_bufl != NULL);
3860 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3861 
3862 	/*
3863 	 * Unregister the LSO memory and free it
3864 	 */
3865 	ASSERT(bktp->bkt_mr_hdl != NULL);
3866 	if (ibt_deregister_mr(state->id_hca_hdl,
3867 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3868 		DPRINT(10,
3869 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3870 	}
3871 	ASSERT(bktp->bkt_mem);
3872 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3873 
3874 	/*
3875 	 * Finally free the bucket
3876 	 */
3877 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3878 	state->id_lso = NULL;
3879 
3880 	mutex_exit(&state->id_lso_lock);
3881 }
3882 
3883 /*
3884  * Free the statically allocated Tx buffer list.
3885  */
3886 static void
3887 ibd_fini_txlist(ibd_state_t *state)
3888 {
3889 	/*
3890 	 * Free the allocated swqes
3891 	 */
3892 	mutex_enter(&state->id_tx_list.dl_mutex);
3893 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3894 	state->id_tx_list.dl_head = NULL;
3895 	state->id_tx_list.dl_pending_sends = B_FALSE;
3896 	state->id_tx_list.dl_cnt = 0;
3897 	state->id_tx_rel_list.dl_head = NULL;
3898 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3899 	state->id_tx_rel_list.dl_cnt = 0;
3900 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3901 	mutex_exit(&state->id_tx_list.dl_mutex);
3902 
3903 	ibd_free_tx_lsobufs(state);
3904 	ibd_free_tx_copybufs(state);
3905 }
3906 
3907 /*
3908  * post a list of rwqes, NULL terminated.
3909  */
3910 static void
3911 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3912 {
3913 	uint_t		i;
3914 	uint_t		num_posted;
3915 	ibt_status_t	ibt_status;
3916 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3917 
3918 	while (rwqe) {
3919 		/* Post up to IBD_RX_POST_CNT receive work requests */
3920 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3921 			wrs[i] = rwqe->w_rwr;
3922 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3923 			if (rwqe == NULL) {
3924 				i++;
3925 				break;
3926 			}
3927 		}
3928 
3929 		/*
3930 		 * If posting fails for some reason, we'll never receive
3931 		 * completion intimation, so we'll need to cleanup. But
3932 		 * we need to make sure we don't clean up nodes whose
3933 		 * wrs have been successfully posted. We assume that the
3934 		 * hca driver returns on the first failure to post and
3935 		 * therefore the first 'num_posted' entries don't need
3936 		 * cleanup here.
3937 		 */
3938 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3939 
3940 		num_posted = 0;
3941 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3942 		    &num_posted);
3943 		if (ibt_status != IBT_SUCCESS) {
3944 			/* This cannot happen unless the device has an error. */
3945 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3946 			    "posting multiple wrs failed: "
3947 			    "requested=%d, done=%d, ret=%d",
3948 			    IBD_RX_POST_CNT, num_posted, ibt_status);
3949 			atomic_add_32(&state->id_rx_list.dl_cnt,
3950 			    num_posted - i);
3951 		}
3952 	}
3953 }
3954 
3955 /*
3956  * Grab a list of rwqes from the array of lists, and post the list.
3957  */
3958 static void
3959 ibd_post_recv_intr(ibd_state_t *state)
3960 {
3961 	ibd_rx_queue_t	*rxp;
3962 	ibd_rwqe_t *list;
3963 
3964 	/* rotate through the rx_queue array, expecting an adequate number */
3965 	state->id_rx_post_queue_index =
3966 	    (state->id_rx_post_queue_index + 1) &
3967 	    (state->id_rx_nqueues - 1);
3968 
3969 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3970 	mutex_enter(&rxp->rx_post_lock);
3971 	list = WQE_TO_RWQE(rxp->rx_head);
3972 	rxp->rx_head = NULL;
3973 	rxp->rx_cnt = 0;
3974 	mutex_exit(&rxp->rx_post_lock);
3975 	ibd_post_recv_list(state, list);
3976 }
3977 
3978 /* macro explained below */
3979 #define	RX_QUEUE_HASH(rwqe) \
3980 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3981 
3982 /*
3983  * Add a rwqe to one of the the Rx lists.  If the list is large enough
3984  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3985  *
3986  * Note: one of 2^N lists is chosen via a hash.  This is done
3987  * because using one list is contentious.  If the first list is busy
3988  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3989  *
3990  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3991  * even distribution of mapping rwqes to the 2^N queues.
3992  */
3993 static void
3994 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3995 {
3996 	ibd_rx_queue_t	*rxp;
3997 
3998 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3999 
4000 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
4001 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
4002 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
4003 		mutex_enter(&rxp->rx_post_lock);
4004 	}
4005 	rwqe->rwqe_next = rxp->rx_head;
4006 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4007 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4008 
4009 		/* only call ibt_post_recv() every Nth time through here */
4010 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
4011 			rxp->rx_head = NULL;
4012 			rxp->rx_cnt = 0;
4013 			mutex_exit(&rxp->rx_post_lock);
4014 			ibd_post_recv_list(state, rwqe);
4015 			return;
4016 		}
4017 	}
4018 	rxp->rx_head = RWQE_TO_WQE(rwqe);
4019 	mutex_exit(&rxp->rx_post_lock);
4020 }
4021 
4022 static int
4023 ibd_alloc_rx_copybufs(ibd_state_t *state)
4024 {
4025 	ibt_mr_attr_t mem_attr;
4026 	int i;
4027 
4028 	/*
4029 	 * Allocate one big chunk for all regular rx copy bufs
4030 	 */
4031 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
4032 
4033 	state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4034 	    state->id_rx_buf_sz, KM_SLEEP);
4035 
4036 	state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4037 	    sizeof (ibd_rwqe_t), KM_SLEEP);
4038 
4039 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4040 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4041 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
4042 	for (i = 0; i < state->id_rx_nqueues; i++) {
4043 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4044 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4045 	}
4046 
4047 	/*
4048 	 * Do one memory registration on the entire rxbuf area
4049 	 */
4050 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4051 	mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4052 	mem_attr.mr_as = NULL;
4053 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4054 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4055 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4056 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4057 		kmem_free(state->id_rx_wqes,
4058 		    state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4059 		kmem_free(state->id_rx_bufs,
4060 		    state->id_ud_num_rwqe * state->id_rx_buf_sz);
4061 		state->id_rx_bufs = NULL;
4062 		state->id_rx_wqes = NULL;
4063 		return (DDI_FAILURE);
4064 	}
4065 
4066 	return (DDI_SUCCESS);
4067 }
4068 
4069 /*
4070  * Allocate the statically allocated Rx buffer list.
4071  */
4072 static int
4073 ibd_init_rxlist(ibd_state_t *state)
4074 {
4075 	ibd_rwqe_t *rwqe, *next;
4076 	ibd_wqe_t *list;
4077 	ibt_lkey_t lkey;
4078 	int i;
4079 	uint_t len;
4080 	uint8_t *bufaddr;
4081 
4082 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4083 	if (state->id_rx_free_list.dl_head != NULL) {
4084 		/* rx rsrcs were never freed.  Just repost them */
4085 		len = state->id_rx_buf_sz;
4086 		list = state->id_rx_free_list.dl_head;
4087 		state->id_rx_free_list.dl_head = NULL;
4088 		state->id_rx_free_list.dl_cnt = 0;
4089 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4090 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4091 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4092 			if ((rwqe->rwqe_im_mblk = desballoc(
4093 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4094 			    &rwqe->w_freemsg_cb)) == NULL) {
4095 				/* allow freemsg_cb to free the rwqes */
4096 				if (atomic_dec_32_nv(&state->id_running) != 0) {
4097 					cmn_err(CE_WARN, "ibd_init_rxlist: "
4098 					    "id_running was not 1\n");
4099 				}
4100 				DPRINT(10, "ibd_init_rxlist : "
4101 				    "failed in desballoc()");
4102 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4103 				    rwqe = next) {
4104 					next = WQE_TO_RWQE(rwqe->rwqe_next);
4105 					if (rwqe->rwqe_im_mblk) {
4106 						atomic_inc_32(&state->
4107 						    id_rx_list.
4108 						    dl_bufs_outstanding);
4109 						freemsg(rwqe->rwqe_im_mblk);
4110 					} else
4111 						ibd_free_rwqe(state, rwqe);
4112 				}
4113 				atomic_inc_32(&state->id_running);
4114 				return (DDI_FAILURE);
4115 			}
4116 		}
4117 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
4118 		return (DDI_SUCCESS);
4119 	}
4120 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4121 
4122 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4123 		return (DDI_FAILURE);
4124 
4125 	/*
4126 	 * Allocate and setup the rwqe list
4127 	 */
4128 	len = state->id_rx_buf_sz;
4129 	lkey = state->id_rx_mr_desc.md_lkey;
4130 	rwqe = state->id_rx_wqes;
4131 	bufaddr = state->id_rx_bufs;
4132 	list = NULL;
4133 	for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4134 		rwqe->w_state = state;
4135 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4136 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4137 
4138 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4139 
4140 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4141 		    &rwqe->w_freemsg_cb)) == NULL) {
4142 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4143 			/* allow freemsg_cb to free the rwqes */
4144 			if (atomic_dec_32_nv(&state->id_running) != 0) {
4145 				cmn_err(CE_WARN, "ibd_init_rxlist: "
4146 				    "id_running was not 1\n");
4147 			}
4148 			DPRINT(10, "ibd_init_rxlist : "
4149 			    "failed in desballoc()");
4150 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4151 			    rwqe = next) {
4152 				next = WQE_TO_RWQE(rwqe->rwqe_next);
4153 				freemsg(rwqe->rwqe_im_mblk);
4154 			}
4155 			atomic_inc_32(&state->id_running);
4156 
4157 			/* remove reference to free'd rwqes */
4158 			mutex_enter(&state->id_rx_free_list.dl_mutex);
4159 			state->id_rx_free_list.dl_head = NULL;
4160 			state->id_rx_free_list.dl_cnt = 0;
4161 			mutex_exit(&state->id_rx_free_list.dl_mutex);
4162 
4163 			ibd_fini_rxlist(state);
4164 			return (DDI_FAILURE);
4165 		}
4166 
4167 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4168 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
4169 		    (ib_vaddr_t)(uintptr_t)bufaddr;
4170 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4171 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4172 		rwqe->w_rwr.wr_nds = 1;
4173 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4174 
4175 		rwqe->rwqe_next = list;
4176 		list = RWQE_TO_WQE(rwqe);
4177 	}
4178 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
4179 
4180 	return (DDI_SUCCESS);
4181 }
4182 
4183 static void
4184 ibd_free_rx_copybufs(ibd_state_t *state)
4185 {
4186 	int i;
4187 
4188 	/*
4189 	 * Unregister rxbuf mr
4190 	 */
4191 	if (ibt_deregister_mr(state->id_hca_hdl,
4192 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
4193 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4194 	}
4195 	state->id_rx_mr_hdl = NULL;
4196 
4197 	/*
4198 	 * Free rxbuf memory
4199 	 */
4200 	for (i = 0; i < state->id_rx_nqueues; i++) {
4201 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4202 		mutex_destroy(&rxp->rx_post_lock);
4203 	}
4204 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4205 	    sizeof (ibd_rx_queue_t));
4206 	kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4207 	    sizeof (ibd_rwqe_t));
4208 	kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4209 	    state->id_rx_buf_sz);
4210 	state->id_rx_queues = NULL;
4211 	state->id_rx_wqes = NULL;
4212 	state->id_rx_bufs = NULL;
4213 }
4214 
4215 static void
4216 ibd_free_rx_rsrcs(ibd_state_t *state)
4217 {
4218 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4219 	if (state->id_rx_free_list.dl_head == NULL) {
4220 		/* already freed */
4221 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4222 		return;
4223 	}
4224 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4225 	ibd_free_rx_copybufs(state);
4226 	state->id_rx_free_list.dl_cnt = 0;
4227 	state->id_rx_free_list.dl_head = NULL;
4228 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4229 }
4230 
4231 /*
4232  * Free the statically allocated Rx buffer list.
4233  */
4234 static void
4235 ibd_fini_rxlist(ibd_state_t *state)
4236 {
4237 	ibd_rwqe_t *rwqe;
4238 	int i;
4239 
4240 	/* run through the rx_queue's, calling freemsg() */
4241 	for (i = 0; i < state->id_rx_nqueues; i++) {
4242 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4243 		mutex_enter(&rxp->rx_post_lock);
4244 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4245 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4246 			freemsg(rwqe->rwqe_im_mblk);
4247 			rxp->rx_cnt--;
4248 		}
4249 		rxp->rx_head = NULL;
4250 		mutex_exit(&rxp->rx_post_lock);
4251 	}
4252 
4253 	/* cannot free rx resources unless gld returned everything */
4254 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4255 		ibd_free_rx_rsrcs(state);
4256 }
4257 
4258 /*
4259  * Free an allocated recv wqe.
4260  */
4261 /* ARGSUSED */
4262 static void
4263 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4264 {
4265 	/*
4266 	 * desballoc() failed (no memory).
4267 	 *
4268 	 * This rwqe is placed on a free list so that it
4269 	 * can be reinstated when memory is available.
4270 	 *
4271 	 * NOTE: no code currently exists to reinstate
4272 	 * these "lost" rwqes.
4273 	 */
4274 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4275 	state->id_rx_free_list.dl_cnt++;
4276 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4277 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4278 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4279 }
4280 
4281 /*
4282  * IBA Rx completion queue handler. Guaranteed to be single
4283  * threaded and nonreentrant for this CQ.
4284  */
4285 /* ARGSUSED */
4286 static void
4287 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4288 {
4289 	ibd_state_t *state = (ibd_state_t *)arg;
4290 
4291 	atomic_inc_64(&state->id_num_intrs);
4292 
4293 	if (ibd_rx_softintr == 1) {
4294 		mutex_enter(&state->id_rcq_poll_lock);
4295 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4296 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4297 			mutex_exit(&state->id_rcq_poll_lock);
4298 			return;
4299 		} else {
4300 			mutex_exit(&state->id_rcq_poll_lock);
4301 			ddi_trigger_softintr(state->id_rx);
4302 		}
4303 	} else
4304 		(void) ibd_intr((caddr_t)state);
4305 }
4306 
4307 /*
4308  * CQ handler for Tx completions, when the Tx CQ is in
4309  * interrupt driven mode.
4310  */
4311 /* ARGSUSED */
4312 static void
4313 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4314 {
4315 	ibd_state_t *state = (ibd_state_t *)arg;
4316 
4317 	atomic_inc_64(&state->id_num_intrs);
4318 
4319 	if (ibd_tx_softintr == 1) {
4320 		mutex_enter(&state->id_scq_poll_lock);
4321 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4322 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4323 			mutex_exit(&state->id_scq_poll_lock);
4324 			return;
4325 		} else {
4326 			mutex_exit(&state->id_scq_poll_lock);
4327 			ddi_trigger_softintr(state->id_tx);
4328 		}
4329 	} else
4330 		(void) ibd_tx_recycle((caddr_t)state);
4331 }
4332 
4333 /*
4334  * Multicast group create/delete trap handler. These will be delivered
4335  * on a kernel thread (handling can thus block) and can be invoked
4336  * concurrently. The handler can be invoked anytime after it is
4337  * registered and before ibt_detach().
4338  */
4339 /* ARGSUSED */
4340 static void
4341 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4342     ibt_subnet_event_t *event)
4343 {
4344 	ibd_state_t *state = (ibd_state_t *)arg;
4345 	ibd_req_t *req;
4346 
4347 	/*
4348 	 * The trap handler will get invoked once for every event for
4349 	 * every port. The input "gid" is the GID0 of the port the
4350 	 * trap came in on; we just need to act on traps that came
4351 	 * to our port, meaning the port on which the ipoib interface
4352 	 * resides. Since ipoib uses GID0 of the port, we just match
4353 	 * the gids to check whether we need to handle the trap.
4354 	 */
4355 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4356 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4357 		return;
4358 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4359 
4360 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4361 
4362 	switch (code) {
4363 		case IBT_SM_EVENT_UNAVAILABLE:
4364 			/*
4365 			 * If we are in promiscuous mode or have
4366 			 * sendnonmembers, we need to print a warning
4367 			 * message right now. Else, just store the
4368 			 * information, print when we enter promiscuous
4369 			 * mode or attempt nonmember send. We might
4370 			 * also want to stop caching sendnonmember.
4371 			 */
4372 			ibd_print_warn(state, "IBA multicast support "
4373 			    "degraded due to unavailability of multicast "
4374 			    "traps");
4375 			break;
4376 		case IBT_SM_EVENT_AVAILABLE:
4377 			/*
4378 			 * If we printed a warning message above or
4379 			 * while trying to nonmember send or get into
4380 			 * promiscuous mode, print an okay message.
4381 			 */
4382 			ibd_print_warn(state, "IBA multicast support "
4383 			    "restored due to availability of multicast "
4384 			    "traps");
4385 			break;
4386 		case IBT_SM_EVENT_MCG_CREATED:
4387 		case IBT_SM_EVENT_MCG_DELETED:
4388 			/*
4389 			 * If it is a "deleted" event and we are in late hca
4390 			 * init, nothing to do.
4391 			 */
4392 			if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4393 			    IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4394 			    IBT_SM_EVENT_MCG_DELETED)) {
4395 				break;
4396 			}
4397 			/*
4398 			 * Common processing of creation/deletion traps.
4399 			 * First check if the instance is being
4400 			 * [de]initialized; back off then, without doing
4401 			 * anything more, since we are not sure if the
4402 			 * async thread is around, or whether we might
4403 			 * be racing with the detach code in ibd_m_stop()
4404 			 * that scans the mcg list.
4405 			 */
4406 			if (!ibd_async_safe(state))
4407 				return;
4408 
4409 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4410 			req->rq_gid = event->sm_notice_gid;
4411 			req->rq_ptr = (void *)code;
4412 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4413 			break;
4414 	}
4415 }
4416 
4417 static void
4418 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4419 {
4420 	ib_gid_t mgid = req->rq_gid;
4421 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4422 	int ret;
4423 	ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4424 
4425 	DPRINT(10, "ibd_async_trap : %d\n", code);
4426 
4427 	/*
4428 	 * Check if we have already joined the IPoIB broadcast group for our
4429 	 * PKEY. If joined, perform the rest of the operation.
4430 	 * Else, the interface is not initialised. Do the initialisation here
4431 	 * by calling ibd_start() and return.
4432 	 */
4433 
4434 	if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4435 	    IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4436 	    (code == IBT_SM_EVENT_MCG_CREATED)) {
4437 		/*
4438 		 * If we are in late HCA init and a notification for the
4439 		 * creation of a MCG came in, check if it is the IPoIB MCG for
4440 		 * this pkey. If not, return.
4441 		 */
4442 		if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4443 		    state->id_pkey)) {
4444 			ibd_async_done(state);
4445 			return;
4446 		}
4447 		ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4448 		/*
4449 		 * Check if there is still a necessity to start the interface.
4450 		 * It is possible that the user attempted unplumb at just about
4451 		 * the same time, and if unplumb succeeded, we have nothing to
4452 		 * do.
4453 		 */
4454 		if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4455 		    IBD_DRV_IN_LATE_HCA_INIT) &&
4456 		    ((ret = ibd_start(state)) != 0)) {
4457 			DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4458 			    "init, ret=%d", ret);
4459 		}
4460 		ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4461 		ibd_async_done(state);
4462 		return;
4463 	}
4464 
4465 	/*
4466 	 * Atomically search the nonmember and sendonlymember lists and
4467 	 * delete.
4468 	 */
4469 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4470 
4471 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4472 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4473 
4474 		/*
4475 		 * If in promiscuous mode, try to join/attach to the new
4476 		 * mcg. Given the unreliable out-of-order mode of trap
4477 		 * delivery, we can never be sure whether it is a problem
4478 		 * if the join fails. Thus, we warn the admin of a failure
4479 		 * if this was a creation trap. Note that the trap might
4480 		 * actually be reporting a long past event, and the mcg
4481 		 * might already have been deleted, thus we might be warning
4482 		 * in vain.
4483 		 */
4484 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4485 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4486 			ibd_print_warn(state, "IBA promiscuous mode missed "
4487 			    "new multicast gid %016llx:%016llx",
4488 			    (u_longlong_t)mgid.gid_prefix,
4489 			    (u_longlong_t)mgid.gid_guid);
4490 	}
4491 
4492 	/*
4493 	 * Free the request slot allocated by the subnet event thread.
4494 	 */
4495 	ibd_async_done(state);
4496 }
4497 
4498 /*
4499  * GLDv3 entry point to get capabilities.
4500  */
4501 static boolean_t
4502 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4503 {
4504 	ibd_state_t *state = arg;
4505 
4506 	if (state->id_type == IBD_PORT_DRIVER)
4507 		return (B_FALSE);
4508 
4509 	switch (cap) {
4510 	case MAC_CAPAB_HCKSUM: {
4511 		uint32_t *txflags = cap_data;
4512 
4513 		/*
4514 		 * We either do full checksum or not do it at all
4515 		 */
4516 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4517 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4518 		else
4519 			return (B_FALSE);
4520 		break;
4521 	}
4522 
4523 	case MAC_CAPAB_LSO: {
4524 		mac_capab_lso_t *cap_lso = cap_data;
4525 
4526 		/*
4527 		 * In addition to the capability and policy, since LSO
4528 		 * relies on hw checksum, we'll not enable LSO if we
4529 		 * don't have hw checksum.  Of course, if the HCA doesn't
4530 		 * provide the reserved lkey capability, enabling LSO will
4531 		 * actually affect performance adversely, so we'll disable
4532 		 * LSO even for that case.
4533 		 */
4534 		if (!state->id_lso_policy || !state->id_lso_capable)
4535 			return (B_FALSE);
4536 
4537 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4538 			return (B_FALSE);
4539 
4540 		if (state->id_hca_res_lkey_capab == 0) {
4541 			ibd_print_warn(state, "no reserved-lkey capability, "
4542 			    "disabling LSO");
4543 			return (B_FALSE);
4544 		}
4545 
4546 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4547 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4548 		break;
4549 	}
4550 
4551 	default:
4552 		return (B_FALSE);
4553 	}
4554 
4555 	return (B_TRUE);
4556 }
4557 
4558 /*
4559  * callback function for set/get of properties
4560  */
4561 static int
4562 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4563     uint_t pr_valsize, const void *pr_val)
4564 {
4565 	ibd_state_t *state = arg;
4566 	int err = 0;
4567 	uint32_t link_mode;
4568 
4569 	/* Cannot set properties on a port driver */
4570 	if (state->id_type == IBD_PORT_DRIVER) {
4571 		return (ENOTSUP);
4572 	}
4573 
4574 	switch (pr_num) {
4575 		case MAC_PROP_IB_LINKMODE:
4576 			if (state->id_mac_state & IBD_DRV_STARTED) {
4577 				err = EBUSY;
4578 				break;
4579 			}
4580 			if (pr_val == NULL) {
4581 				err = EINVAL;
4582 				break;
4583 			}
4584 			bcopy(pr_val, &link_mode, sizeof (link_mode));
4585 			if (link_mode != IBD_LINK_MODE_UD &&
4586 			    link_mode != IBD_LINK_MODE_RC) {
4587 				err = EINVAL;
4588 			} else {
4589 				if (link_mode == IBD_LINK_MODE_RC) {
4590 					if (state->id_enable_rc) {
4591 						return (0);
4592 					}
4593 					state->id_enable_rc = 1;
4594 					/* inform MAC framework of new MTU */
4595 					err = mac_maxsdu_update(state->id_mh,
4596 					    state->rc_mtu - IPOIB_HDRSIZE);
4597 				} else {
4598 					if (!state->id_enable_rc) {
4599 						return (0);
4600 					}
4601 					state->id_enable_rc = 0;
4602 					err = mac_maxsdu_update(state->id_mh,
4603 					    state->id_mtu - IPOIB_HDRSIZE);
4604 				}
4605 				(void) ibd_record_capab(state);
4606 				mac_capab_update(state->id_mh);
4607 			}
4608 			break;
4609 		case MAC_PROP_PRIVATE:
4610 			err = ibd_set_priv_prop(state, pr_name,
4611 			    pr_valsize, pr_val);
4612 			break;
4613 		default:
4614 			err = ENOTSUP;
4615 			break;
4616 	}
4617 	return (err);
4618 }
4619 
4620 static int
4621 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4622     uint_t pr_valsize, void *pr_val)
4623 {
4624 	ibd_state_t *state = arg;
4625 	int err = 0;
4626 
4627 	switch (pr_num) {
4628 		case MAC_PROP_MTU:
4629 			break;
4630 		default:
4631 			if (state->id_type == IBD_PORT_DRIVER) {
4632 				return (ENOTSUP);
4633 			}
4634 			break;
4635 	}
4636 
4637 	switch (pr_num) {
4638 		case MAC_PROP_IB_LINKMODE:
4639 			*(uint_t *)pr_val = state->id_enable_rc;
4640 			break;
4641 		case MAC_PROP_PRIVATE:
4642 			err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4643 			    pr_val);
4644 			break;
4645 		default:
4646 			err = ENOTSUP;
4647 			break;
4648 	}
4649 	return (err);
4650 }
4651 
4652 static void
4653 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4654     mac_prop_info_handle_t prh)
4655 {
4656 	ibd_state_t *state = arg;
4657 
4658 	switch (pr_num) {
4659 	case MAC_PROP_IB_LINKMODE: {
4660 		mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4661 		break;
4662 	}
4663 	case MAC_PROP_MTU: {
4664 		uint32_t min, max;
4665 		if (state->id_type == IBD_PORT_DRIVER) {
4666 			min = 1500;
4667 			max = IBD_DEF_RC_MAX_SDU;
4668 		} else if (state->id_enable_rc) {
4669 			min = max = IBD_DEF_RC_MAX_SDU;
4670 		} else {
4671 			min = max = state->id_mtu - IPOIB_HDRSIZE;
4672 		}
4673 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4674 		mac_prop_info_set_range_uint32(prh, min, max);
4675 		break;
4676 	}
4677 	case MAC_PROP_PRIVATE: {
4678 		char valstr[64];
4679 		int value;
4680 
4681 		if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4682 			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4683 			return;
4684 		} else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4685 			value = IBD_DEF_COALESCE_COMPLETIONS;
4686 		} else if (strcmp(pr_name,
4687 		    "_ibd_create_broadcast_group") == 0) {
4688 			value = IBD_DEF_CREATE_BCAST_GROUP;
4689 		} else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4690 			value = IBD_DEF_HASH_SIZE;
4691 		} else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4692 			value = IBD_DEF_LSO_POLICY;
4693 		} else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4694 			value = IBD_DEF_NUM_AH;
4695 		} else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4696 			value = IBD_DEF_NUM_LSO_BUFS;
4697 		} else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4698 			value = IBD_DEF_RC_ENABLE_SRQ;
4699 		} else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4700 			value = IBD_DEF_RC_NUM_RWQE;
4701 		} else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4702 			value = IBD_DEF_RC_NUM_SRQ;
4703 		} else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4704 			value = IBD_DEF_RC_NUM_SWQE;
4705 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4706 			value = IBD_DEF_RC_RX_COMP_COUNT;
4707 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4708 			value = IBD_DEF_RC_RX_COMP_USEC;
4709 		} else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4710 			value = IBD_DEF_RC_RX_COPY_THRESH;
4711 		} else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4712 			value = IBD_DEF_RC_RX_RWQE_THRESH;
4713 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4714 			value = IBD_DEF_RC_TX_COMP_COUNT;
4715 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4716 			value = IBD_DEF_RC_TX_COMP_USEC;
4717 		} else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4718 			value = IBD_DEF_RC_TX_COPY_THRESH;
4719 		} else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4720 			value = IBD_DEF_UD_NUM_RWQE;
4721 		} else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4722 			value = IBD_DEF_UD_NUM_SWQE;
4723 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4724 			value = IBD_DEF_UD_RX_COMP_COUNT;
4725 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4726 			value = IBD_DEF_UD_RX_COMP_USEC;
4727 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4728 			value = IBD_DEF_UD_TX_COMP_COUNT;
4729 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4730 			value = IBD_DEF_UD_TX_COMP_USEC;
4731 		} else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4732 			value = IBD_DEF_UD_TX_COPY_THRESH;
4733 		} else {
4734 			return;
4735 		}
4736 
4737 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
4738 		mac_prop_info_set_default_str(prh, valstr);
4739 		break;
4740 	}
4741 	} /* switch (pr_num) */
4742 }
4743 
4744 /* ARGSUSED2 */
4745 static int
4746 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4747     uint_t pr_valsize, const void *pr_val)
4748 {
4749 	int err = 0;
4750 	long result;
4751 
4752 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4753 		if (pr_val == NULL) {
4754 			return (EINVAL);
4755 		}
4756 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4757 		if (result < 0 || result > 1) {
4758 			err = EINVAL;
4759 		} else {
4760 			state->id_allow_coalesce_comp_tuning = (result == 1) ?
4761 			    B_TRUE: B_FALSE;
4762 		}
4763 		return (err);
4764 	}
4765 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4766 		if (state->id_mac_state & IBD_DRV_STARTED) {
4767 			return (EBUSY);
4768 		}
4769 		if (pr_val == NULL) {
4770 			return (EINVAL);
4771 		}
4772 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4773 		if (result < 0 || result > 1) {
4774 			err = EINVAL;
4775 		} else {
4776 			state->id_create_broadcast_group = (result == 1) ?
4777 			    B_TRUE: B_FALSE;
4778 		}
4779 		return (err);
4780 	}
4781 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4782 		if (state->id_mac_state & IBD_DRV_STARTED) {
4783 			return (EBUSY);
4784 		}
4785 		if (pr_val == NULL) {
4786 			return (EINVAL);
4787 		}
4788 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4789 		if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4790 			err = EINVAL;
4791 		} else {
4792 			state->id_hash_size = (uint32_t)result;
4793 		}
4794 		return (err);
4795 	}
4796 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4797 		if (state->id_mac_state & IBD_DRV_STARTED) {
4798 			return (EBUSY);
4799 		}
4800 		if (pr_val == NULL) {
4801 			return (EINVAL);
4802 		}
4803 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4804 		if (result < 0 || result > 1) {
4805 			err = EINVAL;
4806 		} else {
4807 			state->id_lso_policy = (result == 1) ?
4808 			    B_TRUE: B_FALSE;
4809 		}
4810 		mac_capab_update(state->id_mh);
4811 		return (err);
4812 	}
4813 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4814 		if (state->id_mac_state & IBD_DRV_STARTED) {
4815 			return (EBUSY);
4816 		}
4817 		if (pr_val == NULL) {
4818 			return (EINVAL);
4819 		}
4820 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4821 		if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4822 			err = EINVAL;
4823 		} else {
4824 			state->id_num_ah = (uint32_t)result;
4825 		}
4826 		return (err);
4827 	}
4828 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4829 		if (state->id_mac_state & IBD_DRV_STARTED) {
4830 			return (EBUSY);
4831 		}
4832 		if (!state->id_lso_policy || !state->id_lso_capable) {
4833 			return (EINVAL);
4834 		}
4835 		if (pr_val == NULL) {
4836 			return (EINVAL);
4837 		}
4838 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4839 		if (result < IBD_MIN_NUM_LSO_BUFS ||
4840 		    result > IBD_MAX_NUM_LSO_BUFS) {
4841 			err = EINVAL;
4842 		} else {
4843 			state->id_num_lso_bufs = (uint32_t)result;
4844 		}
4845 		return (err);
4846 	}
4847 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4848 		if (state->id_mac_state & IBD_DRV_STARTED) {
4849 			return (EBUSY);
4850 		}
4851 		if (pr_val == NULL) {
4852 			return (EINVAL);
4853 		}
4854 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4855 		if (result < 0 || result > 1) {
4856 			err = EINVAL;
4857 		} else {
4858 			state->rc_enable_srq = (result == 1) ?
4859 			    B_TRUE: B_FALSE;
4860 		}
4861 		if (!state->rc_enable_srq) {
4862 			state->id_rc_num_srq = 0;
4863 		}
4864 		return (err);
4865 	}
4866 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4867 		if (state->id_mac_state & IBD_DRV_STARTED) {
4868 			return (EBUSY);
4869 		}
4870 		if (pr_val == NULL) {
4871 			return (EINVAL);
4872 		}
4873 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4874 		if (result < IBD_MIN_RC_NUM_RWQE ||
4875 		    result > IBD_MAX_RC_NUM_RWQE) {
4876 			err = EINVAL;
4877 		} else {
4878 			state->id_rc_num_rwqe = (uint32_t)result;
4879 			if (state->id_allow_coalesce_comp_tuning &&
4880 			    state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4881 				state->id_rc_rx_comp_count =
4882 				    state->id_rc_num_rwqe;
4883 			if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4884 				state->id_rc_num_srq =
4885 				    state->id_rc_num_rwqe - 1;
4886 			/*
4887 			 * If rx_rwqe_threshold is greater than the number of
4888 			 * rwqes, pull it back to 25% of number of rwqes.
4889 			 */
4890 			if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4891 				state->id_rc_rx_rwqe_thresh =
4892 				    (state->id_rc_num_rwqe >> 2);
4893 
4894 		}
4895 		return (err);
4896 	}
4897 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4898 		if (state->id_mac_state & IBD_DRV_STARTED) {
4899 			return (EBUSY);
4900 		}
4901 		if (pr_val == NULL) {
4902 			return (EINVAL);
4903 		}
4904 		if (!state->rc_enable_srq)
4905 			return (EINVAL);
4906 
4907 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4908 		if (result < IBD_MIN_RC_NUM_SRQ ||
4909 		    result >= state->id_rc_num_rwqe) {
4910 			err = EINVAL;
4911 		} else
4912 			state->id_rc_num_srq = (uint32_t)result;
4913 		return (err);
4914 	}
4915 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4916 		if (state->id_mac_state & IBD_DRV_STARTED) {
4917 			return (EBUSY);
4918 		}
4919 		if (pr_val == NULL) {
4920 			return (EINVAL);
4921 		}
4922 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4923 		if (result < IBD_MIN_RC_NUM_SWQE ||
4924 		    result > IBD_MAX_RC_NUM_SWQE) {
4925 			err = EINVAL;
4926 		} else {
4927 			state->id_rc_num_swqe = (uint32_t)result;
4928 			if (state->id_allow_coalesce_comp_tuning &&
4929 			    state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4930 				state->id_rc_tx_comp_count =
4931 				    state->id_rc_num_swqe;
4932 		}
4933 		return (err);
4934 	}
4935 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4936 		if (!state->id_allow_coalesce_comp_tuning) {
4937 			return (ENOTSUP);
4938 		}
4939 		if (pr_val == NULL) {
4940 			return (EINVAL);
4941 		}
4942 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4943 		if (result < 1 || result > state->id_rc_num_rwqe) {
4944 			err = EINVAL;
4945 		} else {
4946 			state->id_rc_rx_comp_count = (uint32_t)result;
4947 		}
4948 		return (err);
4949 	}
4950 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4951 		if (!state->id_allow_coalesce_comp_tuning) {
4952 			return (ENOTSUP);
4953 		}
4954 		if (pr_val == NULL) {
4955 			return (EINVAL);
4956 		}
4957 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4958 		if (result < 1) {
4959 			err = EINVAL;
4960 		} else {
4961 			state->id_rc_rx_comp_usec = (uint32_t)result;
4962 		}
4963 		return (err);
4964 	}
4965 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4966 		if (state->id_mac_state & IBD_DRV_STARTED) {
4967 			return (EBUSY);
4968 		}
4969 		if (pr_val == NULL) {
4970 			return (EINVAL);
4971 		}
4972 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4973 		if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4974 		    result > state->rc_mtu) {
4975 			err = EINVAL;
4976 		} else {
4977 			state->id_rc_rx_copy_thresh = (uint32_t)result;
4978 		}
4979 		return (err);
4980 	}
4981 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4982 		if (state->id_mac_state & IBD_DRV_STARTED) {
4983 			return (EBUSY);
4984 		}
4985 		if (pr_val == NULL) {
4986 			return (EINVAL);
4987 		}
4988 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4989 		if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4990 		    result >= state->id_rc_num_rwqe) {
4991 			err = EINVAL;
4992 		} else {
4993 			state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4994 		}
4995 		return (err);
4996 	}
4997 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4998 		if (!state->id_allow_coalesce_comp_tuning) {
4999 			return (ENOTSUP);
5000 		}
5001 		if (pr_val == NULL) {
5002 			return (EINVAL);
5003 		}
5004 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5005 		if (result < 1 || result > state->id_rc_num_swqe) {
5006 			err = EINVAL;
5007 		} else {
5008 			state->id_rc_tx_comp_count = (uint32_t)result;
5009 		}
5010 		return (err);
5011 	}
5012 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5013 		if (!state->id_allow_coalesce_comp_tuning) {
5014 			return (ENOTSUP);
5015 		}
5016 		if (pr_val == NULL) {
5017 			return (EINVAL);
5018 		}
5019 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5020 		if (result < 1)
5021 			err = EINVAL;
5022 		else {
5023 			state->id_rc_tx_comp_usec = (uint32_t)result;
5024 		}
5025 		return (err);
5026 	}
5027 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5028 		if (state->id_mac_state & IBD_DRV_STARTED) {
5029 			return (EBUSY);
5030 		}
5031 		if (pr_val == NULL) {
5032 			return (EINVAL);
5033 		}
5034 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5035 		if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5036 		    result > state->rc_mtu) {
5037 			err = EINVAL;
5038 		} else {
5039 			state->id_rc_tx_copy_thresh = (uint32_t)result;
5040 		}
5041 		return (err);
5042 	}
5043 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5044 		if (state->id_mac_state & IBD_DRV_STARTED) {
5045 			return (EBUSY);
5046 		}
5047 		if (pr_val == NULL) {
5048 			return (EINVAL);
5049 		}
5050 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5051 		if (result < IBD_MIN_UD_NUM_RWQE ||
5052 		    result > IBD_MAX_UD_NUM_RWQE) {
5053 			err = EINVAL;
5054 		} else {
5055 			if (result > state->id_hca_max_chan_sz) {
5056 				state->id_ud_num_rwqe =
5057 				    state->id_hca_max_chan_sz;
5058 			} else {
5059 				state->id_ud_num_rwqe = (uint32_t)result;
5060 			}
5061 			if (state->id_allow_coalesce_comp_tuning &&
5062 			    state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5063 				state->id_ud_rx_comp_count =
5064 				    state->id_ud_num_rwqe;
5065 		}
5066 		return (err);
5067 	}
5068 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5069 		if (state->id_mac_state & IBD_DRV_STARTED) {
5070 			return (EBUSY);
5071 		}
5072 		if (pr_val == NULL) {
5073 			return (EINVAL);
5074 		}
5075 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5076 		if (result < IBD_MIN_UD_NUM_SWQE ||
5077 		    result > IBD_MAX_UD_NUM_SWQE) {
5078 			err = EINVAL;
5079 		} else {
5080 			if (result > state->id_hca_max_chan_sz) {
5081 				state->id_ud_num_swqe =
5082 				    state->id_hca_max_chan_sz;
5083 			} else {
5084 				state->id_ud_num_swqe = (uint32_t)result;
5085 			}
5086 			if (state->id_allow_coalesce_comp_tuning &&
5087 			    state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5088 				state->id_ud_tx_comp_count =
5089 				    state->id_ud_num_swqe;
5090 		}
5091 		return (err);
5092 	}
5093 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5094 		if (!state->id_allow_coalesce_comp_tuning) {
5095 			return (ENOTSUP);
5096 		}
5097 		if (pr_val == NULL) {
5098 			return (EINVAL);
5099 		}
5100 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5101 		if (result < 1 || result > state->id_ud_num_rwqe) {
5102 			err = EINVAL;
5103 		} else {
5104 			state->id_ud_rx_comp_count = (uint32_t)result;
5105 		}
5106 		return (err);
5107 	}
5108 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5109 		if (!state->id_allow_coalesce_comp_tuning) {
5110 			return (ENOTSUP);
5111 		}
5112 		if (pr_val == NULL) {
5113 			return (EINVAL);
5114 		}
5115 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5116 		if (result < 1) {
5117 			err = EINVAL;
5118 		} else {
5119 			state->id_ud_rx_comp_usec = (uint32_t)result;
5120 		}
5121 		return (err);
5122 	}
5123 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5124 		if (!state->id_allow_coalesce_comp_tuning) {
5125 			return (ENOTSUP);
5126 		}
5127 		if (pr_val == NULL) {
5128 			return (EINVAL);
5129 		}
5130 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5131 		if (result < 1 || result > state->id_ud_num_swqe) {
5132 			err = EINVAL;
5133 		} else {
5134 			state->id_ud_tx_comp_count = (uint32_t)result;
5135 		}
5136 		return (err);
5137 	}
5138 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5139 		if (!state->id_allow_coalesce_comp_tuning) {
5140 			return (ENOTSUP);
5141 		}
5142 		if (pr_val == NULL) {
5143 			return (EINVAL);
5144 		}
5145 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5146 		if (result < 1) {
5147 			err = EINVAL;
5148 		} else {
5149 			state->id_ud_tx_comp_usec = (uint32_t)result;
5150 		}
5151 		return (err);
5152 	}
5153 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5154 		if (state->id_mac_state & IBD_DRV_STARTED) {
5155 			return (EBUSY);
5156 		}
5157 		if (pr_val == NULL) {
5158 			return (EINVAL);
5159 		}
5160 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5161 		if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5162 		    result > IBD_MAX_UD_TX_COPY_THRESH) {
5163 			err = EINVAL;
5164 		} else {
5165 			state->id_ud_tx_copy_thresh = (uint32_t)result;
5166 		}
5167 		return (err);
5168 	}
5169 	return (ENOTSUP);
5170 }
5171 
5172 static int
5173 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5174     void *pr_val)
5175 {
5176 	int err = ENOTSUP;
5177 	int value;
5178 
5179 	if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5180 		value = state->id_bgroup_present;
5181 		err = 0;
5182 		goto done;
5183 	}
5184 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5185 		value = state->id_allow_coalesce_comp_tuning;
5186 		err = 0;
5187 		goto done;
5188 	}
5189 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5190 		value = state->id_create_broadcast_group;
5191 		err = 0;
5192 		goto done;
5193 	}
5194 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5195 		value = state->id_hash_size;
5196 		err = 0;
5197 		goto done;
5198 	}
5199 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5200 		value = state->id_lso_policy;
5201 		err = 0;
5202 		goto done;
5203 	}
5204 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5205 		value = state->id_num_ah;
5206 		err = 0;
5207 		goto done;
5208 	}
5209 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5210 		value = state->id_num_lso_bufs;
5211 		err = 0;
5212 		goto done;
5213 	}
5214 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5215 		value = state->rc_enable_srq;
5216 		err = 0;
5217 		goto done;
5218 	}
5219 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5220 		value = state->id_rc_num_rwqe;
5221 		err = 0;
5222 		goto done;
5223 	}
5224 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5225 		value = state->id_rc_num_srq;
5226 		err = 0;
5227 		goto done;
5228 	}
5229 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5230 		value = state->id_rc_num_swqe;
5231 		err = 0;
5232 		goto done;
5233 	}
5234 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5235 		value = state->id_rc_rx_comp_count;
5236 		err = 0;
5237 		goto done;
5238 	}
5239 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5240 		value = state->id_rc_rx_comp_usec;
5241 		err = 0;
5242 		goto done;
5243 	}
5244 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5245 		value = state->id_rc_rx_copy_thresh;
5246 		err = 0;
5247 		goto done;
5248 	}
5249 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5250 		value = state->id_rc_rx_rwqe_thresh;
5251 		err = 0;
5252 		goto done;
5253 	}
5254 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5255 		value = state->id_rc_tx_comp_count;
5256 		err = 0;
5257 		goto done;
5258 	}
5259 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5260 		value = state->id_rc_tx_comp_usec;
5261 		err = 0;
5262 		goto done;
5263 	}
5264 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5265 		value = state->id_rc_tx_copy_thresh;
5266 		err = 0;
5267 		goto done;
5268 	}
5269 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5270 		value = state->id_ud_num_rwqe;
5271 		err = 0;
5272 		goto done;
5273 	}
5274 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5275 		value = state->id_ud_num_swqe;
5276 		err = 0;
5277 		goto done;
5278 	}
5279 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5280 		value = state->id_ud_rx_comp_count;
5281 		err = 0;
5282 		goto done;
5283 	}
5284 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5285 		value = state->id_ud_rx_comp_usec;
5286 		err = 0;
5287 		goto done;
5288 	}
5289 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5290 		value = state->id_ud_tx_comp_count;
5291 		err = 0;
5292 		goto done;
5293 	}
5294 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5295 		value = state->id_ud_tx_comp_usec;
5296 		err = 0;
5297 		goto done;
5298 	}
5299 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5300 		value = state->id_ud_tx_copy_thresh;
5301 		err = 0;
5302 		goto done;
5303 	}
5304 done:
5305 	if (err == 0) {
5306 		(void) snprintf(pr_val, pr_valsize, "%d", value);
5307 	}
5308 	return (err);
5309 }
5310 
5311 static int
5312 ibd_get_port_details(ibd_state_t *state)
5313 {
5314 	ibt_hca_portinfo_t *port_infop;
5315 	ibt_status_t ret;
5316 	uint_t psize, port_infosz;
5317 
5318 	mutex_enter(&state->id_link_mutex);
5319 
5320 	/*
5321 	 * Query for port information
5322 	 */
5323 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5324 	    &port_infop, &psize, &port_infosz);
5325 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
5326 		mutex_exit(&state->id_link_mutex);
5327 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5328 		    "failed, ret=%d", ret);
5329 		return (ENETDOWN);
5330 	}
5331 
5332 	/*
5333 	 * If the link is active, verify the pkey
5334 	 */
5335 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5336 		if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5337 		    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5338 			state->id_link_state = LINK_STATE_DOWN;
5339 		} else {
5340 			state->id_link_state = LINK_STATE_UP;
5341 		}
5342 		state->id_mtu = (128 << port_infop->p_mtu);
5343 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5344 		state->id_sgid = *port_infop->p_sgid_tbl;
5345 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5346 		/*
5347 		 * Now that the port is active, record the port speed
5348 		 */
5349 		state->id_link_speed = ibd_get_portspeed(state);
5350 	} else {
5351 		/* Make sure that these are handled in PORT_UP/CHANGE */
5352 		state->id_mtu = 0;
5353 		state->id_link_state = LINK_STATE_DOWN;
5354 		state->id_link_speed = 0;
5355 	}
5356 	mutex_exit(&state->id_link_mutex);
5357 	ibt_free_portinfo(port_infop, port_infosz);
5358 
5359 	return (0);
5360 }
5361 
5362 static int
5363 ibd_alloc_cqs(ibd_state_t *state)
5364 {
5365 	ibt_hca_attr_t hca_attrs;
5366 	ibt_cq_attr_t cq_attr;
5367 	ibt_status_t ret;
5368 	uint32_t real_size;
5369 	uint_t num_rwqe_change = 0;
5370 	uint_t num_swqe_change = 0;
5371 
5372 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5373 	ASSERT(ret == IBT_SUCCESS);
5374 
5375 	/*
5376 	 * Allocate Rx/combined CQ:
5377 	 * Theoretically, there is no point in having more than #rwqe
5378 	 * plus #swqe cqe's, except that the CQ will be signaled for
5379 	 * overflow when the last wqe completes, if none of the previous
5380 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
5381 	 * to make sure such overflow does not occur.
5382 	 */
5383 	cq_attr.cq_sched = NULL;
5384 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5385 
5386 	/*
5387 	 * Allocate Receive CQ.
5388 	 */
5389 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5390 		cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5391 	} else {
5392 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5393 		num_rwqe_change = state->id_ud_num_rwqe;
5394 		state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5395 	}
5396 
5397 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5398 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5399 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5400 		    "failed, ret=%d\n", ret);
5401 		return (DDI_FAILURE);
5402 	}
5403 
5404 	if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5405 	    state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5406 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5407 		    "moderation failed, ret=%d\n", ret);
5408 	}
5409 
5410 	/* make the #rx wc's the same as max rx chain size */
5411 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5412 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5413 	    state->id_rxwcs_size, KM_SLEEP);
5414 
5415 	/*
5416 	 * Allocate Send CQ.
5417 	 */
5418 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5419 		cq_attr.cq_size = state->id_ud_num_swqe + 1;
5420 	} else {
5421 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5422 		num_swqe_change = state->id_ud_num_swqe;
5423 		state->id_ud_num_swqe = cq_attr.cq_size - 1;
5424 	}
5425 
5426 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5427 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5428 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5429 		    "failed, ret=%d\n", ret);
5430 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5431 		    state->id_rxwcs_size);
5432 		(void) ibt_free_cq(state->id_rcq_hdl);
5433 		return (DDI_FAILURE);
5434 	}
5435 	if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5436 	    state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5437 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5438 		    "moderation failed, ret=%d\n", ret);
5439 	}
5440 
5441 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
5442 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5443 	    state->id_txwcs_size, KM_SLEEP);
5444 
5445 	/*
5446 	 * Print message in case we could not allocate as many wqe's
5447 	 * as was requested.
5448 	 */
5449 	if (num_rwqe_change) {
5450 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5451 		    "%d", state->id_ud_num_rwqe, num_rwqe_change);
5452 	}
5453 	if (num_swqe_change) {
5454 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
5455 		    "%d", state->id_ud_num_swqe, num_swqe_change);
5456 	}
5457 
5458 	return (DDI_SUCCESS);
5459 }
5460 
5461 static int
5462 ibd_setup_ud_channel(ibd_state_t *state)
5463 {
5464 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
5465 	ibt_ud_chan_query_attr_t ud_chan_attr;
5466 	ibt_status_t ret;
5467 
5468 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5469 	if (state->id_hca_res_lkey_capab)
5470 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5471 	if (state->id_lso_policy && state->id_lso_capable)
5472 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5473 
5474 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
5475 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5476 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5477 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5478 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5479 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
5480 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
5481 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
5482 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
5483 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
5484 	ud_alloc_attr.ud_clone_chan	= NULL;
5485 
5486 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5487 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5488 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5489 		    "failed, ret=%d\n", ret);
5490 		return (DDI_FAILURE);
5491 	}
5492 
5493 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5494 	    &ud_chan_attr)) != IBT_SUCCESS) {
5495 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5496 		    "failed, ret=%d\n", ret);
5497 		(void) ibt_free_channel(state->id_chnl_hdl);
5498 		return (DDI_FAILURE);
5499 	}
5500 
5501 	state->id_qpnum = ud_chan_attr.ud_qpn;
5502 
5503 	return (DDI_SUCCESS);
5504 }
5505 
5506 static int
5507 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5508 {
5509 	uint32_t progress = state->id_mac_state;
5510 	uint_t attempts;
5511 	ibt_status_t ret;
5512 	ib_gid_t mgid;
5513 	ibd_mce_t *mce;
5514 	uint8_t jstate;
5515 	timeout_id_t tid;
5516 
5517 	if (atomic_dec_32_nv(&state->id_running) != 0)
5518 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5519 
5520 	/*
5521 	 * Before we try to stop/undo whatever we did in ibd_start(),
5522 	 * we need to mark the link state appropriately to prevent the
5523 	 * ip layer from using this instance for any new transfers. Note
5524 	 * that if the original state of the link was "up" when we're
5525 	 * here, we'll set the final link state to "unknown", to behave
5526 	 * in the same fashion as other ethernet drivers.
5527 	 */
5528 	mutex_enter(&state->id_link_mutex);
5529 	if (cur_link_state == LINK_STATE_DOWN) {
5530 		state->id_link_state = cur_link_state;
5531 	} else {
5532 		state->id_link_state = LINK_STATE_UNKNOWN;
5533 	}
5534 	mutex_exit(&state->id_link_mutex);
5535 	bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5536 	mac_link_update(state->id_mh, state->id_link_state);
5537 
5538 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5539 	if (progress & IBD_DRV_STARTED) {
5540 		state->id_mac_state &= (~IBD_DRV_STARTED);
5541 	}
5542 
5543 	if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5544 		state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5545 	}
5546 
5547 	/* Stop listen under Reliable Connected Mode */
5548 	if (progress & IBD_DRV_RC_LISTEN) {
5549 		ASSERT(state->id_enable_rc);
5550 		if (state->rc_listen_hdl != NULL) {
5551 			ibd_rc_stop_listen(state);
5552 		}
5553 		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5554 	}
5555 
5556 	/* Stop timeout routine */
5557 	if (progress & IBD_DRV_RC_TIMEOUT) {
5558 		ASSERT(state->id_enable_rc);
5559 		mutex_enter(&state->rc_timeout_lock);
5560 		state->rc_timeout_start = B_FALSE;
5561 		tid = state->rc_timeout;
5562 		state->rc_timeout = 0;
5563 		mutex_exit(&state->rc_timeout_lock);
5564 		if (tid != 0)
5565 			(void) untimeout(tid);
5566 		state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5567 	}
5568 
5569 	if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5570 		attempts = 100;
5571 		while (state->id_ah_op == IBD_OP_ONGOING) {
5572 			/*
5573 			 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5574 			 * port is connecting to a remote IPoIB port. Wait for
5575 			 * the end of this connecting operation.
5576 			 */
5577 			delay(drv_usectohz(100000));
5578 			if (--attempts == 0) {
5579 				state->rc_stop_connect++;
5580 				DPRINT(40, "ibd_undo_start: connecting");
5581 				break;
5582 			}
5583 		}
5584 		mutex_enter(&state->id_sched_lock);
5585 		state->id_sched_needed = 0;
5586 		mutex_exit(&state->id_sched_lock);
5587 		(void) ibd_rc_close_all_chan(state);
5588 	}
5589 
5590 	/*
5591 	 * First, stop receive interrupts; this stops the driver from
5592 	 * handing up buffers to higher layers.  Wait for receive buffers
5593 	 * to be returned and give up after 1 second.
5594 	 */
5595 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5596 		attempts = 10;
5597 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5598 		    0) > 0) {
5599 			delay(drv_usectohz(100000));
5600 			if (--attempts == 0) {
5601 				/*
5602 				 * There are pending bufs with the network
5603 				 * layer and we have no choice but to wait
5604 				 * for them to be done with. Reap all the
5605 				 * Tx/Rx completions that were posted since
5606 				 * we turned off the notification and
5607 				 * return failure.
5608 				 */
5609 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5610 				DPRINT(2, "ibd_undo_start: "
5611 				    "reclaiming failed");
5612 				break;
5613 			}
5614 		}
5615 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5616 	}
5617 
5618 	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5619 		ibd_rc_fini_tx_largebuf_list(state);
5620 		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5621 	}
5622 
5623 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5624 		ASSERT(state->id_enable_rc);
5625 		if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5626 			if (state->id_ah_op == IBD_OP_ONGOING) {
5627 				delay(drv_usectohz(10000));
5628 				if (state->id_ah_op == IBD_OP_ONGOING) {
5629 					/*
5630 					 * "state->id_ah_op == IBD_OP_ONGOING"
5631 					 * means this IPoIB port is connecting
5632 					 * to a remote IPoIB port. We can't
5633 					 * delete SRQ here.
5634 					 */
5635 					state->rc_stop_connect++;
5636 					DPRINT(40, "ibd_undo_start: "
5637 					    "connecting");
5638 				} else {
5639 					ibd_rc_fini_srq_list(state);
5640 					state->id_mac_state &=
5641 					    (~IBD_DRV_RC_SRQ_ALLOCD);
5642 				}
5643 			} else {
5644 				ibd_rc_fini_srq_list(state);
5645 				state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5646 			}
5647 		} else {
5648 			DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5649 		}
5650 	}
5651 
5652 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5653 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5654 
5655 		mutex_enter(&state->id_trap_lock);
5656 		state->id_trap_stop = B_TRUE;
5657 		while (state->id_trap_inprog > 0)
5658 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5659 		mutex_exit(&state->id_trap_lock);
5660 
5661 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5662 	}
5663 
5664 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5665 		/*
5666 		 * Flushing the channel ensures that all pending WQE's
5667 		 * are marked with flush_error and handed to the CQ. It
5668 		 * does not guarantee the invocation of the CQ handler.
5669 		 * This call is guaranteed to return successfully for
5670 		 * UD QPNs.
5671 		 */
5672 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5673 		    IBT_SUCCESS) {
5674 			DPRINT(10, "ibd_undo_start: flush_channel "
5675 			    "failed, ret=%d", ret);
5676 		}
5677 
5678 		/*
5679 		 * Give some time for the TX CQ handler to process the
5680 		 * completions.
5681 		 */
5682 		attempts = 10;
5683 		mutex_enter(&state->id_tx_list.dl_mutex);
5684 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5685 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5686 		    != state->id_ud_num_swqe) {
5687 			if (--attempts == 0)
5688 				break;
5689 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
5690 			mutex_exit(&state->id_tx_list.dl_mutex);
5691 			delay(drv_usectohz(100000));
5692 			mutex_enter(&state->id_tx_list.dl_mutex);
5693 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
5694 		}
5695 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5696 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5697 		    state->id_ud_num_swqe) {
5698 			cmn_err(CE_WARN, "tx resources not freed\n");
5699 		}
5700 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5701 		mutex_exit(&state->id_tx_list.dl_mutex);
5702 
5703 		attempts = 10;
5704 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5705 			if (--attempts == 0)
5706 				break;
5707 			delay(drv_usectohz(100000));
5708 		}
5709 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5710 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5711 			cmn_err(CE_WARN, "rx resources not freed\n");
5712 		}
5713 
5714 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5715 	}
5716 
5717 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5718 		/*
5719 		 * Drop all residual full/non membership. This includes full
5720 		 * membership to the broadcast group, and any nonmembership
5721 		 * acquired during transmits. We do this after the Tx completion
5722 		 * handlers are done, since those might result in some late
5723 		 * leaves; this also eliminates a potential race with that
5724 		 * path wrt the mc full list insert/delete. Trap handling
5725 		 * has also been suppressed at this point. Thus, no locks
5726 		 * are required while traversing the mc full list.
5727 		 */
5728 		DPRINT(2, "ibd_undo_start: clear full cache entries");
5729 		mce = list_head(&state->id_mc_full);
5730 		while (mce != NULL) {
5731 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
5732 			jstate = mce->mc_jstate;
5733 			mce = list_next(&state->id_mc_full, mce);
5734 			ibd_leave_group(state, mgid, jstate);
5735 		}
5736 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5737 	}
5738 
5739 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
5740 		ibd_fini_rxlist(state);
5741 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5742 	}
5743 
5744 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
5745 		ibd_fini_txlist(state);
5746 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5747 	}
5748 
5749 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5750 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5751 		    IBT_SUCCESS) {
5752 			DPRINT(10, "ibd_undo_start: free_channel "
5753 			    "failed, ret=%d", ret);
5754 		}
5755 
5756 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5757 	}
5758 
5759 	if (progress & IBD_DRV_CQS_ALLOCD) {
5760 		kmem_free(state->id_txwcs,
5761 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
5762 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5763 		    IBT_SUCCESS) {
5764 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
5765 			    "failed, ret=%d", ret);
5766 		}
5767 
5768 		kmem_free(state->id_rxwcs,
5769 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
5770 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5771 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5772 			    "ret=%d", ret);
5773 		}
5774 
5775 		state->id_txwcs = NULL;
5776 		state->id_rxwcs = NULL;
5777 		state->id_scq_hdl = NULL;
5778 		state->id_rcq_hdl = NULL;
5779 
5780 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5781 	}
5782 
5783 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5784 		mutex_enter(&state->id_ac_mutex);
5785 		mod_hash_destroy_hash(state->id_ah_active_hash);
5786 		mutex_exit(&state->id_ac_mutex);
5787 		ibd_acache_fini(state);
5788 
5789 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5790 	}
5791 
5792 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5793 		/*
5794 		 * If we'd created the ipoib broadcast group and had
5795 		 * successfully joined it, leave it now
5796 		 */
5797 		if (state->id_bgroup_created) {
5798 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5799 			jstate = IB_MC_JSTATE_FULL;
5800 			(void) ibt_leave_mcg(state->id_sgid, mgid,
5801 			    state->id_sgid, jstate);
5802 		}
5803 		ibt_free_mcg_info(state->id_mcinfo, 1);
5804 
5805 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5806 	}
5807 
5808 	return (DDI_SUCCESS);
5809 }
5810 
5811 /*
5812  * These pair of routines are used to set/clear the condition that
5813  * the caller is likely to do something to change the id_mac_state.
5814  * If there's already someone doing either a start or a stop (possibly
5815  * due to the async handler detecting a pkey relocation event, a plumb
5816  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5817  * that's done.
5818  */
5819 static void
5820 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5821 {
5822 	mutex_enter(&state->id_macst_lock);
5823 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5824 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5825 
5826 	state->id_mac_state |= flag;
5827 	mutex_exit(&state->id_macst_lock);
5828 }
5829 
5830 static void
5831 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5832 {
5833 	mutex_enter(&state->id_macst_lock);
5834 	state->id_mac_state &= (~flag);
5835 	cv_signal(&state->id_macst_cv);
5836 	mutex_exit(&state->id_macst_lock);
5837 }
5838 
5839 /*
5840  * GLDv3 entry point to start hardware.
5841  */
5842 /*ARGSUSED*/
5843 static int
5844 ibd_m_start(void *arg)
5845 {
5846 	ibd_state_t *state = arg;
5847 	int	ret;
5848 
5849 	if (state->id_type == IBD_PORT_DRIVER)
5850 		return (EINVAL);
5851 
5852 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5853 	if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5854 		ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5855 		return (EIO);
5856 	}
5857 
5858 	ret = ibd_start(state);
5859 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5860 	return (ret);
5861 }
5862 
5863 static int
5864 ibd_start(ibd_state_t *state)
5865 {
5866 	int err;
5867 	ibt_status_t ret;
5868 	int late_hca_init = 0;
5869 
5870 	if (state->id_mac_state & IBD_DRV_STARTED)
5871 		return (DDI_SUCCESS);
5872 
5873 	/*
5874 	 * We do not increment the running flag when calling ibd_start() as
5875 	 * a result of some event which moves the state away from late HCA
5876 	 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability.
5877 	 */
5878 	if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5879 	    (atomic_inc_32_nv(&state->id_running) != 1)) {
5880 		DPRINT(10, "ibd_start: id_running is non-zero");
5881 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5882 		atomic_dec_32(&state->id_running);
5883 		return (EINVAL);
5884 	}
5885 
5886 	/*
5887 	 * Get port details; if we fail here, something bad happened.
5888 	 * Fail plumb.
5889 	 */
5890 	if ((err = ibd_get_port_details(state)) != 0) {
5891 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5892 		goto start_fail;
5893 	}
5894 	/*
5895 	 * If state->id_link_state is DOWN, it indicates that either the port
5896 	 * is down, or the pkey is not available. In both cases, resort to late
5897 	 * initialization. Register for subnet notices, and return success.
5898 	 */
5899 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5900 	if (state->id_link_state == LINK_STATE_DOWN) {
5901 		late_hca_init = 1;
5902 		goto late_hca_init_return;
5903 	}
5904 
5905 	/*
5906 	 * Find the IPoIB broadcast group
5907 	 */
5908 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5909 		/* Resort to late initialization */
5910 		late_hca_init = 1;
5911 		goto reg_snet_notices;
5912 	}
5913 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5914 
5915 	/*
5916 	 * Initialize per-interface caches and lists; if we fail here,
5917 	 * it is most likely due to a lack of resources
5918 	 */
5919 	if (ibd_acache_init(state) != DDI_SUCCESS) {
5920 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
5921 		err = ENOMEM;
5922 		goto start_fail;
5923 	}
5924 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5925 
5926 	/*
5927 	 * Allocate send and receive completion queues
5928 	 */
5929 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5930 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5931 		err = ENOMEM;
5932 		goto start_fail;
5933 	}
5934 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5935 
5936 	/*
5937 	 * Setup a UD channel
5938 	 */
5939 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5940 		err = ENOMEM;
5941 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5942 		goto start_fail;
5943 	}
5944 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5945 
5946 	/*
5947 	 * Allocate and initialize the tx buffer list
5948 	 */
5949 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
5950 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5951 		err = ENOMEM;
5952 		goto start_fail;
5953 	}
5954 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5955 
5956 	/*
5957 	 * Create the send cq handler here
5958 	 */
5959 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5960 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5961 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5962 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5963 		    "failed, ret=%d", ret);
5964 		err = EINVAL;
5965 		goto start_fail;
5966 	}
5967 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5968 
5969 	/*
5970 	 * Allocate and initialize the rx buffer list
5971 	 */
5972 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5973 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5974 		err = ENOMEM;
5975 		goto start_fail;
5976 	}
5977 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5978 
5979 	/*
5980 	 * Join IPoIB broadcast group
5981 	 */
5982 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5983 		DPRINT(10, "ibd_start: ibd_join_group() failed");
5984 		err = ENOTACTIVE;
5985 		goto start_fail;
5986 	}
5987 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5988 
5989 	/*
5990 	 * When we did mac_register() in ibd_attach(), we didn't register
5991 	 * the real macaddr and we didn't have the true port mtu. Now that
5992 	 * we're almost ready, set the local mac address and broadcast
5993 	 * addresses and update gldv3 about the real values of these
5994 	 * parameters.
5995 	 */
5996 	if (state->id_enable_rc) {
5997 		ibd_h2n_mac(&state->id_macaddr,
5998 		    IBD_MAC_ADDR_RC + state->id_qpnum,
5999 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6000 		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
6001 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6002 	} else {
6003 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6004 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6005 	}
6006 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6007 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6008 
6009 	if (!state->id_enable_rc) {
6010 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
6011 		    - IPOIB_HDRSIZE);
6012 	}
6013 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6014 
6015 	/*
6016 	 * Setup the receive cq handler
6017 	 */
6018 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6019 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6020 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6021 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6022 		    "failed, ret=%d", ret);
6023 		err = EINVAL;
6024 		goto start_fail;
6025 	}
6026 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6027 
6028 reg_snet_notices:
6029 	/*
6030 	 * In case of normal initialization sequence,
6031 	 * Setup the subnet notices handler after we've initialized the acache/
6032 	 * mcache and started the async thread, both of which are required for
6033 	 * the trap handler to function properly.
6034 	 *
6035 	 * Now that the async thread has been started (and we've already done
6036 	 * a mac_register() during attach so mac_tx_update() can be called
6037 	 * if necessary without any problem), we can enable the trap handler
6038 	 * to queue requests to the async thread.
6039 	 *
6040 	 * In case of late hca initialization, the subnet notices handler will
6041 	 * only handle MCG created/deleted event. The action performed as part
6042 	 * of handling these events is to start the interface. So, the
6043 	 * acache/mcache initialization is not a necessity in such cases for
6044 	 * registering the subnet notices handler. Also, if we are in
6045 	 * ibd_start() as a result of, say, some event handling after entering
6046 	 * late hca initialization phase no need to register again.
6047 	 */
6048 	if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6049 		ibt_register_subnet_notices(state->id_ibt_hdl,
6050 		    ibd_snet_notices_handler, state);
6051 		mutex_enter(&state->id_trap_lock);
6052 		state->id_trap_stop = B_FALSE;
6053 		mutex_exit(&state->id_trap_lock);
6054 		state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6055 	}
6056 
6057 late_hca_init_return:
6058 	if (late_hca_init == 1) {
6059 		state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6060 		/*
6061 		 * In case of late initialization, mark the link state as down,
6062 		 * immaterial of the actual link state as reported in the
6063 		 * port_info.
6064 		 */
6065 		state->id_link_state = LINK_STATE_DOWN;
6066 		mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6067 		mac_link_update(state->id_mh, state->id_link_state);
6068 		return (DDI_SUCCESS);
6069 	}
6070 
6071 	if (state->id_enable_rc) {
6072 		if (state->rc_enable_srq) {
6073 			if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6074 				if (ibd_rc_repost_srq_free_list(state) !=
6075 				    IBT_SUCCESS) {
6076 					err = ENOMEM;
6077 					goto start_fail;
6078 				}
6079 			} else {
6080 				/* Allocate SRQ resource */
6081 				if (ibd_rc_init_srq_list(state) !=
6082 				    IBT_SUCCESS) {
6083 					err = ENOMEM;
6084 					goto start_fail;
6085 				}
6086 				state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6087 			}
6088 		}
6089 
6090 		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6091 			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6092 			    "failed");
6093 			err = ENOMEM;
6094 			goto start_fail;
6095 		}
6096 		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6097 
6098 		/* RC: begin to listen only after everything is available */
6099 		if (ibd_rc_listen(state) != IBT_SUCCESS) {
6100 			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6101 			err = EINVAL;
6102 			goto start_fail;
6103 		}
6104 		state->id_mac_state |= IBD_DRV_RC_LISTEN;
6105 	}
6106 
6107 	/*
6108 	 * Indicate link status to GLDv3 and higher layers. By default,
6109 	 * we assume we are in up state (which must have been true at
6110 	 * least at the time the broadcast mcg's were probed); if there
6111 	 * were any up/down transitions till the time we come here, the
6112 	 * async handler will have updated last known state, which we
6113 	 * use to tell GLDv3. The async handler will not send any
6114 	 * notifications to GLDv3 till we reach here in the initialization
6115 	 * sequence.
6116 	 */
6117 	mac_link_update(state->id_mh, state->id_link_state);
6118 	state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6119 	state->id_mac_state |= IBD_DRV_STARTED;
6120 
6121 	/* Start timer after everything is ready */
6122 	if (state->id_enable_rc) {
6123 		mutex_enter(&state->rc_timeout_lock);
6124 		state->rc_timeout_start = B_TRUE;
6125 		state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
6126 		    SEC_TO_TICK(ibd_rc_conn_timeout));
6127 		mutex_exit(&state->rc_timeout_lock);
6128 		state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
6129 	}
6130 
6131 	return (DDI_SUCCESS);
6132 
6133 start_fail:
6134 	/*
6135 	 * If we ran into a problem during ibd_start() and ran into
6136 	 * some other problem during undoing our partial work, we can't
6137 	 * do anything about it.  Ignore any errors we might get from
6138 	 * ibd_undo_start() and just return the original error we got.
6139 	 */
6140 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
6141 	return (err);
6142 }
6143 
6144 /*
6145  * GLDv3 entry point to stop hardware from receiving packets.
6146  */
6147 /*ARGSUSED*/
6148 static void
6149 ibd_m_stop(void *arg)
6150 {
6151 	ibd_state_t *state = (ibd_state_t *)arg;
6152 
6153 	if (state->id_type == IBD_PORT_DRIVER)
6154 		return;
6155 
6156 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6157 
6158 	(void) ibd_undo_start(state, state->id_link_state);
6159 
6160 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6161 }
6162 
6163 /*
6164  * GLDv3 entry point to modify device's mac address. We do not
6165  * allow address modifications.
6166  */
6167 static int
6168 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6169 {
6170 	ibd_state_t *state = arg;
6171 
6172 	if (state->id_type == IBD_PORT_DRIVER)
6173 		return (EINVAL);
6174 
6175 	/*
6176 	 * Don't bother even comparing the macaddr if we haven't
6177 	 * completed ibd_m_start().
6178 	 */
6179 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6180 		return (0);
6181 
6182 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6183 		return (0);
6184 	else
6185 		return (EINVAL);
6186 }
6187 
6188 /*
6189  * The blocking part of the IBA join/leave operations are done out
6190  * of here on the async thread.
6191  */
6192 static void
6193 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6194 {
6195 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6196 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6197 
6198 	if (op == IBD_ASYNC_JOIN) {
6199 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6200 			ibd_print_warn(state, "Join multicast group failed :"
6201 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6202 		}
6203 	} else {
6204 		/*
6205 		 * Here, we must search for the proper mcg_info and
6206 		 * use that to leave the group.
6207 		 */
6208 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6209 	}
6210 }
6211 
6212 /*
6213  * GLDv3 entry point for multicast enable/disable requests.
6214  * This function queues the operation to the async thread and
6215  * return success for a valid multicast address.
6216  */
6217 static int
6218 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6219 {
6220 	ibd_state_t *state = (ibd_state_t *)arg;
6221 	ipoib_mac_t maddr, *mcast;
6222 	ib_gid_t mgid;
6223 	ibd_req_t *req;
6224 
6225 	if (state->id_type == IBD_PORT_DRIVER)
6226 		return (EINVAL);
6227 
6228 	/*
6229 	 * If we haven't completed ibd_m_start(), async thread wouldn't
6230 	 * have been started and id_bcaddr wouldn't be set, so there's
6231 	 * no point in continuing.
6232 	 */
6233 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6234 		return (0);
6235 
6236 	/*
6237 	 * The incoming multicast address might not be aligned properly
6238 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6239 	 * it to look like one though, to get the offsets of the mc gid,
6240 	 * since we know we are not going to dereference any values with
6241 	 * the ipoib_mac_t pointer.
6242 	 */
6243 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6244 	mcast = &maddr;
6245 
6246 	/*
6247 	 * Check validity of MCG address. We could additionally check
6248 	 * that a enable/disable is not being issued on the "broadcast"
6249 	 * mcg, but since this operation is only invokable by privileged
6250 	 * programs anyway, we allow the flexibility to those dlpi apps.
6251 	 * Note that we do not validate the "scope" of the IBA mcg.
6252 	 */
6253 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6254 		return (EINVAL);
6255 
6256 	/*
6257 	 * fill in multicast pkey and scope
6258 	 */
6259 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6260 
6261 	/*
6262 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6263 	 * nothing (i.e. we stay JOINed to the broadcast group done in
6264 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6265 	 * requires to be joined to broadcast groups at all times.
6266 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6267 	 * depends on this.
6268 	 */
6269 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6270 		return (0);
6271 
6272 	ibd_n2h_gid(mcast, &mgid);
6273 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6274 	if (req == NULL)
6275 		return (ENOMEM);
6276 
6277 	req->rq_gid = mgid;
6278 
6279 	if (add) {
6280 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6281 		    mgid.gid_prefix, mgid.gid_guid);
6282 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6283 	} else {
6284 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
6285 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6286 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6287 	}
6288 	return (0);
6289 }
6290 
6291 /*
6292  * The blocking part of the IBA promiscuous operations are done
6293  * out of here on the async thread. The dlpireq parameter indicates
6294  * whether this invocation is due to a dlpi request or due to
6295  * a port up/down event.
6296  */
6297 static void
6298 ibd_async_unsetprom(ibd_state_t *state)
6299 {
6300 	ibd_mce_t *mce = list_head(&state->id_mc_non);
6301 	ib_gid_t mgid;
6302 
6303 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6304 
6305 	while (mce != NULL) {
6306 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
6307 		mce = list_next(&state->id_mc_non, mce);
6308 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6309 	}
6310 	state->id_prom_op = IBD_OP_NOTSTARTED;
6311 }
6312 
6313 /*
6314  * The blocking part of the IBA promiscuous operations are done
6315  * out of here on the async thread. The dlpireq parameter indicates
6316  * whether this invocation is due to a dlpi request or due to
6317  * a port up/down event.
6318  */
6319 static void
6320 ibd_async_setprom(ibd_state_t *state)
6321 {
6322 	ibt_mcg_attr_t mcg_attr;
6323 	ibt_mcg_info_t *mcg_info;
6324 	ib_gid_t mgid;
6325 	uint_t numg;
6326 	int i;
6327 	char ret = IBD_OP_COMPLETED;
6328 
6329 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
6330 
6331 	/*
6332 	 * Obtain all active MC groups on the IB fabric with
6333 	 * specified criteria (scope + Pkey + Qkey + mtu).
6334 	 */
6335 	bzero(&mcg_attr, sizeof (mcg_attr));
6336 	mcg_attr.mc_pkey = state->id_pkey;
6337 	mcg_attr.mc_scope = state->id_scope;
6338 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6339 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6340 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6341 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6342 	    IBT_SUCCESS) {
6343 		ibd_print_warn(state, "Could not get list of IBA multicast "
6344 		    "groups");
6345 		ret = IBD_OP_ERRORED;
6346 		goto done;
6347 	}
6348 
6349 	/*
6350 	 * Iterate over the returned mcg's and join as NonMember
6351 	 * to the IP mcg's.
6352 	 */
6353 	for (i = 0; i < numg; i++) {
6354 		/*
6355 		 * Do a NonMember JOIN on the MC group.
6356 		 */
6357 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
6358 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6359 			ibd_print_warn(state, "IBA promiscuous mode missed "
6360 			    "multicast gid %016llx:%016llx",
6361 			    (u_longlong_t)mgid.gid_prefix,
6362 			    (u_longlong_t)mgid.gid_guid);
6363 	}
6364 
6365 	ibt_free_mcg_info(mcg_info, numg);
6366 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6367 done:
6368 	state->id_prom_op = ret;
6369 }
6370 
6371 /*
6372  * GLDv3 entry point for multicast promiscuous enable/disable requests.
6373  * GLDv3 assumes phys state receives more packets than multi state,
6374  * which is not true for IPoIB. Thus, treat the multi and phys
6375  * promiscuous states the same way to work with GLDv3's assumption.
6376  */
6377 static int
6378 ibd_m_promisc(void *arg, boolean_t on)
6379 {
6380 	ibd_state_t *state = (ibd_state_t *)arg;
6381 	ibd_req_t *req;
6382 
6383 	if (state->id_type == IBD_PORT_DRIVER)
6384 		return (EINVAL);
6385 
6386 	/*
6387 	 * Async thread wouldn't have been started if we haven't
6388 	 * passed ibd_m_start()
6389 	 */
6390 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6391 		return (0);
6392 
6393 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6394 	if (req == NULL)
6395 		return (ENOMEM);
6396 	if (on) {
6397 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6398 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6399 	} else {
6400 		DPRINT(1, "ibd_m_promisc : unset_promisc");
6401 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6402 	}
6403 
6404 	return (0);
6405 }
6406 
6407 /*
6408  * GLDv3 entry point for gathering statistics.
6409  */
6410 static int
6411 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6412 {
6413 	ibd_state_t *state = (ibd_state_t *)arg;
6414 
6415 	switch (stat) {
6416 	case MAC_STAT_IFSPEED:
6417 		*val = state->id_link_speed;
6418 		break;
6419 	case MAC_STAT_MULTIRCV:
6420 		*val = state->id_multi_rcv;
6421 		break;
6422 	case MAC_STAT_BRDCSTRCV:
6423 		*val = state->id_brd_rcv;
6424 		break;
6425 	case MAC_STAT_MULTIXMT:
6426 		*val = state->id_multi_xmt;
6427 		break;
6428 	case MAC_STAT_BRDCSTXMT:
6429 		*val = state->id_brd_xmt;
6430 		break;
6431 	case MAC_STAT_RBYTES:
6432 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6433 		    + state->rc_rcv_copy_byte;
6434 		break;
6435 	case MAC_STAT_IPACKETS:
6436 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6437 		    + state->rc_rcv_copy_pkt;
6438 		break;
6439 	case MAC_STAT_OBYTES:
6440 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
6441 		break;
6442 	case MAC_STAT_OPACKETS:
6443 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6444 		    state->rc_xmt_fragmented_pkt +
6445 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6446 		break;
6447 	case MAC_STAT_OERRORS:
6448 		*val = state->id_ah_error;	/* failed AH translation */
6449 		break;
6450 	case MAC_STAT_IERRORS:
6451 		*val = 0;
6452 		break;
6453 	case MAC_STAT_NOXMTBUF:
6454 		*val = state->id_tx_short + state->rc_swqe_short +
6455 		    state->rc_xmt_buf_short;
6456 		break;
6457 	case MAC_STAT_NORCVBUF:
6458 	default:
6459 		return (ENOTSUP);
6460 	}
6461 
6462 	return (0);
6463 }
6464 
6465 static void
6466 ibd_async_txsched(ibd_state_t *state)
6467 {
6468 	ibd_resume_transmission(state);
6469 }
6470 
6471 static void
6472 ibd_resume_transmission(ibd_state_t *state)
6473 {
6474 	int flag;
6475 	int met_thresh = 0;
6476 	int thresh = 0;
6477 	int ret = -1;
6478 
6479 	mutex_enter(&state->id_sched_lock);
6480 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
6481 		mutex_enter(&state->id_tx_list.dl_mutex);
6482 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
6483 		met_thresh = state->id_tx_list.dl_cnt +
6484 		    state->id_tx_rel_list.dl_cnt;
6485 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6486 		mutex_exit(&state->id_tx_list.dl_mutex);
6487 		thresh = IBD_FREE_SWQES_THRESH;
6488 		flag = IBD_RSRC_SWQE;
6489 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6490 		ASSERT(state->id_lso != NULL);
6491 		mutex_enter(&state->id_lso_lock);
6492 		met_thresh = state->id_lso->bkt_nfree;
6493 		thresh = IBD_FREE_LSOS_THRESH;
6494 		mutex_exit(&state->id_lso_lock);
6495 		flag = IBD_RSRC_LSOBUF;
6496 		if (met_thresh > thresh)
6497 			state->id_sched_lso_cnt++;
6498 	}
6499 	if (met_thresh > thresh) {
6500 		state->id_sched_needed &= ~flag;
6501 		state->id_sched_cnt++;
6502 		ret = 0;
6503 	}
6504 	mutex_exit(&state->id_sched_lock);
6505 
6506 	if (ret == 0)
6507 		mac_tx_update(state->id_mh);
6508 }
6509 
6510 /*
6511  * Release the send wqe back into free list.
6512  */
6513 static void
6514 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6515 {
6516 	/*
6517 	 * Add back on Tx list for reuse.
6518 	 */
6519 	ASSERT(tail->swqe_next == NULL);
6520 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6521 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6522 	tail->swqe_next = state->id_tx_rel_list.dl_head;
6523 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6524 	state->id_tx_rel_list.dl_cnt += n;
6525 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
6526 }
6527 
6528 /*
6529  * Acquire a send wqe from free list.
6530  * Returns error number and send wqe pointer.
6531  */
6532 static ibd_swqe_t *
6533 ibd_acquire_swqe(ibd_state_t *state)
6534 {
6535 	ibd_swqe_t *wqe;
6536 
6537 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6538 	if (state->id_tx_rel_list.dl_head != NULL) {
6539 		/* transfer id_tx_rel_list to id_tx_list */
6540 		state->id_tx_list.dl_head =
6541 		    state->id_tx_rel_list.dl_head;
6542 		state->id_tx_list.dl_cnt =
6543 		    state->id_tx_rel_list.dl_cnt;
6544 		state->id_tx_list.dl_pending_sends = B_FALSE;
6545 
6546 		/* clear id_tx_rel_list */
6547 		state->id_tx_rel_list.dl_head = NULL;
6548 		state->id_tx_rel_list.dl_cnt = 0;
6549 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6550 
6551 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6552 		state->id_tx_list.dl_cnt -= 1;
6553 		state->id_tx_list.dl_head = wqe->swqe_next;
6554 	} else {	/* no free swqe */
6555 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6556 		state->id_tx_list.dl_pending_sends = B_TRUE;
6557 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6558 		state->id_tx_short++;
6559 		wqe = NULL;
6560 	}
6561 	return (wqe);
6562 }
6563 
6564 static int
6565 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6566     ibt_ud_dest_hdl_t ud_dest)
6567 {
6568 	mblk_t	*nmp;
6569 	int iph_len, tcph_len;
6570 	ibt_wr_lso_t *lso;
6571 	uintptr_t ip_start, tcp_start;
6572 	uint8_t *dst;
6573 	uint_t pending, mblen;
6574 
6575 	/*
6576 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6577 	 * we need to adjust it here for lso.
6578 	 */
6579 	lso = &(node->w_swr.wr.ud_lso);
6580 	lso->lso_ud_dest = ud_dest;
6581 	lso->lso_mss = mss;
6582 
6583 	/*
6584 	 * Calculate the LSO header size and set it in the UD LSO structure.
6585 	 * Note that the only assumption we make is that each of the IPoIB,
6586 	 * IP and TCP headers will be contained in a single mblk fragment;
6587 	 * together, the headers may span multiple mblk fragments.
6588 	 */
6589 	nmp = mp;
6590 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6591 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6592 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
6593 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
6594 		nmp = nmp->b_cont;
6595 
6596 	}
6597 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6598 
6599 	tcp_start = ip_start + iph_len;
6600 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6601 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6602 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
6603 		nmp = nmp->b_cont;
6604 	}
6605 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6606 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6607 
6608 	/*
6609 	 * If the lso header fits entirely within a single mblk fragment,
6610 	 * we'll avoid an additional copy of the lso header here and just
6611 	 * pass the b_rptr of the mblk directly.
6612 	 *
6613 	 * If this isn't true, we'd have to allocate for it explicitly.
6614 	 */
6615 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
6616 		lso->lso_hdr = mp->b_rptr;
6617 	} else {
6618 		/* On work completion, remember to free this allocated hdr */
6619 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6620 		if (lso->lso_hdr == NULL) {
6621 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6622 			    "sz = %d", lso->lso_hdr_sz);
6623 			lso->lso_hdr_sz = 0;
6624 			lso->lso_mss = 0;
6625 			return (-1);
6626 		}
6627 	}
6628 
6629 	/*
6630 	 * Copy in the lso header only if we need to
6631 	 */
6632 	if (lso->lso_hdr != mp->b_rptr) {
6633 		dst = lso->lso_hdr;
6634 		pending = lso->lso_hdr_sz;
6635 
6636 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6637 			mblen = MBLKL(nmp);
6638 			if (pending > mblen) {
6639 				bcopy(nmp->b_rptr, dst, mblen);
6640 				dst += mblen;
6641 				pending -= mblen;
6642 			} else {
6643 				bcopy(nmp->b_rptr, dst, pending);
6644 				break;
6645 			}
6646 		}
6647 	}
6648 
6649 	return (0);
6650 }
6651 
6652 static void
6653 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6654 {
6655 	ibt_wr_lso_t *lso;
6656 
6657 	if ((!node) || (!mp))
6658 		return;
6659 
6660 	/*
6661 	 * Free any header space that we might've allocated if we
6662 	 * did an LSO
6663 	 */
6664 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6665 		lso = &(node->w_swr.wr.ud_lso);
6666 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6667 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6668 			lso->lso_hdr = NULL;
6669 			lso->lso_hdr_sz = 0;
6670 		}
6671 	}
6672 }
6673 
6674 static void
6675 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6676 {
6677 	uint_t		i;
6678 	uint_t		num_posted;
6679 	uint_t		n_wrs;
6680 	ibt_status_t	ibt_status;
6681 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
6682 	ibd_swqe_t	*tx_head, *elem;
6683 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
6684 
6685 	/* post the one request, then check for more */
6686 	ibt_status = ibt_post_send(state->id_chnl_hdl,
6687 	    &node->w_swr, 1, NULL);
6688 	if (ibt_status != IBT_SUCCESS) {
6689 		ibd_print_warn(state, "ibd_post_send: "
6690 		    "posting one wr failed: ret=%d", ibt_status);
6691 		ibd_tx_cleanup(state, node);
6692 	}
6693 
6694 	tx_head = NULL;
6695 	for (;;) {
6696 		if (tx_head == NULL) {
6697 			mutex_enter(&state->id_txpost_lock);
6698 			tx_head = state->id_tx_head;
6699 			if (tx_head == NULL) {
6700 				state->id_tx_busy = 0;
6701 				mutex_exit(&state->id_txpost_lock);
6702 				return;
6703 			}
6704 			state->id_tx_head = NULL;
6705 			mutex_exit(&state->id_txpost_lock);
6706 		}
6707 
6708 		/*
6709 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6710 		 * at a time if possible, and keep posting them.
6711 		 */
6712 		for (n_wrs = 0, elem = tx_head;
6713 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6714 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6715 			nodes[n_wrs] = elem;
6716 			wrs[n_wrs] = elem->w_swr;
6717 		}
6718 		tx_head = elem;
6719 
6720 		ASSERT(n_wrs != 0);
6721 
6722 		/*
6723 		 * If posting fails for some reason, we'll never receive
6724 		 * completion intimation, so we'll need to cleanup. But
6725 		 * we need to make sure we don't clean up nodes whose
6726 		 * wrs have been successfully posted. We assume that the
6727 		 * hca driver returns on the first failure to post and
6728 		 * therefore the first 'num_posted' entries don't need
6729 		 * cleanup here.
6730 		 */
6731 		num_posted = 0;
6732 		ibt_status = ibt_post_send(state->id_chnl_hdl,
6733 		    wrs, n_wrs, &num_posted);
6734 		if (ibt_status != IBT_SUCCESS) {
6735 			ibd_print_warn(state, "ibd_post_send: "
6736 			    "posting multiple wrs failed: "
6737 			    "requested=%d, done=%d, ret=%d",
6738 			    n_wrs, num_posted, ibt_status);
6739 
6740 			for (i = num_posted; i < n_wrs; i++)
6741 				ibd_tx_cleanup(state, nodes[i]);
6742 		}
6743 	}
6744 }
6745 
6746 static int
6747 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6748     uint_t lsohdr_sz)
6749 {
6750 	ibt_wr_ds_t *sgl;
6751 	ibt_status_t ibt_status;
6752 	mblk_t *nmp;
6753 	mblk_t *data_mp;
6754 	uchar_t *bufp;
6755 	size_t blksize;
6756 	size_t skip;
6757 	size_t avail;
6758 	uint_t pktsize;
6759 	uint_t frag_len;
6760 	uint_t pending_hdr;
6761 	int nmblks;
6762 	int i;
6763 
6764 	/*
6765 	 * Let's skip ahead to the data if this is LSO
6766 	 */
6767 	data_mp = mp;
6768 	pending_hdr = 0;
6769 	if (lsohdr_sz) {
6770 		pending_hdr = lsohdr_sz;
6771 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
6772 			frag_len = nmp->b_wptr - nmp->b_rptr;
6773 			if (frag_len > pending_hdr)
6774 				break;
6775 			pending_hdr -= frag_len;
6776 		}
6777 		data_mp = nmp;	/* start of data past lso header */
6778 		ASSERT(data_mp != NULL);
6779 	}
6780 
6781 	/*
6782 	 * Calculate the size of message data and number of msg blocks
6783 	 */
6784 	pktsize = 0;
6785 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
6786 	    nmp = nmp->b_cont, nmblks++) {
6787 		pktsize += MBLKL(nmp);
6788 	}
6789 	pktsize -= pending_hdr;
6790 
6791 	/*
6792 	 * We only do ibt_map_mem_iov() if the pktsize is above the
6793 	 * "copy-threshold", and if the number of mp fragments is less than
6794 	 * the maximum acceptable.
6795 	 */
6796 	if ((state->id_hca_res_lkey_capab) &&
6797 	    (pktsize > state->id_ud_tx_copy_thresh) &&
6798 	    (nmblks < state->id_max_sqseg_hiwm)) {
6799 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6800 		ibt_iov_attr_t iov_attr;
6801 
6802 		iov_attr.iov_as = NULL;
6803 		iov_attr.iov = iov_arr;
6804 		iov_attr.iov_buf = NULL;
6805 		iov_attr.iov_list_len = nmblks;
6806 		iov_attr.iov_wr_nds = state->id_max_sqseg;
6807 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6808 		iov_attr.iov_flags = IBT_IOV_SLEEP;
6809 
6810 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6811 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6812 			iov_arr[i].iov_len = MBLKL(nmp);
6813 			if (i == 0) {
6814 				iov_arr[i].iov_addr += pending_hdr;
6815 				iov_arr[i].iov_len -= pending_hdr;
6816 			}
6817 		}
6818 
6819 		node->w_buftype = IBD_WQE_MAPPED;
6820 		node->w_swr.wr_sgl = node->w_sgl;
6821 
6822 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6823 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6824 		if (ibt_status != IBT_SUCCESS) {
6825 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6826 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6827 			goto ibd_copy_path;
6828 		}
6829 
6830 		return (0);
6831 	}
6832 
6833 ibd_copy_path:
6834 	if (pktsize <= state->id_tx_buf_sz) {
6835 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6836 		node->w_swr.wr_nds = 1;
6837 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6838 		node->w_buftype = IBD_WQE_TXBUF;
6839 
6840 		/*
6841 		 * Even though this is the copy path for transfers less than
6842 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6843 		 * is possible the first data mblk fragment (data_mp) still
6844 		 * contains part of the LSO header that we need to skip.
6845 		 */
6846 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6847 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6848 			blksize = MBLKL(nmp) - pending_hdr;
6849 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6850 			bufp += blksize;
6851 			pending_hdr = 0;
6852 		}
6853 
6854 		return (0);
6855 	}
6856 
6857 	/*
6858 	 * Copy path for transfers greater than id_tx_buf_sz
6859 	 */
6860 	node->w_swr.wr_sgl = node->w_sgl;
6861 	if (ibd_acquire_lsobufs(state, pktsize,
6862 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6863 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6864 		return (-1);
6865 	}
6866 	node->w_buftype = IBD_WQE_LSOBUF;
6867 
6868 	/*
6869 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
6870 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
6871 	 * need to skip part of the LSO header in the first fragment
6872 	 * as before.
6873 	 */
6874 	nmp = data_mp;
6875 	skip = pending_hdr;
6876 	for (i = 0; i < node->w_swr.wr_nds; i++) {
6877 		sgl = node->w_swr.wr_sgl + i;
6878 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6879 		avail = IBD_LSO_BUFSZ;
6880 		while (nmp && avail) {
6881 			blksize = MBLKL(nmp) - skip;
6882 			if (blksize > avail) {
6883 				bcopy(nmp->b_rptr + skip, bufp, avail);
6884 				skip += avail;
6885 				avail = 0;
6886 			} else {
6887 				bcopy(nmp->b_rptr + skip, bufp, blksize);
6888 				skip = 0;
6889 				avail -= blksize;
6890 				bufp += blksize;
6891 				nmp = nmp->b_cont;
6892 			}
6893 		}
6894 	}
6895 
6896 	return (0);
6897 }
6898 
6899 /*
6900  * Schedule a completion queue polling to reap the resource we're
6901  * short on.  If we implement the change to reap tx completions
6902  * in a separate thread, we'll need to wake up that thread here.
6903  */
6904 static int
6905 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6906 {
6907 	ibd_req_t *req;
6908 
6909 	mutex_enter(&state->id_sched_lock);
6910 	state->id_sched_needed |= resource_type;
6911 	mutex_exit(&state->id_sched_lock);
6912 
6913 	/*
6914 	 * If we are asked to queue a work entry, we need to do it
6915 	 */
6916 	if (q_flag) {
6917 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6918 		if (req == NULL)
6919 			return (-1);
6920 
6921 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6922 	}
6923 
6924 	return (0);
6925 }
6926 
6927 /*
6928  * The passed in packet has this format:
6929  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6930  */
6931 static boolean_t
6932 ibd_send(ibd_state_t *state, mblk_t *mp)
6933 {
6934 	ibd_ace_t *ace;
6935 	ibd_swqe_t *node;
6936 	ipoib_mac_t *dest;
6937 	ib_header_info_t *ipibp;
6938 	ip6_t *ip6h;
6939 	uint_t pktsize;
6940 	uint32_t mss;
6941 	uint32_t hckflags;
6942 	uint32_t lsoflags = 0;
6943 	uint_t lsohdr_sz = 0;
6944 	int ret, len;
6945 	boolean_t dofree = B_FALSE;
6946 	boolean_t rc;
6947 	/* if (rc_chan == NULL) send by UD; else send by RC; */
6948 	ibd_rc_chan_t *rc_chan;
6949 	int nmblks;
6950 	mblk_t *nmp;
6951 
6952 	/*
6953 	 * If we aren't done with the device initialization and start,
6954 	 * we shouldn't be here.
6955 	 */
6956 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6957 		return (B_FALSE);
6958 
6959 	/*
6960 	 * Obtain an address handle for the destination.
6961 	 */
6962 	ipibp = (ib_header_info_t *)mp->b_rptr;
6963 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
6964 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6965 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6966 
6967 	rc_chan = NULL;
6968 	ace = ibd_acache_lookup(state, dest, &ret, 1);
6969 	if (state->id_enable_rc && (ace != NULL) &&
6970 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6971 		if (ace->ac_chan == NULL) {
6972 			state->rc_null_conn++;
6973 		} else {
6974 			if (ace->ac_chan->chan_state ==
6975 			    IBD_RC_STATE_ACT_ESTAB) {
6976 				rc_chan = ace->ac_chan;
6977 				rc_chan->is_used = B_TRUE;
6978 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6979 				node = WQE_TO_SWQE(
6980 				    rc_chan->tx_wqe_list.dl_head);
6981 				if (node != NULL) {
6982 					rc_chan->tx_wqe_list.dl_cnt -= 1;
6983 					rc_chan->tx_wqe_list.dl_head =
6984 					    node->swqe_next;
6985 				} else {
6986 					node = ibd_rc_acquire_swqes(rc_chan);
6987 				}
6988 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6989 
6990 				if (node == NULL) {
6991 					state->rc_swqe_short++;
6992 					mutex_enter(&state->id_sched_lock);
6993 					state->id_sched_needed |=
6994 					    IBD_RSRC_RC_SWQE;
6995 					mutex_exit(&state->id_sched_lock);
6996 					ibd_dec_ref_ace(state, ace);
6997 					return (B_FALSE);
6998 				}
6999 			} else {
7000 				state->rc_no_estab_conn++;
7001 			}
7002 		}
7003 	}
7004 
7005 	if (rc_chan == NULL) {
7006 		mutex_enter(&state->id_tx_list.dl_mutex);
7007 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
7008 		if (node != NULL) {
7009 			state->id_tx_list.dl_cnt -= 1;
7010 			state->id_tx_list.dl_head = node->swqe_next;
7011 		} else {
7012 			node = ibd_acquire_swqe(state);
7013 		}
7014 		mutex_exit(&state->id_tx_list.dl_mutex);
7015 		if (node == NULL) {
7016 			/*
7017 			 * If we don't have an swqe available, schedule a
7018 			 * transmit completion queue cleanup and hold off on
7019 			 * sending more packets until we have some free swqes
7020 			 */
7021 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
7022 				if (ace != NULL) {
7023 					ibd_dec_ref_ace(state, ace);
7024 				}
7025 				return (B_FALSE);
7026 			}
7027 
7028 			/*
7029 			 * If a poll cannot be scheduled, we have no choice but
7030 			 * to drop this packet
7031 			 */
7032 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7033 			if (ace != NULL) {
7034 				ibd_dec_ref_ace(state, ace);
7035 			}
7036 			return (B_TRUE);
7037 		}
7038 	}
7039 
7040 	/*
7041 	 * Initialize the commonly used fields in swqe to NULL to protect
7042 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
7043 	 * failure.
7044 	 */
7045 	node->swqe_im_mblk = NULL;
7046 	node->w_swr.wr_nds = 0;
7047 	node->w_swr.wr_sgl = NULL;
7048 	node->w_swr.wr_opcode = IBT_WRC_SEND;
7049 
7050 	/*
7051 	 * Calculate the size of message data and number of msg blocks
7052 	 */
7053 	pktsize = 0;
7054 	for (nmblks = 0, nmp = mp; nmp != NULL;
7055 	    nmp = nmp->b_cont, nmblks++) {
7056 		pktsize += MBLKL(nmp);
7057 	}
7058 
7059 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7060 		atomic_inc_64(&state->id_brd_xmt);
7061 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7062 		atomic_inc_64(&state->id_multi_xmt);
7063 
7064 	if (ace != NULL) {
7065 		node->w_ahandle = ace;
7066 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7067 	} else {
7068 		DPRINT(5,
7069 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7070 		    ((ret == EFAULT) ? "failed" : "queued"),
7071 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7072 		    htonl(dest->ipoib_gidpref[1]),
7073 		    htonl(dest->ipoib_gidsuff[0]),
7074 		    htonl(dest->ipoib_gidsuff[1]));
7075 		state->rc_ace_not_found++;
7076 		node->w_ahandle = NULL;
7077 
7078 		/*
7079 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
7080 		 * can not find a path for the specific dest address. We
7081 		 * should get rid of this kind of packet.  We also should get
7082 		 * rid of the packet if we cannot schedule a poll via the
7083 		 * async thread.  For the normal case, ibd will return the
7084 		 * packet to upper layer and wait for AH creating.
7085 		 *
7086 		 * Note that we always queue a work slot entry for the async
7087 		 * thread when we fail AH lookup (even in intr mode); this is
7088 		 * due to the convoluted way the code currently looks for AH.
7089 		 */
7090 		if (ret == EFAULT) {
7091 			dofree = B_TRUE;
7092 			rc = B_TRUE;
7093 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7094 			dofree = B_TRUE;
7095 			rc = B_TRUE;
7096 		} else {
7097 			dofree = B_FALSE;
7098 			rc = B_FALSE;
7099 		}
7100 		goto ibd_send_fail;
7101 	}
7102 
7103 	/*
7104 	 * For ND6 packets, padding is at the front of the source lladdr.
7105 	 * Insert the padding at front.
7106 	 */
7107 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7108 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7109 			if (!pullupmsg(mp, IPV6_HDR_LEN +
7110 			    sizeof (ib_header_info_t))) {
7111 				DPRINT(10, "ibd_send: pullupmsg failure ");
7112 				dofree = B_TRUE;
7113 				rc = B_TRUE;
7114 				goto ibd_send_fail;
7115 			}
7116 			ipibp = (ib_header_info_t *)mp->b_rptr;
7117 		}
7118 		ip6h = (ip6_t *)((uchar_t *)ipibp +
7119 		    sizeof (ib_header_info_t));
7120 		len = ntohs(ip6h->ip6_plen);
7121 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7122 			mblk_t	*pad;
7123 
7124 			pad = allocb(4, 0);
7125 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7126 			linkb(mp, pad);
7127 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
7128 			    IPV6_HDR_LEN + len + 4) {
7129 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7130 				    IPV6_HDR_LEN + len + 4)) {
7131 					DPRINT(10, "ibd_send: pullupmsg "
7132 					    "failure ");
7133 					dofree = B_TRUE;
7134 					rc = B_TRUE;
7135 					goto ibd_send_fail;
7136 				}
7137 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7138 				    sizeof (ib_header_info_t));
7139 			}
7140 
7141 			/* LINTED: E_CONSTANT_CONDITION */
7142 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7143 		}
7144 	}
7145 
7146 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7147 	mp->b_rptr += sizeof (ib_addrs_t);
7148 	pktsize -= sizeof (ib_addrs_t);
7149 
7150 	if (rc_chan) {	/* send in RC mode */
7151 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7152 		ibt_iov_attr_t iov_attr;
7153 		uint_t		i;
7154 		size_t	blksize;
7155 		uchar_t *bufp;
7156 		ibd_rc_tx_largebuf_t *lbufp;
7157 
7158 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
7159 
7160 		/*
7161 		 * Upper layer does Tx checksum, we don't need do any
7162 		 * checksum here.
7163 		 */
7164 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7165 
7166 		/*
7167 		 * We only do ibt_map_mem_iov() if the pktsize is above
7168 		 * the "copy-threshold", and if the number of mp
7169 		 * fragments is less than the maximum acceptable.
7170 		 */
7171 		if (pktsize <= state->id_rc_tx_copy_thresh) {
7172 			atomic_inc_64(&state->rc_xmt_small_pkt);
7173 			/*
7174 			 * Only process unicast packet in Reliable Connected
7175 			 * mode.
7176 			 */
7177 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7178 			node->w_swr.wr_nds = 1;
7179 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7180 			node->w_buftype = IBD_WQE_TXBUF;
7181 
7182 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7183 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7184 				blksize = MBLKL(nmp);
7185 				bcopy(nmp->b_rptr, bufp, blksize);
7186 				bufp += blksize;
7187 			}
7188 			freemsg(mp);
7189 			ASSERT(node->swqe_im_mblk == NULL);
7190 		} else {
7191 			if ((state->rc_enable_iov_map) &&
7192 			    (nmblks < state->rc_max_sqseg_hiwm)) {
7193 
7194 				/* do ibt_map_mem_iov() */
7195 				iov_attr.iov_as = NULL;
7196 				iov_attr.iov = iov_arr;
7197 				iov_attr.iov_buf = NULL;
7198 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7199 				iov_attr.iov_lso_hdr_sz = 0;
7200 				iov_attr.iov_flags = IBT_IOV_SLEEP;
7201 
7202 				i = 0;
7203 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7204 					iov_arr[i].iov_len = MBLKL(nmp);
7205 					if (iov_arr[i].iov_len != 0) {
7206 						iov_arr[i].iov_addr = (caddr_t)
7207 						    (void *)nmp->b_rptr;
7208 						i++;
7209 					}
7210 				}
7211 				iov_attr.iov_list_len = i;
7212 				node->w_swr.wr_sgl = node->w_sgl;
7213 
7214 				ret = ibt_map_mem_iov(state->id_hca_hdl,
7215 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7216 				    &node->w_mi_hdl);
7217 				if (ret != IBT_SUCCESS) {
7218 					atomic_inc_64(
7219 					    &state->rc_xmt_map_fail_pkt);
7220 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
7221 					    ") failed, nmblks=%d, real_nmblks"
7222 					    "=%d, ret=0x%x", nmblks, i, ret);
7223 					goto ibd_rc_large_copy;
7224 				}
7225 
7226 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7227 				node->w_buftype = IBD_WQE_MAPPED;
7228 				node->swqe_im_mblk = mp;
7229 			} else {
7230 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7231 ibd_rc_large_copy:
7232 				mutex_enter(&state->rc_tx_large_bufs_lock);
7233 				if (state->rc_tx_largebuf_nfree == 0) {
7234 					state->rc_xmt_buf_short++;
7235 					mutex_exit
7236 					    (&state->rc_tx_large_bufs_lock);
7237 					mutex_enter(&state->id_sched_lock);
7238 					state->id_sched_needed |=
7239 					    IBD_RSRC_RC_TX_LARGEBUF;
7240 					mutex_exit(&state->id_sched_lock);
7241 					dofree = B_FALSE;
7242 					rc = B_FALSE;
7243 					/*
7244 					 * If we don't have Tx large bufs,
7245 					 * return failure. node->w_buftype
7246 					 * should not be IBD_WQE_RC_COPYBUF,
7247 					 * otherwise it will cause problem
7248 					 * in ibd_rc_tx_cleanup()
7249 					 */
7250 					node->w_buftype = IBD_WQE_TXBUF;
7251 					goto ibd_send_fail;
7252 				}
7253 
7254 				lbufp = state->rc_tx_largebuf_free_head;
7255 				ASSERT(lbufp->lb_buf != NULL);
7256 				state->rc_tx_largebuf_free_head =
7257 				    lbufp->lb_next;
7258 				lbufp->lb_next = NULL;
7259 				/* Update nfree count */
7260 				state->rc_tx_largebuf_nfree --;
7261 				mutex_exit(&state->rc_tx_large_bufs_lock);
7262 				bufp = lbufp->lb_buf;
7263 				node->w_sgl[0].ds_va =
7264 				    (ib_vaddr_t)(uintptr_t)bufp;
7265 				node->w_sgl[0].ds_key =
7266 				    state->rc_tx_mr_desc.md_lkey;
7267 				node->w_sgl[0].ds_len = pktsize;
7268 				node->w_swr.wr_sgl = node->w_sgl;
7269 				node->w_swr.wr_nds = 1;
7270 				node->w_buftype = IBD_WQE_RC_COPYBUF;
7271 				node->w_rc_tx_largebuf = lbufp;
7272 
7273 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7274 					blksize = MBLKL(nmp);
7275 					if (blksize != 0) {
7276 						bcopy(nmp->b_rptr, bufp,
7277 						    blksize);
7278 						bufp += blksize;
7279 					}
7280 				}
7281 				freemsg(mp);
7282 				ASSERT(node->swqe_im_mblk == NULL);
7283 			}
7284 		}
7285 
7286 		node->swqe_next = NULL;
7287 		mutex_enter(&rc_chan->tx_post_lock);
7288 		if (rc_chan->tx_busy) {
7289 			if (rc_chan->tx_head) {
7290 				rc_chan->tx_tail->swqe_next =
7291 				    SWQE_TO_WQE(node);
7292 			} else {
7293 				rc_chan->tx_head = node;
7294 			}
7295 			rc_chan->tx_tail = node;
7296 			mutex_exit(&rc_chan->tx_post_lock);
7297 		} else {
7298 			rc_chan->tx_busy = 1;
7299 			mutex_exit(&rc_chan->tx_post_lock);
7300 			ibd_rc_post_send(rc_chan, node);
7301 		}
7302 
7303 		return (B_TRUE);
7304 	} /* send by RC */
7305 
7306 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7307 		/*
7308 		 * Too long pktsize. The packet size from GLD should <=
7309 		 * state->id_mtu + sizeof (ib_addrs_t)
7310 		 */
7311 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7312 			ibd_req_t *req;
7313 
7314 			mutex_enter(&ace->tx_too_big_mutex);
7315 			if (ace->tx_too_big_ongoing) {
7316 				mutex_exit(&ace->tx_too_big_mutex);
7317 				state->rc_xmt_reenter_too_long_pkt++;
7318 				dofree = B_TRUE;
7319 			} else {
7320 				ace->tx_too_big_ongoing = B_TRUE;
7321 				mutex_exit(&ace->tx_too_big_mutex);
7322 				state->rc_xmt_icmp_too_long_pkt++;
7323 
7324 				req = kmem_cache_alloc(state->id_req_kmc,
7325 				    KM_NOSLEEP);
7326 				if (req == NULL) {
7327 					ibd_print_warn(state, "ibd_send: alloc "
7328 					    "ibd_req_t fail");
7329 					/* Drop it. */
7330 					dofree = B_TRUE;
7331 				} else {
7332 					req->rq_ptr = mp;
7333 					req->rq_ptr2 = ace;
7334 					ibd_queue_work_slot(state, req,
7335 					    IBD_ASYNC_RC_TOO_BIG);
7336 					dofree = B_FALSE;
7337 				}
7338 			}
7339 		} else {
7340 			ibd_print_warn(state, "Reliable Connected mode is on. "
7341 			    "Multicast packet length %d > %d is too long to "
7342 			    "send packet (%d > %d), drop it",
7343 			    pktsize, state->id_mtu);
7344 			state->rc_xmt_drop_too_long_pkt++;
7345 			/* Drop it. */
7346 			dofree = B_TRUE;
7347 		}
7348 		rc = B_TRUE;
7349 		goto ibd_send_fail;
7350 	}
7351 
7352 	atomic_add_64(&state->id_xmt_bytes, pktsize);
7353 	atomic_inc_64(&state->id_xmt_pkt);
7354 
7355 	/*
7356 	 * Do LSO and checksum related work here.  For LSO send, adjust the
7357 	 * ud destination, the opcode and the LSO header information to the
7358 	 * work request.
7359 	 */
7360 	mac_lso_get(mp, &mss, &lsoflags);
7361 	if ((lsoflags & HW_LSO) != HW_LSO) {
7362 		node->w_swr.wr_opcode = IBT_WRC_SEND;
7363 		lsohdr_sz = 0;
7364 	} else {
7365 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7366 			/*
7367 			 * The routine can only fail if there's no memory; we
7368 			 * can only drop the packet if this happens
7369 			 */
7370 			ibd_print_warn(state,
7371 			    "ibd_send: no memory, lso posting failed");
7372 			dofree = B_TRUE;
7373 			rc = B_TRUE;
7374 			goto ibd_send_fail;
7375 		}
7376 
7377 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7378 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7379 	}
7380 
7381 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7382 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7383 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7384 	else
7385 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7386 
7387 	/*
7388 	 * Prepare the sgl for posting; the routine can only fail if there's
7389 	 * no lso buf available for posting. If this is the case, we should
7390 	 * probably resched for lso bufs to become available and then try again.
7391 	 */
7392 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7393 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7394 			dofree = B_TRUE;
7395 			rc = B_TRUE;
7396 		} else {
7397 			dofree = B_FALSE;
7398 			rc = B_FALSE;
7399 		}
7400 		goto ibd_send_fail;
7401 	}
7402 	node->swqe_im_mblk = mp;
7403 
7404 	/*
7405 	 * Queue the wqe to hardware; since we can now simply queue a
7406 	 * post instead of doing it serially, we cannot assume anything
7407 	 * about the 'node' after ibd_post_send() returns.
7408 	 */
7409 	node->swqe_next = NULL;
7410 
7411 	mutex_enter(&state->id_txpost_lock);
7412 	if (state->id_tx_busy) {
7413 		if (state->id_tx_head) {
7414 			state->id_tx_tail->swqe_next =
7415 			    SWQE_TO_WQE(node);
7416 		} else {
7417 			state->id_tx_head = node;
7418 		}
7419 		state->id_tx_tail = node;
7420 		mutex_exit(&state->id_txpost_lock);
7421 	} else {
7422 		state->id_tx_busy = 1;
7423 		mutex_exit(&state->id_txpost_lock);
7424 		ibd_post_send(state, node);
7425 	}
7426 
7427 	return (B_TRUE);
7428 
7429 ibd_send_fail:
7430 	if (node && mp)
7431 		ibd_free_lsohdr(node, mp);
7432 
7433 	if (dofree)
7434 		freemsg(mp);
7435 
7436 	if (node != NULL) {
7437 		if (rc_chan) {
7438 			ibd_rc_tx_cleanup(node);
7439 		} else {
7440 			ibd_tx_cleanup(state, node);
7441 		}
7442 	}
7443 
7444 	return (rc);
7445 }
7446 
7447 /*
7448  * GLDv3 entry point for transmitting datagram.
7449  */
7450 static mblk_t *
7451 ibd_m_tx(void *arg, mblk_t *mp)
7452 {
7453 	ibd_state_t *state = (ibd_state_t *)arg;
7454 	mblk_t *next;
7455 
7456 	if (state->id_type == IBD_PORT_DRIVER) {
7457 		freemsgchain(mp);
7458 		return (NULL);
7459 	}
7460 
7461 	if ((state->id_link_state != LINK_STATE_UP) ||
7462 	    !(state->id_mac_state & IBD_DRV_STARTED)) {
7463 		freemsgchain(mp);
7464 		mp = NULL;
7465 	}
7466 
7467 	while (mp != NULL) {
7468 		next = mp->b_next;
7469 		mp->b_next = NULL;
7470 		if (ibd_send(state, mp) == B_FALSE) {
7471 			/* Send fail */
7472 			mp->b_next = next;
7473 			break;
7474 		}
7475 		mp = next;
7476 	}
7477 
7478 	return (mp);
7479 }
7480 
7481 /*
7482  * this handles Tx and Rx completions. With separate CQs, this handles
7483  * only Rx completions.
7484  */
7485 static uint_t
7486 ibd_intr(caddr_t arg)
7487 {
7488 	ibd_state_t *state = (ibd_state_t *)arg;
7489 
7490 	ibd_poll_rcq(state, state->id_rcq_hdl);
7491 
7492 	return (DDI_INTR_CLAIMED);
7493 }
7494 
7495 /*
7496  * Poll and fully drain the send cq
7497  */
7498 static void
7499 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7500 {
7501 	ibt_wc_t *wcs = state->id_txwcs;
7502 	uint_t numwcs = state->id_txwcs_size;
7503 	ibd_wqe_t *wqe;
7504 	ibd_swqe_t *head, *tail;
7505 	ibt_wc_t *wc;
7506 	uint_t num_polled;
7507 	int i;
7508 
7509 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7510 		head = tail = NULL;
7511 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7512 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7513 			if (wc->wc_status != IBT_WC_SUCCESS) {
7514 				/*
7515 				 * Channel being torn down.
7516 				 */
7517 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7518 					DPRINT(5, "ibd_drain_scq: flush error");
7519 					DPRINT(10, "ibd_drain_scq: Bad "
7520 					    "status %d", wc->wc_status);
7521 				} else {
7522 					DPRINT(10, "ibd_drain_scq: "
7523 					    "unexpected wc_status %d",
7524 					    wc->wc_status);
7525 				}
7526 				/*
7527 				 * Fallthrough to invoke the Tx handler to
7528 				 * release held resources, e.g., AH refcount.
7529 				 */
7530 			}
7531 			/*
7532 			 * Add this swqe to the list to be cleaned up.
7533 			 */
7534 			if (head)
7535 				tail->swqe_next = wqe;
7536 			else
7537 				head = WQE_TO_SWQE(wqe);
7538 			tail = WQE_TO_SWQE(wqe);
7539 		}
7540 		tail->swqe_next = NULL;
7541 		ibd_tx_cleanup_list(state, head, tail);
7542 
7543 		/*
7544 		 * Resume any blocked transmissions if possible
7545 		 */
7546 		ibd_resume_transmission(state);
7547 	}
7548 }
7549 
7550 /*
7551  * Poll and fully drain the receive cq
7552  */
7553 static void
7554 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7555 {
7556 	ibt_wc_t *wcs = state->id_rxwcs;
7557 	uint_t numwcs = state->id_rxwcs_size;
7558 	ibd_rwqe_t *rwqe;
7559 	ibt_wc_t *wc;
7560 	uint_t num_polled;
7561 	int i;
7562 	mblk_t *head, *tail, *mp;
7563 
7564 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7565 		head = tail = NULL;
7566 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7567 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7568 			if (wc->wc_status != IBT_WC_SUCCESS) {
7569 				/*
7570 				 * Channel being torn down.
7571 				 */
7572 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7573 					DPRINT(5, "ibd_drain_rcq: "
7574 					    "expected flushed rwqe");
7575 				} else {
7576 					DPRINT(5, "ibd_drain_rcq: "
7577 					    "unexpected wc_status %d",
7578 					    wc->wc_status);
7579 				}
7580 				atomic_inc_32(
7581 				    &state->id_rx_list.dl_bufs_outstanding);
7582 				freemsg(rwqe->rwqe_im_mblk);
7583 				continue;
7584 			}
7585 			mp = ibd_process_rx(state, rwqe, wc);
7586 			if (mp == NULL)
7587 				continue;
7588 
7589 			/*
7590 			 * Add this mp to the list to send to the nw layer.
7591 			 */
7592 			if (head)
7593 				tail->b_next = mp;
7594 			else
7595 				head = mp;
7596 			tail = mp;
7597 		}
7598 		if (head)
7599 			mac_rx(state->id_mh, state->id_rh, head);
7600 
7601 		/*
7602 		 * Account for #rwqes polled.
7603 		 * Post more here, if less than one fourth full.
7604 		 */
7605 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7606 		    (state->id_ud_num_rwqe / 4))
7607 			ibd_post_recv_intr(state);
7608 	}
7609 }
7610 
7611 /*
7612  * Common code for interrupt handling as well as for polling
7613  * for all completed wqe's while detaching.
7614  */
7615 static void
7616 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7617 {
7618 	int flag, redo_flag;
7619 	int redo = 1;
7620 
7621 	flag = IBD_CQ_POLLING;
7622 	redo_flag = IBD_REDO_CQ_POLLING;
7623 
7624 	mutex_enter(&state->id_scq_poll_lock);
7625 	if (state->id_scq_poll_busy & flag) {
7626 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7627 		state->id_scq_poll_busy |= redo_flag;
7628 		mutex_exit(&state->id_scq_poll_lock);
7629 		return;
7630 	}
7631 	state->id_scq_poll_busy |= flag;
7632 	mutex_exit(&state->id_scq_poll_lock);
7633 
7634 	/*
7635 	 * In some cases (eg detaching), this code can be invoked on
7636 	 * any cpu after disabling cq notification (thus no concurrency
7637 	 * exists). Apart from that, the following applies normally:
7638 	 * Transmit completion handling could be from any cpu if
7639 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7640 	 * is interrupt driven.
7641 	 */
7642 
7643 	/*
7644 	 * Poll and drain the CQ
7645 	 */
7646 	ibd_drain_scq(state, cq_hdl);
7647 
7648 	/*
7649 	 * Enable CQ notifications and redrain the cq to catch any
7650 	 * completions we might have missed after the ibd_drain_scq()
7651 	 * above and before the ibt_enable_cq_notify() that follows.
7652 	 * Finally, service any new requests to poll the cq that
7653 	 * could've come in after the ibt_enable_cq_notify().
7654 	 */
7655 	do {
7656 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7657 		    IBT_SUCCESS) {
7658 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7659 		}
7660 
7661 		ibd_drain_scq(state, cq_hdl);
7662 
7663 		mutex_enter(&state->id_scq_poll_lock);
7664 		if (state->id_scq_poll_busy & redo_flag)
7665 			state->id_scq_poll_busy &= ~redo_flag;
7666 		else {
7667 			state->id_scq_poll_busy &= ~flag;
7668 			redo = 0;
7669 		}
7670 		mutex_exit(&state->id_scq_poll_lock);
7671 
7672 	} while (redo);
7673 }
7674 
7675 /*
7676  * Common code for interrupt handling as well as for polling
7677  * for all completed wqe's while detaching.
7678  */
7679 static void
7680 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7681 {
7682 	int flag, redo_flag;
7683 	int redo = 1;
7684 
7685 	flag = IBD_CQ_POLLING;
7686 	redo_flag = IBD_REDO_CQ_POLLING;
7687 
7688 	mutex_enter(&state->id_rcq_poll_lock);
7689 	if (state->id_rcq_poll_busy & flag) {
7690 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7691 		state->id_rcq_poll_busy |= redo_flag;
7692 		mutex_exit(&state->id_rcq_poll_lock);
7693 		return;
7694 	}
7695 	state->id_rcq_poll_busy |= flag;
7696 	mutex_exit(&state->id_rcq_poll_lock);
7697 
7698 	/*
7699 	 * Poll and drain the CQ
7700 	 */
7701 	ibd_drain_rcq(state, rcq);
7702 
7703 	/*
7704 	 * Enable CQ notifications and redrain the cq to catch any
7705 	 * completions we might have missed after the ibd_drain_cq()
7706 	 * above and before the ibt_enable_cq_notify() that follows.
7707 	 * Finally, service any new requests to poll the cq that
7708 	 * could've come in after the ibt_enable_cq_notify().
7709 	 */
7710 	do {
7711 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7712 		    IBT_SUCCESS) {
7713 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7714 		}
7715 
7716 		ibd_drain_rcq(state, rcq);
7717 
7718 		mutex_enter(&state->id_rcq_poll_lock);
7719 		if (state->id_rcq_poll_busy & redo_flag)
7720 			state->id_rcq_poll_busy &= ~redo_flag;
7721 		else {
7722 			state->id_rcq_poll_busy &= ~flag;
7723 			redo = 0;
7724 		}
7725 		mutex_exit(&state->id_rcq_poll_lock);
7726 
7727 	} while (redo);
7728 }
7729 
7730 /*
7731  * Unmap the memory area associated with a given swqe.
7732  */
7733 void
7734 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7735 {
7736 	ibt_status_t stat;
7737 
7738 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7739 
7740 	if (swqe->w_mi_hdl) {
7741 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7742 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
7743 			DPRINT(10,
7744 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7745 		}
7746 		swqe->w_mi_hdl = NULL;
7747 	}
7748 	swqe->w_swr.wr_nds = 0;
7749 }
7750 
7751 void
7752 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7753 {
7754 	/*
7755 	 * The recycling logic can be eliminated from here
7756 	 * and put into the async thread if we create another
7757 	 * list to hold ACE's for unjoined mcg's.
7758 	 */
7759 	if (DEC_REF_DO_CYCLE(ace)) {
7760 		ibd_mce_t *mce;
7761 
7762 		/*
7763 		 * Check with the lock taken: we decremented
7764 		 * reference count without the lock, and some
7765 		 * transmitter might already have bumped the
7766 		 * reference count (possible in case of multicast
7767 		 * disable when we leave the AH on the active
7768 		 * list). If not still 0, get out, leaving the
7769 		 * recycle bit intact.
7770 		 *
7771 		 * Atomically transition the AH from active
7772 		 * to free list, and queue a work request to
7773 		 * leave the group and destroy the mce. No
7774 		 * transmitter can be looking at the AH or
7775 		 * the MCE in between, since we have the
7776 		 * ac_mutex lock. In the SendOnly reap case,
7777 		 * it is not necessary to hold the ac_mutex
7778 		 * and recheck the ref count (since the AH was
7779 		 * taken off the active list), we just do it
7780 		 * to have uniform processing with the Full
7781 		 * reap case.
7782 		 */
7783 		mutex_enter(&state->id_ac_mutex);
7784 		mce = ace->ac_mce;
7785 		if (GET_REF_CYCLE(ace) == 0) {
7786 			CLEAR_REFCYCLE(ace);
7787 			/*
7788 			 * Identify the case of fullmember reap as
7789 			 * opposed to mcg trap reap. Also, port up
7790 			 * might set ac_mce to NULL to indicate Tx
7791 			 * cleanup should do no more than put the
7792 			 * AH in the free list (see ibd_async_link).
7793 			 */
7794 			if (mce != NULL) {
7795 				ace->ac_mce = NULL;
7796 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7797 				/*
7798 				 * mc_req was initialized at mce
7799 				 * creation time.
7800 				 */
7801 				ibd_queue_work_slot(state,
7802 				    &mce->mc_req, IBD_ASYNC_REAP);
7803 			}
7804 			IBD_ACACHE_INSERT_FREE(state, ace);
7805 		}
7806 		mutex_exit(&state->id_ac_mutex);
7807 	}
7808 }
7809 
7810 /*
7811  * Common code that deals with clean ups after a successful or
7812  * erroneous transmission attempt.
7813  */
7814 static void
7815 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7816 {
7817 	ibd_ace_t *ace = swqe->w_ahandle;
7818 
7819 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7820 
7821 	/*
7822 	 * If this was a dynamic mapping in ibd_send(), we need to
7823 	 * unmap here. If this was an lso buffer we'd used for sending,
7824 	 * we need to release the lso buf to the pool, since the resource
7825 	 * is scarce. However, if this was simply a normal send using
7826 	 * the copybuf (present in each swqe), we don't need to release it.
7827 	 */
7828 	if (swqe->swqe_im_mblk != NULL) {
7829 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
7830 			ibd_unmap_mem(state, swqe);
7831 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7832 			ibd_release_lsobufs(state,
7833 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7834 		}
7835 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7836 		freemsg(swqe->swqe_im_mblk);
7837 		swqe->swqe_im_mblk = NULL;
7838 	}
7839 
7840 	/*
7841 	 * Drop the reference count on the AH; it can be reused
7842 	 * now for a different destination if there are no more
7843 	 * posted sends that will use it. This can be eliminated
7844 	 * if we can always associate each Tx buffer with an AH.
7845 	 * The ace can be null if we are cleaning up from the
7846 	 * ibd_send() error path.
7847 	 */
7848 	if (ace != NULL) {
7849 		ibd_dec_ref_ace(state, ace);
7850 	}
7851 
7852 	/*
7853 	 * Release the send wqe for reuse.
7854 	 */
7855 	swqe->swqe_next = NULL;
7856 	ibd_release_swqe(state, swqe, swqe, 1);
7857 }
7858 
7859 static void
7860 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7861 {
7862 	ibd_ace_t *ace;
7863 	ibd_swqe_t *swqe;
7864 	int n = 0;
7865 
7866 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7867 
7868 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7869 
7870 		/*
7871 		 * If this was a dynamic mapping in ibd_send(), we need to
7872 		 * unmap here. If this was an lso buffer we'd used for sending,
7873 		 * we need to release the lso buf to the pool, since the
7874 		 * resource is scarce. However, if this was simply a normal
7875 		 * send using the copybuf (present in each swqe), we don't need
7876 		 * to release it.
7877 		 */
7878 		if (swqe->swqe_im_mblk != NULL) {
7879 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
7880 				ibd_unmap_mem(state, swqe);
7881 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7882 				ibd_release_lsobufs(state,
7883 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7884 			}
7885 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7886 			freemsg(swqe->swqe_im_mblk);
7887 			swqe->swqe_im_mblk = NULL;
7888 		}
7889 
7890 		/*
7891 		 * Drop the reference count on the AH; it can be reused
7892 		 * now for a different destination if there are no more
7893 		 * posted sends that will use it. This can be eliminated
7894 		 * if we can always associate each Tx buffer with an AH.
7895 		 * The ace can be null if we are cleaning up from the
7896 		 * ibd_send() error path.
7897 		 */
7898 		ace = swqe->w_ahandle;
7899 		if (ace != NULL) {
7900 			ibd_dec_ref_ace(state, ace);
7901 		}
7902 		n++;
7903 	}
7904 
7905 	/*
7906 	 * Release the send wqes for reuse.
7907 	 */
7908 	ibd_release_swqe(state, head, tail, n);
7909 }
7910 
7911 /*
7912  * Processing to be done after receipt of a packet; hand off to GLD
7913  * in the format expected by GLD.  The received packet has this
7914  * format: 2b sap :: 00 :: data.
7915  */
7916 static mblk_t *
7917 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7918 {
7919 	ib_header_info_t *phdr;
7920 	mblk_t *mp;
7921 	ipoib_hdr_t *ipibp;
7922 	ipha_t *iphap;
7923 	ip6_t *ip6h;
7924 	int len;
7925 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7926 	uint32_t bufs;
7927 
7928 	/*
7929 	 * Track number handed to upper layer that need to be returned.
7930 	 */
7931 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7932 
7933 	/* Never run out of rwqes, use allocb when running low */
7934 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
7935 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7936 		atomic_inc_32(&state->id_rx_allocb);
7937 		mp = allocb(pkt_len, BPRI_HI);
7938 		if (mp) {
7939 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7940 			ibd_post_recv(state, rwqe);
7941 		} else {	/* no memory */
7942 			atomic_inc_32(&state->id_rx_allocb_failed);
7943 			ibd_post_recv(state, rwqe);
7944 			return (NULL);
7945 		}
7946 	} else {
7947 		mp = rwqe->rwqe_im_mblk;
7948 	}
7949 
7950 
7951 	/*
7952 	 * Adjust write pointer depending on how much data came in.
7953 	 */
7954 	mp->b_wptr = mp->b_rptr + pkt_len;
7955 
7956 	/*
7957 	 * Make sure this is NULL or we're in trouble.
7958 	 */
7959 	if (mp->b_next != NULL) {
7960 		ibd_print_warn(state,
7961 		    "ibd_process_rx: got duplicate mp from rcq?");
7962 		mp->b_next = NULL;
7963 	}
7964 
7965 	/*
7966 	 * the IB link will deliver one of the IB link layer
7967 	 * headers called, the Global Routing Header (GRH).
7968 	 * ibd driver uses the information in GRH to build the
7969 	 * Header_info structure and pass it with the datagram up
7970 	 * to GLDv3.
7971 	 * If the GRH is not valid, indicate to GLDv3 by setting
7972 	 * the VerTcFlow field to 0.
7973 	 */
7974 	phdr = (ib_header_info_t *)mp->b_rptr;
7975 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7976 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7977 
7978 		/* if it is loop back packet, just drop it. */
7979 		if (state->id_enable_rc) {
7980 			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7981 			    &state->rc_macaddr_loopback,
7982 			    IPOIB_ADDRL) == 0) {
7983 				freemsg(mp);
7984 				return (NULL);
7985 			}
7986 		} else {
7987 			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7988 			    IPOIB_ADDRL) == 0) {
7989 				freemsg(mp);
7990 				return (NULL);
7991 			}
7992 		}
7993 
7994 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7995 		    sizeof (ipoib_mac_t));
7996 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
7997 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
7998 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
7999 		} else {
8000 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
8001 		}
8002 	} else {
8003 		/*
8004 		 * It can not be a IBA multicast packet. Must have been
8005 		 * unicast for us. Just copy the interface address to dst.
8006 		 */
8007 		phdr->ib_grh.ipoib_vertcflow = 0;
8008 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
8009 		    sizeof (ipoib_mac_t));
8010 	}
8011 
8012 	/*
8013 	 * For ND6 packets, padding is at the front of the source/target
8014 	 * lladdr. However the inet6 layer is not aware of it, hence remove
8015 	 * the padding from such packets.
8016 	 */
8017 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
8018 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
8019 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8020 		len = ntohs(ip6h->ip6_plen);
8021 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8022 			/* LINTED: E_CONSTANT_CONDITION */
8023 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
8024 		}
8025 	}
8026 
8027 	/*
8028 	 * Update statistics
8029 	 */
8030 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
8031 	atomic_inc_64(&state->id_rcv_pkt);
8032 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
8033 		atomic_inc_64(&state->id_brd_rcv);
8034 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
8035 		atomic_inc_64(&state->id_multi_rcv);
8036 
8037 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8038 	/*
8039 	 * Set receive checksum status in mp
8040 	 * Hardware checksumming can be considered valid only if:
8041 	 * 1. CQE.IP_OK bit is set
8042 	 * 2. CQE.CKSUM = 0xffff
8043 	 * 3. IPv6 routing header is not present in the packet
8044 	 * 4. If there are no IP_OPTIONS in the IP HEADER
8045 	 */
8046 
8047 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
8048 	    (wc->wc_cksum == 0xFFFF) &&
8049 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
8050 		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
8051 	}
8052 
8053 	return (mp);
8054 }
8055 
8056 /*
8057  * Callback code invoked from STREAMs when the receive data buffer is
8058  * free for recycling.
8059  */
8060 static void
8061 ibd_freemsg_cb(char *arg)
8062 {
8063 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8064 	ibd_state_t *state = rwqe->w_state;
8065 
8066 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8067 
8068 	/*
8069 	 * If the driver is stopped, just free the rwqe.
8070 	 */
8071 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8072 		DPRINT(6, "ibd_freemsg: wqe being freed");
8073 		rwqe->rwqe_im_mblk = NULL;
8074 		ibd_free_rwqe(state, rwqe);
8075 		return;
8076 	}
8077 
8078 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8079 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8080 	if (rwqe->rwqe_im_mblk == NULL) {
8081 		ibd_free_rwqe(state, rwqe);
8082 		DPRINT(6, "ibd_freemsg: desballoc failed");
8083 		return;
8084 	}
8085 
8086 	ibd_post_recv(state, rwqe);
8087 }
8088 
8089 static uint_t
8090 ibd_tx_recycle(caddr_t arg)
8091 {
8092 	ibd_state_t *state = (ibd_state_t *)arg;
8093 
8094 	/*
8095 	 * Poll for completed entries
8096 	 */
8097 	ibd_poll_scq(state, state->id_scq_hdl);
8098 
8099 	return (DDI_INTR_CLAIMED);
8100 }
8101 
8102 #ifdef IBD_LOGGING
8103 static void
8104 ibd_log_init(void)
8105 {
8106 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8107 	ibd_lbuf_ndx = 0;
8108 
8109 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8110 }
8111 
8112 static void
8113 ibd_log_fini(void)
8114 {
8115 	if (ibd_lbuf)
8116 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
8117 	ibd_lbuf_ndx = 0;
8118 	ibd_lbuf = NULL;
8119 
8120 	mutex_destroy(&ibd_lbuf_lock);
8121 }
8122 
8123 static void
8124 ibd_log(const char *fmt, ...)
8125 {
8126 	va_list	ap;
8127 	uint32_t off;
8128 	uint32_t msglen;
8129 	char tmpbuf[IBD_DMAX_LINE];
8130 
8131 	if (ibd_lbuf == NULL)
8132 		return;
8133 
8134 	va_start(ap, fmt);
8135 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8136 	va_end(ap);
8137 
8138 	if (msglen >= IBD_DMAX_LINE)
8139 		msglen = IBD_DMAX_LINE - 1;
8140 
8141 	mutex_enter(&ibd_lbuf_lock);
8142 
8143 	off = ibd_lbuf_ndx;		/* current msg should go here */
8144 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8145 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8146 
8147 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
8148 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
8149 
8150 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8151 		ibd_lbuf_ndx = 0;
8152 
8153 	mutex_exit(&ibd_lbuf_lock);
8154 
8155 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
8156 }
8157 #endif
8158 
8159 /* ARGSUSED */
8160 static int
8161 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8162     int *rvalp)
8163 {
8164 	ibd_create_ioctl_t	*cmd = karg;
8165 	ibd_state_t		*state, *port_state, *p;
8166 	int			i, err, rval = 0;
8167 	mac_register_t		*macp;
8168 	ibt_hca_portinfo_t 	*pinfop = NULL;
8169 	ibt_status_t 		ibt_status;
8170 	uint_t 			psize, pinfosz;
8171 	boolean_t		force_create = B_FALSE;
8172 
8173 	cmd->ibdioc.ioc_status = 0;
8174 
8175 	if (cmd->ibdioc.ioc_port_inst < 0) {
8176 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8177 		return (EINVAL);
8178 	}
8179 	port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8180 	if (port_state == NULL) {
8181 		DPRINT(10, "ibd_create_partition: failed to get state %d",
8182 		    cmd->ibdioc.ioc_port_inst);
8183 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8184 		return (EINVAL);
8185 	}
8186 
8187 	/* Limited PKeys not supported */
8188 	if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8189 		rval = EINVAL;
8190 		goto part_create_return;
8191 	}
8192 
8193 	if (cmd->ioc_force_create == 0) {
8194 		/*
8195 		 * Check if the port pkey table contains the pkey for which
8196 		 * this partition is being created.
8197 		 */
8198 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8199 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8200 
8201 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8202 			rval = EINVAL;
8203 			goto part_create_return;
8204 		}
8205 
8206 		if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8207 			rval = ENETDOWN;
8208 			cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8209 			goto part_create_return;
8210 		}
8211 
8212 		for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8213 			if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8214 				break;
8215 			}
8216 		}
8217 		if (i == pinfop->p_pkey_tbl_sz) {
8218 			rval = EINVAL;
8219 			cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8220 			goto part_create_return;
8221 		}
8222 	} else {
8223 		force_create = B_TRUE;
8224 	}
8225 
8226 	mutex_enter(&ibd_objlist_lock);
8227 	for (p = ibd_objlist_head; p; p = p->id_next) {
8228 		if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8229 		    (p->id_pkey == cmd->ioc_pkey) &&
8230 		    (p->id_plinkid == cmd->ioc_partid)) {
8231 			mutex_exit(&ibd_objlist_lock);
8232 			rval = EEXIST;
8233 			cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8234 			goto part_create_return;
8235 		}
8236 	}
8237 	mutex_exit(&ibd_objlist_lock);
8238 
8239 	state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8240 
8241 	state->id_type		= IBD_PARTITION_OBJ;
8242 
8243 	state->id_plinkid	= cmd->ioc_partid;
8244 	state->id_dlinkid	= cmd->ibdioc.ioc_linkid;
8245 	state->id_port_inst	= cmd->ibdioc.ioc_port_inst;
8246 
8247 	state->id_dip		= port_state->id_dip;
8248 	state->id_port		= port_state->id_port;
8249 	state->id_pkey		= cmd->ioc_pkey;
8250 	state->id_hca_guid	= port_state->id_hca_guid;
8251 	state->id_port_guid	= port_state->id_port_guid;
8252 	state->id_force_create	= force_create;
8253 
8254 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8255 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8256 
8257 	if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8258 		rval = EIO;
8259 		cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8260 		goto fail;
8261 	}
8262 
8263 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8264 		rval = EAGAIN;
8265 		goto fail;
8266 	}
8267 
8268 	macp->m_type_ident	= MAC_PLUGIN_IDENT_IB;
8269 	macp->m_dip		= port_state->id_dip;
8270 	macp->m_instance	= (uint_t)-1;
8271 	macp->m_driver		= state;
8272 	macp->m_src_addr	= (uint8_t *)&state->id_macaddr;
8273 	macp->m_callbacks	= &ibd_m_callbacks;
8274 	macp->m_min_sdu		= 0;
8275 	if (state->id_enable_rc) {
8276 		macp->m_max_sdu		= IBD_DEF_RC_MAX_SDU;
8277 	} else {
8278 		macp->m_max_sdu		= IBD_DEF_MAX_SDU;
8279 	}
8280 	macp->m_priv_props = ibd_priv_props;
8281 
8282 	err = mac_register(macp, &state->id_mh);
8283 	mac_free(macp);
8284 
8285 	if (err != 0) {
8286 		DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8287 		    err);
8288 		rval = err;
8289 		goto fail;
8290 	}
8291 
8292 	err = dls_devnet_create(state->id_mh,
8293 	    cmd->ioc_partid, crgetzoneid(credp));
8294 	if (err != 0) {
8295 		DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8296 		    "%d", err);
8297 		rval = err;
8298 		(void) mac_unregister(state->id_mh);
8299 		goto fail;
8300 	}
8301 
8302 	/*
8303 	 * Add the new partition state structure to the list
8304 	 */
8305 	mutex_enter(&ibd_objlist_lock);
8306 	if (ibd_objlist_head)
8307 		state->id_next = ibd_objlist_head;
8308 
8309 	ibd_objlist_head = state;
8310 	mutex_exit(&ibd_objlist_lock);
8311 
8312 part_create_return:
8313 	if (pinfop) {
8314 		ibt_free_portinfo(pinfop, pinfosz);
8315 	}
8316 	return (rval);
8317 
8318 fail:
8319 	if (pinfop) {
8320 		ibt_free_portinfo(pinfop, pinfosz);
8321 	}
8322 	ibd_part_unattach(state);
8323 	kmem_free(state, sizeof (ibd_state_t));
8324 	return (rval);
8325 }
8326 
8327 /* ARGSUSED */
8328 static int
8329 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8330     int *rvalp)
8331 {
8332 	int err;
8333 	datalink_id_t tmpid;
8334 	ibd_state_t *node, *prev;
8335 	ibd_delete_ioctl_t *cmd = karg;
8336 
8337 	prev = NULL;
8338 
8339 	mutex_enter(&ibd_objlist_lock);
8340 	node = ibd_objlist_head;
8341 
8342 	/* Find the ibd state structure corresponding to the partition */
8343 	while (node != NULL) {
8344 		if (node->id_plinkid == cmd->ioc_partid)
8345 			break;
8346 		prev = node;
8347 		node = node->id_next;
8348 	}
8349 
8350 	if (node == NULL) {
8351 		mutex_exit(&ibd_objlist_lock);
8352 		return (ENOENT);
8353 	}
8354 
8355 	if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8356 		DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8357 		    "%d", err);
8358 		mutex_exit(&ibd_objlist_lock);
8359 		return (err);
8360 	}
8361 
8362 	/*
8363 	 * Call ibd_part_unattach() only after making sure that the instance has
8364 	 * not been started yet and is also not in late hca init mode.
8365 	 */
8366 	ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8367 
8368 	err = 0;
8369 	if ((node->id_mac_state & IBD_DRV_STARTED) ||
8370 	    (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8371 	    (ibd_part_busy(node) != DDI_SUCCESS) ||
8372 	    ((err = mac_disable(node->id_mh)) != 0)) {
8373 		(void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8374 		    crgetzoneid(credp));
8375 		ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8376 		mutex_exit(&ibd_objlist_lock);
8377 		return (err != 0 ? err : EBUSY);
8378 	}
8379 
8380 	node->id_mac_state |= IBD_DRV_IN_DELETION;
8381 
8382 	ibd_part_unattach(node);
8383 
8384 	ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8385 
8386 	/* Remove the partition state structure from the linked list */
8387 	if (prev == NULL)
8388 		ibd_objlist_head = node->id_next;
8389 	else
8390 		prev->id_next = node->id_next;
8391 	mutex_exit(&ibd_objlist_lock);
8392 
8393 	if ((err = mac_unregister(node->id_mh)) != 0) {
8394 		DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8395 		    err);
8396 	}
8397 
8398 	cv_destroy(&node->id_macst_cv);
8399 	mutex_destroy(&node->id_macst_lock);
8400 
8401 	kmem_free(node, sizeof (ibd_state_t));
8402 
8403 	return (0);
8404 }
8405 
8406 /* ARGSUSED */
8407 static int
8408 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8409     int *rvalp)
8410 {
8411 	ibd_ioctl_t		cmd;
8412 	ibpart_ioctl_t		partioc;
8413 	ibport_ioctl_t		portioc;
8414 #ifdef _MULTI_DATAMODEL
8415 	ibport_ioctl32_t	portioc32;
8416 #endif
8417 	ibd_state_t		*state, *port_state;
8418 	int			size;
8419 	ibt_hca_portinfo_t 	*pinfop = NULL;
8420 	ibt_status_t 		ibt_status;
8421 	uint_t 			psize, pinfosz;
8422 	int			rval = 0;
8423 
8424 	size = sizeof (ibd_ioctl_t);
8425 	if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8426 		return (EFAULT);
8427 	}
8428 	cmd.ioc_status = 0;
8429 	switch (cmd.ioc_info_cmd) {
8430 	case IBD_INFO_CMD_IBPART:
8431 		size = sizeof (ibpart_ioctl_t);
8432 		if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8433 			return (EFAULT);
8434 		}
8435 
8436 		mutex_enter(&ibd_objlist_lock);
8437 		/* Find the ibd state structure corresponding the partition */
8438 		for (state = ibd_objlist_head; state; state = state->id_next) {
8439 			if (state->id_plinkid == cmd.ioc_linkid) {
8440 				break;
8441 			}
8442 		}
8443 
8444 		if (state == NULL) {
8445 			mutex_exit(&ibd_objlist_lock);
8446 			return (ENOENT);
8447 		}
8448 
8449 		partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8450 		partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8451 		partioc.ibdioc.ioc_portnum = state->id_port;
8452 		partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8453 		partioc.ibdioc.ioc_portguid = state->id_port_guid;
8454 		partioc.ibdioc.ioc_status = 0;
8455 		partioc.ioc_partid = state->id_plinkid;
8456 		partioc.ioc_pkey = state->id_pkey;
8457 		partioc.ioc_force_create = state->id_force_create;
8458 		if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8459 			mutex_exit(&ibd_objlist_lock);
8460 			return (EFAULT);
8461 		}
8462 		mutex_exit(&ibd_objlist_lock);
8463 
8464 		break;
8465 
8466 	case IBD_INFO_CMD_IBPORT:
8467 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8468 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8469 			DPRINT(10, "ibd_create_partition: failed to get"
8470 			    " state %d", cmd.ioc_port_inst);
8471 			size = sizeof (ibd_ioctl_t);
8472 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8473 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8474 			    mode)) {
8475 				return (EFAULT);
8476 			}
8477 			return (EINVAL);
8478 		}
8479 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8480 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8481 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8482 			return (EINVAL);
8483 		}
8484 #ifdef _MULTI_DATAMODEL
8485 		switch (ddi_model_convert_from(mode & FMODELS)) {
8486 		case DDI_MODEL_ILP32: {
8487 			size = sizeof (ibport_ioctl32_t);
8488 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8489 				rval = EFAULT;
8490 				goto fail;
8491 			}
8492 			portioc32.ibdioc.ioc_status = 0;
8493 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8494 			portioc32.ibdioc.ioc_hcaguid =
8495 			    port_state->id_hca_guid;
8496 			portioc32.ibdioc.ioc_portguid =
8497 			    port_state->id_port_guid;
8498 			if (portioc32.ioc_pkey_tbl_sz !=
8499 			    pinfop->p_pkey_tbl_sz) {
8500 				rval = EINVAL;
8501 				size = sizeof (ibd_ioctl_t);
8502 				portioc32.ibdioc.ioc_status =
8503 				    IBD_INVALID_PKEY_TBL_SIZE;
8504 				if (ddi_copyout((void *)&portioc32.ibdioc,
8505 				    (void *)arg, size, mode)) {
8506 					rval = EFAULT;
8507 					goto fail;
8508 				}
8509 				goto fail;
8510 			}
8511 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8512 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8513 			    (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8514 			    mode)) {
8515 				rval = EFAULT;
8516 				goto fail;
8517 			}
8518 			size = sizeof (ibport_ioctl32_t);
8519 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8520 			    mode)) {
8521 				rval = EFAULT;
8522 				goto fail;
8523 			}
8524 			break;
8525 		}
8526 		case DDI_MODEL_NONE:
8527 			size = sizeof (ibport_ioctl_t);
8528 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8529 				rval = EFAULT;
8530 				goto fail;
8531 			}
8532 			portioc.ibdioc.ioc_status = 0;
8533 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8534 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8535 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8536 			if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8537 				rval = EINVAL;
8538 				size = sizeof (ibd_ioctl_t);
8539 				portioc.ibdioc.ioc_status =
8540 				    IBD_INVALID_PKEY_TBL_SIZE;
8541 				if (ddi_copyout((void *)&portioc.ibdioc,
8542 				    (void *)arg, size, mode)) {
8543 					rval = EFAULT;
8544 					goto fail;
8545 				}
8546 				goto fail;
8547 			}
8548 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8549 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8550 			    (void *)(portioc.ioc_pkeys), size, mode)) {
8551 				rval = EFAULT;
8552 				goto fail;
8553 			}
8554 			size = sizeof (ibport_ioctl_t);
8555 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8556 			    mode)) {
8557 				rval = EFAULT;
8558 				goto fail;
8559 			}
8560 			break;
8561 		}
8562 #else /* ! _MULTI_DATAMODEL */
8563 		size = sizeof (ibport_ioctl_t);
8564 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8565 			rval = EFAULT;
8566 			goto fail;
8567 		}
8568 		portioc.ibdioc.ioc_status = 0;
8569 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8570 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8571 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8572 		if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8573 			rval = EINVAL;
8574 			size = sizeof (ibd_ioctl_t);
8575 			portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8576 			if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8577 			    size, mode)) {
8578 				rval = EFAULT;
8579 				goto fail;
8580 			}
8581 			goto fail;
8582 		}
8583 		size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8584 		if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8585 		    (void *)(portioc.ioc_pkeys), size, mode)) {
8586 			rval = EFAULT;
8587 			goto fail;
8588 		}
8589 		size = sizeof (ibport_ioctl_t);
8590 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8591 		    mode)) {
8592 			rval = EFAULT;
8593 			goto fail;
8594 		}
8595 #endif /* _MULTI_DATAMODEL */
8596 
8597 		break;
8598 
8599 	case IBD_INFO_CMD_PKEYTBLSZ:
8600 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8601 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8602 			DPRINT(10, "ibd_create_partition: failed to get"
8603 			    " state %d", cmd.ioc_port_inst);
8604 			size = sizeof (ibd_ioctl_t);
8605 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8606 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8607 			    mode)) {
8608 				return (EFAULT);
8609 			}
8610 			return (EINVAL);
8611 		}
8612 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8613 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8614 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8615 			return (EINVAL);
8616 		}
8617 #ifdef _MULTI_DATAMODEL
8618 		switch (ddi_model_convert_from(mode & FMODELS)) {
8619 		case DDI_MODEL_ILP32: {
8620 			size = sizeof (ibport_ioctl32_t);
8621 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8622 				rval = EFAULT;
8623 				goto fail;
8624 			}
8625 			portioc32.ibdioc.ioc_status = 0;
8626 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8627 			portioc32.ibdioc.ioc_hcaguid =
8628 			    port_state->id_hca_guid;
8629 			portioc32.ibdioc.ioc_portguid =
8630 			    port_state->id_port_guid;
8631 			portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8632 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8633 			    mode)) {
8634 				rval = EFAULT;
8635 				goto fail;
8636 			}
8637 			break;
8638 		}
8639 		case DDI_MODEL_NONE:
8640 			size = sizeof (ibport_ioctl_t);
8641 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8642 				rval = EFAULT;
8643 				goto fail;
8644 			}
8645 			portioc.ibdioc.ioc_status = 0;
8646 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8647 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8648 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8649 			portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8650 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8651 			    mode)) {
8652 				rval = EFAULT;
8653 				goto fail;
8654 			}
8655 			break;
8656 		}
8657 #else /* ! _MULTI_DATAMODEL */
8658 		size = sizeof (ibport_ioctl_t);
8659 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8660 			rval = EFAULT;
8661 			goto fail;
8662 		}
8663 		portioc.ibdioc.ioc_status = 0;
8664 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8665 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8666 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8667 		portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8668 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8669 		    mode)) {
8670 			rval = EFAULT;
8671 			goto fail;
8672 		}
8673 #endif /* _MULTI_DATAMODEL */
8674 		break;
8675 
8676 	default:
8677 		return (EINVAL);
8678 
8679 	} /* switch (cmd.ioc_info_cmd) */
8680 fail:
8681 	if (pinfop) {
8682 		ibt_free_portinfo(pinfop, pinfosz);
8683 	}
8684 	return (rval);
8685 }
8686 
8687 /* ARGSUSED */
8688 static void
8689 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8690     ibt_async_code_t code, ibt_async_event_t *event)
8691 {
8692 	ibd_state_t *state = (ibd_state_t *)arg;
8693 	link_state_t	lstate;
8694 
8695 	switch (code) {
8696 	case IBT_EVENT_PORT_UP:
8697 	case IBT_ERROR_PORT_DOWN:
8698 		if (ibd_get_port_state(state, &lstate) != 0)
8699 			break;
8700 
8701 		if (state->id_link_state != lstate) {
8702 			state->id_link_state = lstate;
8703 			mac_link_update(state->id_mh, lstate);
8704 		}
8705 		break;
8706 	default:
8707 		break;
8708 	}
8709 }
8710 
8711 static int
8712 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8713 {
8714 	ibt_hca_portinfo_t *port_infop;
8715 	uint_t psize, port_infosz;
8716 	ibt_status_t	ret;
8717 
8718 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8719 	    &port_infop, &psize, &port_infosz);
8720 	if ((ret != IBT_SUCCESS) || (psize != 1))
8721 		return (-1);
8722 
8723 	state->id_sgid = *port_infop->p_sgid_tbl;
8724 	state->id_link_speed = ibd_get_portspeed(state);
8725 
8726 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8727 		*lstate = LINK_STATE_UP;
8728 	else
8729 		*lstate = LINK_STATE_DOWN;
8730 
8731 	ibt_free_portinfo(port_infop, port_infosz);
8732 	return (0);
8733 }
8734 
8735 static int
8736 ibd_port_attach(dev_info_t *dip)
8737 {
8738 	ibd_state_t		*state;
8739 	link_state_t		lstate;
8740 	int			instance;
8741 	ibt_status_t		ret;
8742 
8743 	/*
8744 	 * Allocate softstate structure
8745 	 */
8746 	instance = ddi_get_instance(dip);
8747 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8748 		DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8749 		return (DDI_FAILURE);
8750 	}
8751 
8752 	state = ddi_get_soft_state(ibd_list, instance);
8753 
8754 	state->id_dip = dip;
8755 	state->id_type = IBD_PORT_DRIVER;
8756 
8757 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8758 	    "port-number", 0)) == 0) {
8759 		DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8760 		    state->id_port);
8761 		return (DDI_FAILURE);
8762 	}
8763 	if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8764 	    "hca-guid", 0)) == 0) {
8765 		DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8766 		    state->id_hca_guid);
8767 		return (DDI_FAILURE);
8768 	}
8769 	if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8770 	    "port-guid", 0)) == 0) {
8771 		DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8772 		    state->id_port_guid);
8773 		return (DDI_FAILURE);
8774 	}
8775 
8776 	/*
8777 	 * Attach to IBTL
8778 	 */
8779 	if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8780 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
8781 		DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8782 		    ret);
8783 		goto done;
8784 	}
8785 
8786 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8787 
8788 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8789 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
8790 		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8791 		    ret);
8792 		goto done;
8793 	}
8794 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
8795 
8796 	/* Update link status */
8797 
8798 	if (ibd_get_port_state(state, &lstate) != 0) {
8799 		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8800 		    ret);
8801 		goto done;
8802 	}
8803 	state->id_link_state = lstate;
8804 	/*
8805 	 * Register ibd interfaces with the Nemo framework
8806 	 */
8807 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8808 		DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8809 		goto done;
8810 	}
8811 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8812 
8813 	mac_link_update(state->id_mh, lstate);
8814 
8815 	return (DDI_SUCCESS);
8816 done:
8817 	(void) ibd_port_unattach(state, dip);
8818 	return (DDI_FAILURE);
8819 }
8820 
8821 static int
8822 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8823 {
8824 	int instance;
8825 	uint32_t progress = state->id_mac_state;
8826 	ibt_status_t ret;
8827 
8828 	if (progress & IBD_DRV_MAC_REGISTERED) {
8829 		(void) mac_unregister(state->id_mh);
8830 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8831 	}
8832 
8833 	if (progress & IBD_DRV_HCA_OPENED) {
8834 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8835 		    IBT_SUCCESS) {
8836 			ibd_print_warn(state, "failed to close "
8837 			    "HCA device, ret=%d", ret);
8838 		}
8839 		state->id_hca_hdl = NULL;
8840 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8841 	}
8842 
8843 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8844 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8845 			ibd_print_warn(state,
8846 			    "ibt_detach() failed, ret=%d", ret);
8847 		}
8848 		state->id_ibt_hdl = NULL;
8849 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8850 	}
8851 	instance = ddi_get_instance(dip);
8852 	ddi_soft_state_free(ibd_list, instance);
8853 
8854 	return (DDI_SUCCESS);
8855 }
8856 
8857 ibt_status_t
8858 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8859 {
8860 	ibd_state_t	*state;
8861 
8862 	mutex_enter(&ibd_objlist_lock);
8863 
8864 	/* Find the ibd state structure corresponding the partition */
8865 	for (state = ibd_objlist_head; state; state = state->id_next) {
8866 		if (state->id_plinkid == linkid) {
8867 			break;
8868 		}
8869 	}
8870 
8871 	if (state == NULL) {
8872 		mutex_exit(&ibd_objlist_lock);
8873 		return (IBT_NO_SUCH_OBJECT);
8874 	}
8875 
8876 	attr->pa_dlinkid = state->id_dlinkid;
8877 	attr->pa_plinkid = state->id_plinkid;
8878 	attr->pa_port = state->id_port;
8879 	attr->pa_hca_guid = state->id_hca_guid;
8880 	attr->pa_port_guid = state->id_port_guid;
8881 	attr->pa_pkey = state->id_pkey;
8882 
8883 	mutex_exit(&ibd_objlist_lock);
8884 
8885 	return (IBT_SUCCESS);
8886 }
8887 
8888 ibt_status_t
8889 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8890 {
8891 	ibd_state_t	*state;
8892 	int		n = 0;
8893 	ibt_part_attr_t	*attr;
8894 
8895 	mutex_enter(&ibd_objlist_lock);
8896 
8897 	for (state = ibd_objlist_head; state; state = state->id_next)
8898 		n++;
8899 
8900 	*nparts = n;
8901 	if (n == 0) {
8902 		*attr_list = NULL;
8903 		mutex_exit(&ibd_objlist_lock);
8904 		return (IBT_SUCCESS);
8905 	}
8906 
8907 	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8908 	attr = *attr_list;
8909 	for (state = ibd_objlist_head; state; state = state->id_next) {
8910 #ifdef DEBUG
8911 		ASSERT(n > 0);
8912 		n--;
8913 #endif
8914 		attr->pa_dlinkid = state->id_dlinkid;
8915 		attr->pa_plinkid = state->id_plinkid;
8916 		attr->pa_port = state->id_port;
8917 		attr->pa_hca_guid = state->id_hca_guid;
8918 		attr->pa_port_guid = state->id_port_guid;
8919 		attr->pa_pkey = state->id_pkey;
8920 		attr++;
8921 	}
8922 
8923 	mutex_exit(&ibd_objlist_lock);
8924 	return (IBT_SUCCESS);
8925 }
8926