xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 03514dd70879e522caeae9bc4b36d18c43e15a43)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * An implementation of the IPoIB standard based on PSARC 2001/289.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/sysmacros.h>	/* for offsetof */
44 #include <sys/disp.h>		/* for async thread pri */
45 #include <sys/atomic.h>		/* for atomic_add*() */
46 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
47 #include <netinet/in.h>		/* for netinet/ip.h below */
48 #include <netinet/ip.h>		/* for struct ip */
49 #include <netinet/udp.h>	/* for struct udphdr */
50 #include <inet/common.h>	/* for inet/ip.h below */
51 #include <inet/ip.h>		/* for ipha_t */
52 #include <inet/ip6.h>		/* for ip6_t */
53 #include <inet/tcp.h>		/* for tcph_t */
54 #include <netinet/icmp6.h>	/* for icmp6_t */
55 #include <sys/callb.h>
56 #include <sys/modhash.h>
57 
58 #include <sys/ib/clients/ibd/ibd.h>
59 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
60 #include <sys/note.h>
61 #include <sys/multidata.h>
62 
63 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
64 
65 #include <sys/priv_names.h>
66 #include <sys/dls.h>
67 #include <sys/dld_ioc.h>
68 #include <sys/policy.h>
69 #include <sys/ibpart.h>
70 #include <sys/file.h>
71 
72 /*
73  * The write-up below includes details on the following:
74  * 1. The dladm administrative model.
75  * 2. Late HCA initialization feature.
76  * 3. Brussels support and its implications for the current architecture.
77  *
78  * 1. The dladm administrative model.
79  * ------------------------------------------
80  * With the dladm model, ibnex will create one ibd instance per port. These
81  * instances will be created independent of the port state.
82  *
83  * The ibd driver is two-faceted: one side of it works as the port driver and
84  * the other as the partition object driver.
85  *
86  * The port instance is a child of the HCA and has an entry in devfs.
87  * A DDI attach happens only for the port driver, and its attach is
88  * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89  * handled in ibd_port_unattach().
90  *
91  * The partition object is only a registrant to the mac layer via mac_register()
92  * and does not have an entry in the device tree. There is no DDI softstate
93  * managed by the DDI framework for the partition objects. However, the state is
94  * managed inside the ibd driver, and every partition object hangs off the
95  * "ibd_objlist_head".
96  *
97  * The partition object first comes into existence when a user runs the
98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
99  * point of the partition object. The partition object goes away with the
100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101  * point of the partition object.
102  *
103  * The create-part and delete-part subcommands result in dld ioctls that end up
104  * calling ibd_create_partition() and ibd_delete_partition(), respectively.
105  * These ioctls are registered with the dld layer in _init() via a call to
106  * dld_ioc_register().
107  *
108  * The port instance by itself cannot be plumbed. Only the partition
109  * objects can be plumbed, and they alone participate in I/O; the port
110  * driver does not.
111  *
112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
113  * display useful information. The info entry point for ibd is
114  * ibd_get_partition_info().
115  *
116  * 2. Late HCA initialization feature.
117  * ------------------------------------
118  * As mentioned in section 1, the user creates the partition objects via
119  * dladm(1M). It is possible that:
120  * a) The physical port itself is down and the SM cannot be reached.
121  * b) The PKEY specified by the user has not been created in the SM yet.
122  * c) An IPoIB broadcast group for the specified PKEY is not present.
123  *
124  * In all of the above cases, complete initialization of the partition object is
125  * not possible. However, the new model allows the creation of partition
126  * objects even in such cases, deferring their initialization until later.
127  * When such a partition object is plumbed, the link state will be displayed as
128  * "down".
129  * The driver, at this point, is listening to events that herald the
130  * availability of resources -
131  * i)   LINK_UP when the link becomes available
132  * ii)  PORT_CHANGE when the PKEY has been created
133  * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134  * created
135  * These events arrive via ibd_async_handler() for i) and ii), and via
136  * ibd_snet_notices_handler() for iii).
137  * The driver handles these events (as and when they arrive) and completes the
138  * initialization of the partition object and transitions it to a usable state.
139  *
140  * 3. Brussels support and its implications for the current architecture.
141  * ---------------------------------------------------------------------
142  * The Brussels support introduces two new interfaces to the ibd driver -
143  * ibd_m_getprop() and ibd_m_setprop().
144  * These interfaces allow setting and retrieval of certain properties.
145  * Some of them are public properties while most others are private properties
146  * meant to be used by developers. Tuning the latter kind can cause
147  * performance issues and should not be done without understanding the
148  * implications. All properties are specific to an instance of either the
149  * partition object or the port driver.
150  *
151  * The public properties are: mtu and linkmode.
152  * mtu is a read-only property.
153  * linkmode can take two values - UD and CM.
154  *
155  * Changing the linkmode requires some bookkeeping in the driver. The
156  * capabilities need to be re-reported to the mac layer. This is done by
157  * calling mac_capab_update().  The maxsdu is updated by calling
158  * mac_maxsdu_update().
159  * The private properties retain their values across the change of linkmode.
160  * NOTE:
161  * - The port driver does not support any property apart from mtu.
162  * - All other properties are only meant for the partition object.
163  * - The properties cannot be set when an instance is plumbed. The
164  * instance has to be unplumbed to effect any setting.
165  */
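/*
 * Purely for illustration (not taken from this file; the link and partition
 * names are hypothetical), the administrative model above is typically
 * exercised with dladm(1M) along these lines; see dladm(1M) for the
 * authoritative syntax:
 *
 *	# create a partition object over port link ibp0 with PKEY 0xffff;
 *	# this reaches the driver via ibd_create_partition()
 *	dladm create-part -l ibp0 -P 0xffff part0
 *
 *	# switch the (unplumbed) partition from UD to Connected Mode
 *	dladm set-linkprop -p linkmode=cm part0
 *
 *	# remove the partition object; this reaches ibd_delete_partition()
 *	dladm delete-part part0
 */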
166 
167 /*
168  * Driver wide tunables
169  *
170  * ibd_tx_softintr
171  * ibd_rx_softintr
172  *     The softintr mechanism allows ibd to avoid event queue overflows if
173  *     the receive/completion handlers turn out to be expensive. These are enabled
174  *     by default.
175  *
176  * ibd_log_sz
177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
179  *
180  */
181 uint_t ibd_rx_softintr = 1;
182 uint_t ibd_tx_softintr = 1;
183 
184 #ifdef IBD_LOGGING
185 uint_t ibd_log_sz = 0x20000;
186 #endif
187 
188 #ifdef IBD_LOGGING
189 #define	IBD_LOG_SZ			ibd_log_sz
190 #endif
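/*
 * A sketch only: since these are plain module-global variables, they can
 * presumably be overridden the same way as other driver tunables, e.g. in
 * /etc/system (assuming the kernel module is delivered under the name "ibd"):
 *
 *	set ibd:ibd_rx_softintr = 0
 *	set ibd:ibd_tx_softintr = 0
 */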
191 
192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 #define	IBD_RX_POST_CNT			8
194 
195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
196 #define	IBD_LOG_RX_POST			4
197 
198 /* Minimum number of receive work requests the driver needs to always have */
199 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
200 
201 /*
202  * LSO parameters
203  */
204 #define	IBD_LSO_MAXLEN			65536
205 #define	IBD_LSO_BUFSZ			8192
206 
207 /*
208  * Async operation states
209  */
210 #define	IBD_OP_NOTSTARTED		0
211 #define	IBD_OP_ONGOING			1
212 #define	IBD_OP_COMPLETED		2
213 #define	IBD_OP_ERRORED			3
214 #define	IBD_OP_ROUTERED			4
215 
216 /*
217  * Start/stop in-progress flags; note that restart must always remain
218  * the OR of start and stop flag values.
219  */
220 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
221 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
222 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
223 #define	IBD_DRV_DELETE_IN_PROGRESS	IBD_DRV_RESTART_IN_PROGRESS
224 
225 /*
226  * Miscellaneous constants
227  */
228 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
229 #define	IBD_DEF_MAX_SDU			2044
230 #define	IBD_DEF_MAX_MTU			(IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
231 #define	IBD_DEF_RC_MAX_SDU		65520
232 #define	IBD_DEF_RC_MAX_MTU		(IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
233 #define	IBD_DEFAULT_QKEY		0xB1B
234 #ifdef IBD_LOGGING
235 #define	IBD_DMAX_LINE			100
236 #endif
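/*
 * Informational note on the defaults above (an assumption, not stated in
 * this file): IBD_DEF_MAX_SDU of 2044 presumably corresponds to a 2048-byte
 * UD path MTU less the 4-byte IPoIB encapsulation header (IPOIB_HDRSIZE),
 * and 0xB1B is the conventional IPoIB broadcast group Q_Key.
 */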
237 
238 /*
239  * Enumerations for link states
240  */
241 typedef enum {
242 	IBD_LINK_DOWN,
243 	IBD_LINK_UP,
244 	IBD_LINK_UP_ABSENT
245 } ibd_link_op_t;
246 
247 /*
248  * Driver State Pointer
249  */
250 void *ibd_list;
251 
252 /*
253  * Driver Global Data
254  */
255 ibd_global_state_t ibd_gstate;
256 
257 /*
258  * Partition object list
259  */
260 ibd_state_t	*ibd_objlist_head = NULL;
261 kmutex_t	ibd_objlist_lock;
262 
263 int ibd_rc_conn_timeout = 60 * 10;	/* 10 minutes */
264 
265 /*
266  * Logging
267  */
268 #ifdef IBD_LOGGING
269 kmutex_t ibd_lbuf_lock;
270 uint8_t *ibd_lbuf;
271 uint32_t ibd_lbuf_ndx;
272 #endif
273 
274 /*
275  * Required system entry points
276  */
277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
279 
280 /*
281  * Required driver entry points for GLDv3
282  */
283 static int ibd_m_stat(void *, uint_t, uint64_t *);
284 static int ibd_m_start(void *);
285 static void ibd_m_stop(void *);
286 static int ibd_m_promisc(void *, boolean_t);
287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
288 static int ibd_m_unicst(void *, const uint8_t *);
289 static mblk_t *ibd_m_tx(void *, mblk_t *);
290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
291 
292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
293     const void *);
294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
296     mac_prop_info_handle_t);
297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
298     const void *);
299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
300 
301 /*
302  * Private driver entry points for GLDv3
303  */
304 
305 /*
306  * Initialization
307  */
308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
309 static int ibd_init_txlist(ibd_state_t *);
310 static int ibd_init_rxlist(ibd_state_t *);
311 static int ibd_acache_init(ibd_state_t *);
312 #ifdef IBD_LOGGING
313 static void ibd_log_init(void);
314 #endif
315 
316 /*
317  * Termination/cleanup
318  */
319 static void ibd_state_fini(ibd_state_t *);
320 static void ibd_fini_txlist(ibd_state_t *);
321 static void ibd_fini_rxlist(ibd_state_t *);
322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
324 static void ibd_acache_fini(ibd_state_t *);
325 #ifdef IBD_LOGGING
326 static void ibd_log_fini(void);
327 #endif
328 
329 /*
330  * Allocation/acquire/map routines
331  */
332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
337     uint32_t *);
338 
339 /*
340  * Free/release/unmap routines
341  */
342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
343 static void ibd_free_tx_copybufs(ibd_state_t *);
344 static void ibd_free_rx_copybufs(ibd_state_t *);
345 static void ibd_free_rx_rsrcs(ibd_state_t *);
346 static void ibd_free_tx_lsobufs(ibd_state_t *);
347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
350 
351 /*
352  * Handlers/callback routines
353  */
354 static uint_t ibd_intr(caddr_t);
355 static uint_t ibd_tx_recycle(caddr_t);
356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
362 static void ibd_freemsg_cb(char *);
363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
364     ibt_async_event_t *);
365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
366     ibt_async_event_t *);
367 static void ibd_snet_notices_handler(void *, ib_gid_t,
368     ibt_subnet_event_code_t, ibt_subnet_event_t *);
369 
370 /*
371  * Send/receive routines
372  */
373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
377 
378 /*
379  * Threads
380  */
381 static void ibd_async_work(ibd_state_t *);
382 
383 /*
384  * Async tasks
385  */
386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
388 static void ibd_async_setprom(ibd_state_t *);
389 static void ibd_async_unsetprom(ibd_state_t *);
390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
392 static void ibd_async_txsched(ibd_state_t *);
393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
394 
395 /*
396  * Async task helpers
397  */
398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
402     ipoib_mac_t *, ipoib_mac_t *);
403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
409 static uint64_t ibd_get_portspeed(ibd_state_t *);
410 static boolean_t ibd_async_safe(ibd_state_t *);
411 static void ibd_async_done(ibd_state_t *);
412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
416 
417 /*
418  * Helpers for attach/start routines
419  */
420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
421 static int ibd_record_capab(ibd_state_t *);
422 static int ibd_get_port_details(ibd_state_t *);
423 static int ibd_alloc_cqs(ibd_state_t *);
424 static int ibd_setup_ud_channel(ibd_state_t *);
425 static int ibd_start(ibd_state_t *);
426 static int ibd_undo_start(ibd_state_t *, link_state_t);
427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
430 static void ibd_part_unattach(ibd_state_t *state);
431 static int ibd_port_attach(dev_info_t *);
432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
434 static int ibd_part_busy(ibd_state_t *);
435 
436 /*
437  * Miscellaneous helpers
438  */
439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 static void ibd_resume_transmission(ibd_state_t *);
441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
443 static void *list_get_head(list_t *);
444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
446 
447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
449 
450 #ifdef IBD_LOGGING
451 static void ibd_log(const char *, ...);
452 #endif
453 
454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
455     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
456 
457 /* Module Driver Info */
458 static struct modldrv ibd_modldrv = {
459 	&mod_driverops,			/* This one is a driver */
460 	"InfiniBand GLDv3 Driver",	/* short description */
461 	&ibd_dev_ops			/* driver specific ops */
462 };
463 
464 /* Module Linkage */
465 static struct modlinkage ibd_modlinkage = {
466 	MODREV_1, (void *)&ibd_modldrv, NULL
467 };
468 
469 /*
470  * Module (static) info passed to IBTL during ibt_attach
471  */
472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
473 	IBTI_V_CURR,
474 	IBT_NETWORK,
475 	ibd_async_handler,
476 	NULL,
477 	"IBPART"
478 };
479 
480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
481 	IBTI_V_CURR,
482 	IBT_NETWORK,
483 	ibdpd_async_handler,
484 	NULL,
485 	"IPIB"
486 };
487 
488 /*
489  * GLDv3 entry points
490  */
491 #define	IBD_M_CALLBACK_FLAGS	\
492 	(MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
493 
494 static mac_callbacks_t ibd_m_callbacks = {
495 	IBD_M_CALLBACK_FLAGS,
496 	ibd_m_stat,
497 	ibd_m_start,
498 	ibd_m_stop,
499 	ibd_m_promisc,
500 	ibd_m_multicst,
501 	ibd_m_unicst,
502 	ibd_m_tx,
503 	NULL,
504 	NULL,
505 	ibd_m_getcapab,
506 	NULL,
507 	NULL,
508 	ibd_m_setprop,
509 	ibd_m_getprop,
510 	ibd_m_propinfo
511 };
512 
513 /* Private properties */
514 char *ibd_priv_props[] = {
515 	"_ibd_broadcast_group",
516 	"_ibd_coalesce_completions",
517 	"_ibd_create_broadcast_group",
518 	"_ibd_hash_size",
519 	"_ibd_lso_enable",
520 	"_ibd_num_ah",
521 	"_ibd_num_lso_bufs",
522 	"_ibd_rc_enable_srq",
523 	"_ibd_rc_num_rwqe",
524 	"_ibd_rc_num_srq",
525 	"_ibd_rc_num_swqe",
526 	"_ibd_rc_rx_comp_count",
527 	"_ibd_rc_rx_comp_usec",
528 	"_ibd_rc_rx_copy_thresh",
529 	"_ibd_rc_rx_rwqe_thresh",
530 	"_ibd_rc_tx_comp_count",
531 	"_ibd_rc_tx_comp_usec",
532 	"_ibd_rc_tx_copy_thresh",
533 	"_ibd_ud_num_rwqe",
534 	"_ibd_ud_num_swqe",
535 	"_ibd_ud_rx_comp_count",
536 	"_ibd_ud_rx_comp_usec",
537 	"_ibd_ud_tx_comp_count",
538 	"_ibd_ud_tx_comp_usec",
539 	"_ibd_ud_tx_copy_thresh",
540 	NULL
541 };
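/*
 * For illustration only (hypothetical link name; a sketch, not from this
 * file): a private property such as _ibd_lso_enable would typically be
 * tuned on an unplumbed partition link with dladm(1M):
 *
 *	dladm set-linkprop -p _ibd_lso_enable=0 part0
 *	dladm show-linkprop -p _ibd_lso_enable part0
 */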
542 
543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
546 
547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
548 	{IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
549 	    ibd_create_partition, secpolicy_dl_config},
550 	{IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
551 	    ibd_delete_partition, secpolicy_dl_config},
552 	{IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
553 	    ibd_get_partition_info, NULL}
554 };
555 
556 /*
557  * Fill/clear <scope> and <p_key> in multicast/broadcast address
558  */
559 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
560 {							\
561 	*(uint32_t *)((char *)(maddr) + 4) |=		\
562 	    htonl((uint32_t)(scope) << 16);		\
563 	*(uint32_t *)((char *)(maddr) + 8) |=		\
564 	    htonl((uint32_t)(pkey) << 16);		\
565 }
566 
567 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
568 {							\
569 	*(uint32_t *)((char *)(maddr) + 4) &=		\
570 	    htonl(~((uint32_t)0xF << 16));		\
571 	*(uint32_t *)((char *)(maddr) + 8) &=		\
572 	    htonl(~((uint32_t)0xFFFF << 16));		\
573 }
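/*
 * A rough sketch of the 20-byte ipoib_mac_t these macros operate on
 * (byte offsets into the address, following the IPoIB multicast GID
 * format; informational only):
 *
 *	 0..3	QPN (IB_MC_QPN for multicast)
 *	 4	0xFF
 *	 5	flags / <scope> nibble
 *	 6..7	IPoIB signature (0x401B for IPv4, 0x601B for IPv6)
 *	 8..9	<p_key>
 *	10..19	remainder of the group ID
 *
 * IBD_FILL_SCOPE_PKEY() ORs the scope nibble into byte 5 and the pkey into
 * bytes 8-9; IBD_CLEAR_SCOPE_PKEY() masks those same fields back to zero.
 */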
574 
575 /*
576  * Rudimentary debugging support
577  */
578 #ifdef DEBUG
579 int ibd_debuglevel = 100;
580 void
581 debug_print(int l, char *fmt, ...)
582 {
583 	va_list ap;
584 
585 	if (l < ibd_debuglevel)
586 		return;
587 	va_start(ap, fmt);
588 	vcmn_err(CE_CONT, fmt, ap);
589 	va_end(ap);
590 }
591 #endif
592 
593 /*
594  * Common routine to print warning messages; adds in hca guid, port number
595  * and pkey to be able to identify the IBA interface.
596  */
597 void
598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
599 {
600 	ib_guid_t hca_guid;
601 	char ibd_print_buf[256];
602 	int len;
603 	va_list ap;
604 
605 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
606 	    0, "hca-guid", 0);
607 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
608 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
609 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
610 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
611 	va_start(ap, fmt);
612 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
613 	    fmt, ap);
614 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
615 	va_end(ap);
616 }
617 
618 /*
619  * Warlock directives
620  */
621 
622 /*
623  * id_lso_lock
624  *
625  * state->id_lso->bkt_nfree may be accessed without a lock to
626  * determine the threshold at which we have to ask the nw layer
627  * to resume transmission (see ibd_resume_transmission()).
628  */
629 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
630     ibd_state_t::id_lso))
631 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
632 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
633 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
634 
635 /*
636  * id_scq_poll_lock
637  */
638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
639     ibd_state_t::id_scq_poll_busy))
640 
641 /*
642  * id_txpost_lock
643  */
644 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
645     ibd_state_t::id_tx_head))
646 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
647     ibd_state_t::id_tx_busy))
648 
649 /*
650  * id_acache_req_lock
651  */
652 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
653     ibd_state_t::id_acache_req_cv))
654 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
655     ibd_state_t::id_req_list))
656 _NOTE(SCHEME_PROTECTS_DATA("atomic",
657     ibd_acache_s::ac_ref))
658 
659 /*
660  * id_ac_mutex
661  *
662  * This mutex is actually supposed to protect id_ah_op as well,
663  * but this path of the code isn't clean (see update of id_ah_op
664  * in ibd_async_acache(), immediately after the call to
665  * ibd_async_mcache()). For now, we'll skip this check by
666  * declaring that id_ah_op is protected by some internal scheme
667  * that warlock isn't aware of.
668  */
669 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
670     ibd_state_t::id_ah_active))
671 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
672     ibd_state_t::id_ah_free))
673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
674     ibd_state_t::id_ah_addr))
675 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
676     ibd_state_t::id_ah_op))
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
678     ibd_state_t::id_ah_error))
679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
680     ibd_state_t::id_ac_hot_ace))
681 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
682 
683 /*
684  * id_mc_mutex
685  */
686 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
687     ibd_state_t::id_mc_full))
688 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
689     ibd_state_t::id_mc_non))
690 
691 /*
692  * id_trap_lock
693  */
694 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
695     ibd_state_t::id_trap_cv))
696 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
697     ibd_state_t::id_trap_stop))
698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
699     ibd_state_t::id_trap_inprog))
700 
701 /*
702  * id_prom_op
703  */
704 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
705     ibd_state_t::id_prom_op))
706 
707 /*
708  * id_sched_lock
709  */
710 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
711     ibd_state_t::id_sched_needed))
712 
713 /*
714  * id_link_mutex
715  */
716 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
717     ibd_state_t::id_link_state))
718 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
719 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
720     ibd_state_t::id_link_speed))
721 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
722 
723 /*
724  * id_tx_list.dl_mutex
725  */
726 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
727     ibd_state_t::id_tx_list.dl_head))
728 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
729     ibd_state_t::id_tx_list.dl_pending_sends))
730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
731     ibd_state_t::id_tx_list.dl_cnt))
732 
733 /*
734  * id_rx_list.dl_mutex
735  */
736 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
737     ibd_state_t::id_rx_list.dl_bufs_outstanding))
738 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
739     ibd_state_t::id_rx_list.dl_cnt))
740 
741 /*
742  * rc_timeout_lock
743  */
744 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
745     ibd_state_t::rc_timeout_start))
746 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
747     ibd_state_t::rc_timeout))
748 
749 
750 /*
751  * Items protected by atomic updates
752  */
753 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
754     ibd_state_s::id_brd_rcv
755     ibd_state_s::id_brd_xmt
756     ibd_state_s::id_multi_rcv
757     ibd_state_s::id_multi_xmt
758     ibd_state_s::id_num_intrs
759     ibd_state_s::id_rcv_bytes
760     ibd_state_s::id_rcv_pkt
761     ibd_state_s::id_rx_post_queue_index
762     ibd_state_s::id_tx_short
763     ibd_state_s::id_xmt_bytes
764     ibd_state_s::id_xmt_pkt
765     ibd_state_s::rc_rcv_trans_byte
766     ibd_state_s::rc_rcv_trans_pkt
767     ibd_state_s::rc_rcv_copy_byte
768     ibd_state_s::rc_rcv_copy_pkt
769     ibd_state_s::rc_xmt_bytes
770     ibd_state_s::rc_xmt_small_pkt
771     ibd_state_s::rc_xmt_fragmented_pkt
772     ibd_state_s::rc_xmt_map_fail_pkt
773     ibd_state_s::rc_xmt_map_succ_pkt
774     ibd_rc_chan_s::rcq_invoking))
775 
776 /*
777  * Non-mutex protection schemes for data elements. Almost all of
778  * these are non-shared items.
779  */
780 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
781     callb_cpr
782     ib_gid_s
783     ib_header_info
784     ibd_acache_rq
785     ibd_acache_s::ac_mce
786     ibd_acache_s::ac_chan
787     ibd_mcache::mc_fullreap
788     ibd_mcache::mc_jstate
789     ibd_mcache::mc_req
790     ibd_rwqe_s
791     ibd_swqe_s
792     ibd_wqe_s
793     ibt_wr_ds_s::ds_va
794     ibt_wr_lso_s
795     ipoib_mac::ipoib_qpn
796     mac_capab_lso_s
797     msgb::b_next
798     msgb::b_cont
799     msgb::b_rptr
800     msgb::b_wptr
801     ibd_state_s::id_bgroup_created
802     ibd_state_s::id_mac_state
803     ibd_state_s::id_mtu
804     ibd_state_s::id_ud_num_rwqe
805     ibd_state_s::id_ud_num_swqe
806     ibd_state_s::id_qpnum
807     ibd_state_s::id_rcq_hdl
808     ibd_state_s::id_rx_buf_sz
809     ibd_state_s::id_rx_bufs
810     ibd_state_s::id_rx_mr_hdl
811     ibd_state_s::id_rx_wqes
812     ibd_state_s::id_rxwcs
813     ibd_state_s::id_rxwcs_size
814     ibd_state_s::id_rx_nqueues
815     ibd_state_s::id_rx_queues
816     ibd_state_s::id_scope
817     ibd_state_s::id_scq_hdl
818     ibd_state_s::id_tx_buf_sz
819     ibd_state_s::id_tx_bufs
820     ibd_state_s::id_tx_mr_hdl
821     ibd_state_s::id_tx_rel_list.dl_cnt
822     ibd_state_s::id_tx_wqes
823     ibd_state_s::id_txwcs
824     ibd_state_s::id_txwcs_size
825     ibd_state_s::rc_listen_hdl
826     ibd_state_s::rc_listen_hdl_OFED_interop
827     ibd_state_s::rc_srq_size
828     ibd_state_s::rc_srq_rwqes
829     ibd_state_s::rc_srq_rx_bufs
830     ibd_state_s::rc_srq_rx_mr_hdl
831     ibd_state_s::rc_tx_largebuf_desc_base
832     ibd_state_s::rc_tx_mr_bufs
833     ibd_state_s::rc_tx_mr_hdl
834     ipha_s
835     icmph_s
836     ibt_path_info_s::pi_sid
837     ibd_rc_chan_s::ace
838     ibd_rc_chan_s::chan_hdl
839     ibd_rc_chan_s::state
840     ibd_rc_chan_s::chan_state
841     ibd_rc_chan_s::is_tx_chan
842     ibd_rc_chan_s::rcq_hdl
843     ibd_rc_chan_s::rcq_size
844     ibd_rc_chan_s::scq_hdl
845     ibd_rc_chan_s::scq_size
846     ibd_rc_chan_s::rx_bufs
847     ibd_rc_chan_s::rx_mr_hdl
848     ibd_rc_chan_s::rx_rwqes
849     ibd_rc_chan_s::tx_wqes
850     ibd_rc_chan_s::tx_mr_bufs
851     ibd_rc_chan_s::tx_mr_hdl
852     ibd_rc_chan_s::tx_rel_list.dl_cnt
853     ibd_rc_chan_s::is_used
854     ibd_rc_tx_largebuf_s::lb_buf
855     ibd_rc_msg_hello_s
856     ibt_cm_return_args_s))
857 
858 /*
859  * ibd_rc_chan_s::next is protected by two mutexes:
860  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
861  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
862  */
863 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
864     ibd_rc_chan_s::next))
865 
866 /*
867  * ibd_state_s.rc_tx_large_bufs_lock
868  */
869 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
870     ibd_state_s::rc_tx_largebuf_free_head))
871 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
872     ibd_state_s::rc_tx_largebuf_nfree))
873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
874     ibd_rc_tx_largebuf_s::lb_next))
875 
876 /*
877  * ibd_acache_s.tx_too_big_mutex
878  */
879 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
880     ibd_acache_s::tx_too_big_ongoing))
881 
882 /*
883  * tx_wqe_list.dl_mutex
884  */
885 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
886     ibd_rc_chan_s::tx_wqe_list.dl_head))
887 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
888     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
890     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
891 
892 /*
893  * ibd_state_s.rc_ace_recycle_lock
894  */
895 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
896     ibd_state_s::rc_ace_recycle))
897 
898 /*
899  * rc_srq_rwqe_list.dl_mutex
900  */
901 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
902     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
903 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
904     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
905 
906 /*
907  * Non-mutex protection schemes for data elements. They are counters
908  * for problem diagnosis. They do not need to be protected.
909  */
910 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
911     ibd_state_s::rc_rcv_alloc_fail
912     ibd_state_s::rc_rcq_err
913     ibd_state_s::rc_ace_not_found
914     ibd_state_s::rc_xmt_drop_too_long_pkt
915     ibd_state_s::rc_xmt_icmp_too_long_pkt
916     ibd_state_s::rc_xmt_reenter_too_long_pkt
917     ibd_state_s::rc_swqe_short
918     ibd_state_s::rc_swqe_mac_update
919     ibd_state_s::rc_xmt_buf_short
920     ibd_state_s::rc_xmt_buf_mac_update
921     ibd_state_s::rc_scq_no_swqe
922     ibd_state_s::rc_scq_no_largebuf
923     ibd_state_s::rc_conn_succ
924     ibd_state_s::rc_conn_fail
925     ibd_state_s::rc_null_conn
926     ibd_state_s::rc_no_estab_conn
927     ibd_state_s::rc_act_close
928     ibd_state_s::rc_pas_close
929     ibd_state_s::rc_delay_ace_recycle
930     ibd_state_s::rc_act_close_simultaneous
931     ibd_state_s::rc_act_close_not_clean
932     ibd_state_s::rc_pas_close_rcq_invoking
933     ibd_state_s::rc_reset_cnt
934     ibd_state_s::rc_timeout_act
935     ibd_state_s::rc_timeout_pas
936     ibd_state_s::rc_stop_connect))
937 
938 #ifdef DEBUG
939 /*
940  * Non-mutex protection schemes for data elements. They are counters
941  * for problem diagnosis. They do not need to be protected.
942  */
943 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
944     ibd_state_s::rc_rwqe_short
945     ibd_rc_stat_s::rc_rcv_trans_byte
946     ibd_rc_stat_s::rc_rcv_trans_pkt
947     ibd_rc_stat_s::rc_rcv_copy_byte
948     ibd_rc_stat_s::rc_rcv_copy_pkt
949     ibd_rc_stat_s::rc_rcv_alloc_fail
950     ibd_rc_stat_s::rc_rcq_err
951     ibd_rc_stat_s::rc_rwqe_short
952     ibd_rc_stat_s::rc_xmt_bytes
953     ibd_rc_stat_s::rc_xmt_small_pkt
954     ibd_rc_stat_s::rc_xmt_fragmented_pkt
955     ibd_rc_stat_s::rc_xmt_map_fail_pkt
956     ibd_rc_stat_s::rc_xmt_map_succ_pkt
957     ibd_rc_stat_s::rc_ace_not_found
958     ibd_rc_stat_s::rc_scq_no_swqe
959     ibd_rc_stat_s::rc_scq_no_largebuf
960     ibd_rc_stat_s::rc_swqe_short
961     ibd_rc_stat_s::rc_swqe_mac_update
962     ibd_rc_stat_s::rc_xmt_buf_short
963     ibd_rc_stat_s::rc_xmt_buf_mac_update
964     ibd_rc_stat_s::rc_conn_succ
965     ibd_rc_stat_s::rc_conn_fail
966     ibd_rc_stat_s::rc_null_conn
967     ibd_rc_stat_s::rc_no_estab_conn
968     ibd_rc_stat_s::rc_act_close
969     ibd_rc_stat_s::rc_pas_close
970     ibd_rc_stat_s::rc_delay_ace_recycle
971     ibd_rc_stat_s::rc_act_close_simultaneous
972     ibd_rc_stat_s::rc_reset_cnt
973     ibd_rc_stat_s::rc_timeout_act
974     ibd_rc_stat_s::rc_timeout_pas))
975 #endif
976 
977 int
978 _init()
979 {
980 	int status;
981 
982 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
983 	    PAGESIZE), 0);
984 	if (status != 0) {
985 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
986 		return (status);
987 	}
988 
989 	mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
990 
991 	mac_init_ops(&ibd_dev_ops, "ibp");
992 	status = mod_install(&ibd_modlinkage);
993 	if (status != 0) {
994 		DPRINT(10, "_init:failed in mod_install()");
995 		ddi_soft_state_fini(&ibd_list);
996 		mac_fini_ops(&ibd_dev_ops);
997 		return (status);
998 	}
999 
1000 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1001 	mutex_enter(&ibd_gstate.ig_mutex);
1002 	ibd_gstate.ig_ibt_hdl = NULL;
1003 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1004 	ibd_gstate.ig_service_list = NULL;
1005 	mutex_exit(&ibd_gstate.ig_mutex);
1006 
1007 	if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1008 	    DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1009 		return (EIO);
1010 	}
1011 
1012 	ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1013 
1014 #ifdef IBD_LOGGING
1015 	ibd_log_init();
1016 #endif
1017 	return (0);
1018 }
1019 
1020 int
1021 _info(struct modinfo *modinfop)
1022 {
1023 	return (mod_info(&ibd_modlinkage, modinfop));
1024 }
1025 
1026 int
1027 _fini()
1028 {
1029 	int status;
1030 
1031 	status = mod_remove(&ibd_modlinkage);
1032 	if (status != 0)
1033 		return (status);
1034 
1035 	ibt_unregister_part_attr_cb();
1036 
1037 	mac_fini_ops(&ibd_dev_ops);
1038 	mutex_destroy(&ibd_objlist_lock);
1039 	ddi_soft_state_fini(&ibd_list);
1040 	mutex_destroy(&ibd_gstate.ig_mutex);
1041 #ifdef IBD_LOGGING
1042 	ibd_log_fini();
1043 #endif
1044 	return (0);
1045 }
1046 
1047 /*
1048  * Convert the GID part of the mac address from network byte order
1049  * to host order.
1050  */
1051 static void
1052 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1053 {
1054 	ib_sn_prefix_t nbopref;
1055 	ib_guid_t nboguid;
1056 
1057 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1058 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1059 	dgid->gid_prefix = b2h64(nbopref);
1060 	dgid->gid_guid = b2h64(nboguid);
1061 }
1062 
1063 /*
1064  * Create the IPoIB address in network byte order from host order inputs.
1065  */
1066 static void
1067 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1068     ib_guid_t guid)
1069 {
1070 	ib_sn_prefix_t nbopref;
1071 	ib_guid_t nboguid;
1072 
1073 	mac->ipoib_qpn = htonl(qpn);
1074 	nbopref = h2b64(prefix);
1075 	nboguid = h2b64(guid);
1076 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1077 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1078 }
1079 
1080 /*
1081  * Send to the appropriate all-routers group when the IBA multicast group
1082  * does not exist, based on whether the target group is v4 or v6.
1083  */
1084 static boolean_t
1085 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1086     ipoib_mac_t *rmac)
1087 {
1088 	boolean_t retval = B_TRUE;
1089 	uint32_t adjscope = state->id_scope << 16;
1090 	uint32_t topword;
1091 
1092 	/*
1093 	 * Copy the first 4 bytes in without assuming any alignment of
1094 	 * input mac address; this will have IPoIB signature, flags and
1095 	 * scope bits.
1096 	 */
1097 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1098 	topword = ntohl(topword);
1099 
1100 	/*
1101 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1102 	 */
1103 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1104 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1105 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1106 		    ((uint32_t)(state->id_pkey << 16))),
1107 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1108 	else
1109 		/*
1110 		 * Does not have proper bits in the mgid address.
1111 		 */
1112 		retval = B_FALSE;
1113 
1114 	return (retval);
1115 }
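/*
 * Worked example (informational): for an IPv4 group with link-local scope
 * (0x2) and P_Key 0xFFFF, the MGID constructed above would look roughly
 * like FF12:401B:FFFF:0000:0000:0000:0000:0002, the low 64 bits being
 * INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP (i.e. 2, the all-routers
 * group suffix).
 */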
1116 
1117 /*
1118  * Membership states for different mcg's are tracked by two lists:
1119  * the "non" list is used for promiscuous mode, when all mcg traffic
1120  * needs to be inspected. This type of membership is never used for
1121  * transmission, so there cannot be an AH in the active list
1122  * corresponding to a member in this list. This list does not need
1123  * any protection, since all operations are performed by the async
1124  * thread.
1125  *
1126  * "Full" and "SendOnly" membership is tracked using a single list,
1127  * the "full" list. This is because this single list can then be
1128  * searched during transmit to a multicast group (if an AH for the
1129  * mcg is not found in the active list), since at least one type
1130  * of membership must be present before initiating the transmit.
1131  * This list is also emptied during driver detach, since sendonly
1132  * membership acquired during transmit is dropped at detach time
1133  * along with ipv4 broadcast full membership. Insert/deletes to
1134  * this list are done only by the async thread, but it is also
1135  * searched in program context (see multicast disable case), thus
1136  * the id_mc_mutex protects the list. The driver detach path also
1137  * deconstructs the "full" list, but it ensures that the async
1138  * thread will not be accessing the list (by blocking out mcg
1139  * trap handling and making sure no more Tx reaping will happen).
1140  *
1141  * Currently, an IBA attach is done in the SendOnly case too,
1142  * although this is not required.
1143  */
1144 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1145 	list_insert_head(&state->id_mc_full, mce)
1146 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1147 	list_insert_head(&state->id_mc_non, mce)
1148 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1149 	ibd_mcache_find(mgid, &state->id_mc_full)
1150 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1151 	ibd_mcache_find(mgid, &state->id_mc_non)
1152 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1153 	list_remove(&state->id_mc_full, mce)
1154 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1155 	list_remove(&state->id_mc_non, mce)
1156 
1157 static void *
1158 list_get_head(list_t *list)
1159 {
1160 	list_node_t *lhead = list_head(list);
1161 
1162 	if (lhead != NULL)
1163 		list_remove(list, lhead);
1164 	return (lhead);
1165 }
1166 
1167 /*
1168  * This is always guaranteed to be able to queue the work.
1169  */
1170 void
1171 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1172 {
1173 	/* Initialize request */
1174 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1175 	ptr->rq_op = op;
1176 
1177 	/*
1178 	 * Queue provided slot onto request pool.
1179 	 */
1180 	mutex_enter(&state->id_acache_req_lock);
1181 	list_insert_tail(&state->id_req_list, ptr);
1182 
1183 	/* Go, fetch, async thread */
1184 	cv_signal(&state->id_acache_req_cv);
1185 	mutex_exit(&state->id_acache_req_lock);
1186 }
1187 
1188 /*
1189  * Main body of the per interface async thread.
1190  */
1191 static void
1192 ibd_async_work(ibd_state_t *state)
1193 {
1194 	ibd_req_t *ptr;
1195 	callb_cpr_t cprinfo;
1196 
1197 	mutex_enter(&state->id_acache_req_lock);
1198 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1199 	    callb_generic_cpr, "ibd_async_work");
1200 
1201 	for (;;) {
1202 		ptr = list_get_head(&state->id_req_list);
1203 		if (ptr != NULL) {
1204 			mutex_exit(&state->id_acache_req_lock);
1205 
1206 			/*
1207 			 * If we are in late hca initialization mode, do not
1208 			 * process any async request other than TRAP. TRAP
1209 			 * indicates the creation of a broadcast group, in which
1210 			 * case we need to join/create the group.
1211 			 */
1212 			if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1213 			    (ptr->rq_op != IBD_ASYNC_TRAP)) {
1214 				goto free_req_and_continue;
1215 			}
1216 
1217 			/*
1218 			 * Once we have done the operation, there is no
1219 			 * guarantee the request slot is going to be valid;
1220 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1221 			 * TRAP).
1222 			 *
1223 			 * Perform the request.
1224 			 */
1225 			switch (ptr->rq_op) {
1226 				case IBD_ASYNC_GETAH:
1227 					ibd_async_acache(state, &ptr->rq_mac);
1228 					break;
1229 				case IBD_ASYNC_JOIN:
1230 				case IBD_ASYNC_LEAVE:
1231 					ibd_async_multicast(state,
1232 					    ptr->rq_gid, ptr->rq_op);
1233 					break;
1234 				case IBD_ASYNC_PROMON:
1235 					ibd_async_setprom(state);
1236 					break;
1237 				case IBD_ASYNC_PROMOFF:
1238 					ibd_async_unsetprom(state);
1239 					break;
1240 				case IBD_ASYNC_REAP:
1241 					ibd_async_reap_group(state,
1242 					    ptr->rq_ptr, ptr->rq_gid,
1243 					    IB_MC_JSTATE_FULL);
1244 					/*
1245 					 * the req buf is contained in the
1246 					 * mce structure, so we do not need
1247 					 * to free it here.
1248 					 */
1249 					ptr = NULL;
1250 					break;
1251 				case IBD_ASYNC_TRAP:
1252 					ibd_async_trap(state, ptr);
1253 					break;
1254 				case IBD_ASYNC_SCHED:
1255 					ibd_async_txsched(state);
1256 					break;
1257 				case IBD_ASYNC_LINK:
1258 					ibd_async_link(state, ptr);
1259 					break;
1260 				case IBD_ASYNC_EXIT:
1261 					mutex_enter(&state->id_acache_req_lock);
1262 #ifndef __lock_lint
1263 					CALLB_CPR_EXIT(&cprinfo);
1264 #else
1265 					mutex_exit(&state->id_acache_req_lock);
1266 #endif
1267 					return;
1268 				case IBD_ASYNC_RC_TOO_BIG:
1269 					ibd_async_rc_process_too_big(state,
1270 					    ptr);
1271 					break;
1272 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1273 					ibd_async_rc_close_act_chan(state, ptr);
1274 					break;
1275 				case IBD_ASYNC_RC_RECYCLE_ACE:
1276 					ibd_async_rc_recycle_ace(state, ptr);
1277 					break;
1278 				case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1279 					(void) ibd_rc_pas_close(ptr->rq_ptr,
1280 					    B_TRUE, B_TRUE);
1281 					break;
1282 			}
1283 free_req_and_continue:
1284 			if (ptr != NULL)
1285 				kmem_cache_free(state->id_req_kmc, ptr);
1286 
1287 			mutex_enter(&state->id_acache_req_lock);
1288 		} else {
1289 #ifndef __lock_lint
1290 			/*
1291 			 * Nothing to do: wait till new request arrives.
1292 			 */
1293 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1294 			cv_wait(&state->id_acache_req_cv,
1295 			    &state->id_acache_req_lock);
1296 			CALLB_CPR_SAFE_END(&cprinfo,
1297 			    &state->id_acache_req_lock);
1298 #endif
1299 		}
1300 	}
1301 
1302 	/*NOTREACHED*/
1303 	_NOTE(NOT_REACHED)
1304 }
1305 
1306 /*
1307  * Return when it is safe to queue requests to the async daemon; primarily
1308  * for subnet trap and async event handling. Disallow requests before the
1309  * daemon is created, and when interface deinitialization starts.
1310  */
1311 static boolean_t
1312 ibd_async_safe(ibd_state_t *state)
1313 {
1314 	mutex_enter(&state->id_trap_lock);
1315 	if (state->id_trap_stop) {
1316 		mutex_exit(&state->id_trap_lock);
1317 		return (B_FALSE);
1318 	}
1319 	state->id_trap_inprog++;
1320 	mutex_exit(&state->id_trap_lock);
1321 	return (B_TRUE);
1322 }
1323 
1324 /*
1325  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1326  * trap or event handling to complete to kill the async thread and deconstruct
1327  * the mcg/ace list.
1328  */
1329 static void
1330 ibd_async_done(ibd_state_t *state)
1331 {
1332 	mutex_enter(&state->id_trap_lock);
1333 	if (--state->id_trap_inprog == 0)
1334 		cv_signal(&state->id_trap_cv);
1335 	mutex_exit(&state->id_trap_lock);
1336 }
1337 
1338 /*
1339  * Hash functions:
1340  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1341  * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
1342  * These operate on mac addresses input into ibd_send, but there is no
1343  * guarantee on the alignment of the ipoib_mac_t structure.
1344  */
1345 /*ARGSUSED*/
1346 static uint_t
1347 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1348 {
1349 	ulong_t ptraddr = (ulong_t)key;
1350 	uint_t hval;
1351 
1352 	/*
1353 	 * If the input address is 4 byte aligned, we can just dereference
1354 	 * it. This is most common, since IP will send in a 4 byte aligned
1355  * IP header, which implies the 24 byte IPoIB pseudo header will be
1356 	 * 4 byte aligned too.
1357 	 */
1358 	if ((ptraddr & 3) == 0)
1359 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1360 
1361 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1362 	return (hval);
1363 }
1364 
1365 static int
1366 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1367 {
1368 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1369 		return (0);
1370 	else
1371 		return (1);
1372 }
1373 
1374 /*
1375  * Initialize all the per interface caches and lists; AH cache,
1376  * MCG list etc.
1377  */
1378 static int
1379 ibd_acache_init(ibd_state_t *state)
1380 {
1381 	ibd_ace_t *ce;
1382 	int i;
1383 
1384 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1385 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1386 	mutex_enter(&state->id_ac_mutex);
1387 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1388 	    offsetof(ibd_ace_t, ac_list));
1389 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1390 	    offsetof(ibd_ace_t, ac_list));
1391 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1392 	    state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1393 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1394 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1395 	    offsetof(ibd_mce_t, mc_list));
1396 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1397 	    offsetof(ibd_mce_t, mc_list));
1398 	state->id_ac_hot_ace = NULL;
1399 
1400 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1401 	    state->id_num_ah, KM_SLEEP);
1402 	for (i = 0; i < state->id_num_ah; i++, ce++) {
1403 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1404 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1405 			mutex_exit(&state->id_ac_mutex);
1406 			ibd_acache_fini(state);
1407 			return (DDI_FAILURE);
1408 		} else {
1409 			CLEAR_REFCYCLE(ce);
1410 			ce->ac_mce = NULL;
1411 			mutex_init(&ce->tx_too_big_mutex, NULL,
1412 			    MUTEX_DRIVER, NULL);
1413 			IBD_ACACHE_INSERT_FREE(state, ce);
1414 		}
1415 	}
1416 	mutex_exit(&state->id_ac_mutex);
1417 	return (DDI_SUCCESS);
1418 }
1419 
1420 static void
1421 ibd_acache_fini(ibd_state_t *state)
1422 {
1423 	ibd_ace_t *ptr;
1424 
1425 	mutex_enter(&state->id_ac_mutex);
1426 
1427 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1428 		ASSERT(GET_REF(ptr) == 0);
1429 		mutex_destroy(&ptr->tx_too_big_mutex);
1430 		(void) ibt_free_ud_dest(ptr->ac_dest);
1431 	}
1432 
1433 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1434 		ASSERT(GET_REF(ptr) == 0);
1435 		mutex_destroy(&ptr->tx_too_big_mutex);
1436 		(void) ibt_free_ud_dest(ptr->ac_dest);
1437 	}
1438 
1439 	list_destroy(&state->id_ah_free);
1440 	list_destroy(&state->id_ah_active);
1441 	list_destroy(&state->id_mc_full);
1442 	list_destroy(&state->id_mc_non);
1443 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1444 	mutex_exit(&state->id_ac_mutex);
1445 	mutex_destroy(&state->id_ac_mutex);
1446 	mutex_destroy(&state->id_mc_mutex);
1447 }
1448 
1449 /*
1450  * Search AH active hash list for a cached path to input destination.
1451  * If we are "just looking", hold == F. When we are in the Tx path,
1452  * we set hold == T to grab a reference on the AH so that it can not
1453  * be recycled to a new destination while the Tx request is posted.
1454  */
1455 ibd_ace_t *
1456 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1457 {
1458 	ibd_ace_t *ptr;
1459 
1460 	ASSERT(mutex_owned(&state->id_ac_mutex));
1461 
1462 	/*
1463 	 * Do hash search.
1464 	 */
1465 	if (mod_hash_find(state->id_ah_active_hash,
1466 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1467 		if (hold)
1468 			INC_REF(ptr, num);
1469 		return (ptr);
1470 	}
1471 	return (NULL);
1472 }
1473 
1474 /*
1475  * This is called by the tx side; if an initialized AH is found in
1476  * the active list, it is locked down and can be used; if no entry
1477  * is found, an async request is queued to do path resolution.
1478  */
1479 static ibd_ace_t *
1480 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1481 {
1482 	ibd_ace_t *ptr;
1483 	ibd_req_t *req;
1484 
1485 	/*
1486 	 * Only attempt to print when we can; in the mdt pattr case, the
1487 	 * address is not aligned properly.
1488 	 */
1489 	if (((ulong_t)mac & 3) == 0) {
1490 		DPRINT(4,
1491 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1492 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1493 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1494 		    htonl(mac->ipoib_gidsuff[1]));
1495 	}
1496 
1497 	mutex_enter(&state->id_ac_mutex);
1498 
1499 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1500 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1501 		INC_REF(ptr, numwqe);
1502 		mutex_exit(&state->id_ac_mutex);
1503 		return (ptr);
1504 	}
1505 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1506 		state->id_ac_hot_ace = ptr;
1507 		mutex_exit(&state->id_ac_mutex);
1508 		return (ptr);
1509 	}
1510 
1511 	/*
1512 	 * Implementation of a single outstanding async request; if
1513 	 * the operation is not started yet, queue a request and move
1514 	 * to ongoing state. Remember in id_ah_addr for which address
1515 	 * we are queueing the request, in case we need to flag an error;
1516 	 * any further requests, for the same or different address, until
1517 	 * the operation completes, are sent back to GLDv3 to be retried.
1518 	 * The async thread will update id_ah_op with an error indication
1519 	 * or will set it to indicate the next look up can start; either
1520 	 * way, it will mac_tx_update() so that all blocked requests come
1521 	 * back here.
1522 	 */
1523 	*err = EAGAIN;
1524 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1525 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1526 		if (req != NULL) {
1527 			/*
1528 			 * We did not even find the entry; queue a request
1529 			 * for it.
1530 			 */
1531 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1532 			state->id_ah_op = IBD_OP_ONGOING;
1533 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1534 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1535 		}
1536 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1537 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1538 		/*
1539 		 * Check the status of the pathrecord lookup request
1540 		 * we had queued before.
1541 		 */
1542 		if (state->id_ah_op == IBD_OP_ERRORED) {
1543 			*err = EFAULT;
1544 			state->id_ah_error++;
1545 		} else {
1546 			/*
1547 			 * IBD_OP_ROUTERED case: We need to send to the
1548 			 * all-router MCG. If we can find the AH for
1549 			 * the mcg, the Tx will be attempted. If we
1550 			 * do not find the AH, we return NORESOURCES
1551 			 * to retry.
1552 			 */
1553 			ipoib_mac_t routermac;
1554 
1555 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1556 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1557 			    numwqe);
1558 		}
1559 		state->id_ah_op = IBD_OP_NOTSTARTED;
1560 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1561 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1562 		/*
1563 		 * This case can happen when we get a higher band
1564 		 * packet. The easiest way is to reset the state machine
1565 		 * to accommodate the higher priority packet.
1566 		 */
1567 		state->id_ah_op = IBD_OP_NOTSTARTED;
1568 	}
1569 	mutex_exit(&state->id_ac_mutex);
1570 
1571 	return (ptr);
1572 }
1573 
1574 /*
1575  * Grab a not-currently-in-use AH/PathRecord from the active
1576  * list to recycle to a new destination. Only the async thread
1577  * executes this code.
1578  */
1579 static ibd_ace_t *
1580 ibd_acache_get_unref(ibd_state_t *state)
1581 {
1582 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1583 	boolean_t try_rc_chan_recycle = B_FALSE;
1584 
1585 	ASSERT(mutex_owned(&state->id_ac_mutex));
1586 
1587 	/*
1588 	 * Do plain linear search.
1589 	 */
1590 	while (ptr != NULL) {
1591 		/*
1592 		 * Note that it is possible that the "cycle" bit
1593 		 * is set on the AH w/o any reference count. The
1594 		 * mcg must have been deleted, and the tx cleanup
1595 		 * just decremented the reference count to 0, but
1596 		 * hasn't gotten around to grabbing the id_ac_mutex
1597 		 * to move the AH into the free list.
1598 		 */
1599 		if (GET_REF(ptr) == 0) {
1600 			if (ptr->ac_chan != NULL) {
1601 				ASSERT(state->id_enable_rc == B_TRUE);
1602 				if (!try_rc_chan_recycle) {
1603 					try_rc_chan_recycle = B_TRUE;
1604 					ibd_rc_signal_ace_recycle(state, ptr);
1605 				}
1606 			} else {
1607 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1608 				break;
1609 			}
1610 		}
1611 		ptr = list_prev(&state->id_ah_active, ptr);
1612 	}
1613 	return (ptr);
1614 }
1615 
1616 /*
1617  * Invoked to clean up AH from active list in case of multicast
1618  * disable and to handle sendonly memberships during mcg traps.
1619  * It is also used for port-up processing of multicast and unicast AHs.
1620  * Normally, the AH is taken off the active list, and put into
1621  * the free list to be recycled for a new destination. In case
1622  * Tx requests on the AH have not completed yet, the AH is marked
1623  * for reaping (which will put the AH on the free list) once the Tx's
1624  * complete; in this case, depending on the "force" input, we take
1625  * out the AH from the active list right now, or leave it also for
1626  * the reap operation. Returns TRUE if the AH is taken off the active
1627  * list (and either put into the free list right now, or arranged for
1628  * later), FALSE otherwise.
1629  */
1630 boolean_t
1631 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1632 {
1633 	ibd_ace_t *acactive;
1634 	boolean_t ret = B_TRUE;
1635 
1636 	ASSERT(mutex_owned(&state->id_ac_mutex));
1637 
1638 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1639 
1640 		/*
1641 		 * Note that the AH might already have the cycle bit set
1642 		 * on it; this might happen if sequences of multicast
1643 		 * enables and disables are coming so fast, that posted
1644 		 * Tx's to the mcg have not completed yet, and the cycle
1645 		 * bit is set successively by each multicast disable.
1646 		 */
1647 		if (SET_CYCLE_IF_REF(acactive)) {
1648 			if (!force) {
1649 				/*
1650 				 * The ace is kept on the active list, further
1651 				 * Tx's can still grab a reference on it; the
1652 				 * ace is reaped when all pending Tx's
1653 				 * referencing the AH complete.
1654 				 */
1655 				ret = B_FALSE;
1656 			} else {
1657 				/*
1658 				 * In the mcg trap case, we always pull the
1659 				 * AH from the active list. And also the port
1660 				 * up multi/unicast case.
1661 				 */
1662 				ASSERT(acactive->ac_chan == NULL);
1663 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1664 				acactive->ac_mce = NULL;
1665 			}
1666 		} else {
1667 			/*
1668 			 * Determined the ref count is 0, thus reclaim
1669 			 * immediately after pulling out the ace from
1670 			 * the active list.
1671 			 */
1672 			ASSERT(acactive->ac_chan == NULL);
1673 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1674 			acactive->ac_mce = NULL;
1675 			IBD_ACACHE_INSERT_FREE(state, acactive);
1676 		}
1677 
1678 	}
1679 	return (ret);
1680 }
1681 
1682 /*
1683  * Helper function for async path record lookup. If we are trying to
1684  * Tx to a MCG, check our membership, possibly trying to join the
1685  * group if required. If that fails, try to send the packet to the
1686  * all router group (indicated by the redirect output), pointing
1687  * the input mac address to the router mcg address.
1688  */
1689 static ibd_mce_t *
1690 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1691 {
1692 	ib_gid_t mgid;
1693 	ibd_mce_t *mce;
1694 	ipoib_mac_t routermac;
1695 
1696 	*redirect = B_FALSE;
1697 	ibd_n2h_gid(mac, &mgid);
1698 
1699 	/*
1700 	 * Check the FullMember+SendOnlyNonMember list.
1701 	 * Since we are the only one who manipulates the
1702 	 * id_mc_full list, no locks are needed.
1703 	 */
1704 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1705 	if (mce != NULL) {
1706 		DPRINT(4, "ibd_async_mcache : already joined to group");
1707 		return (mce);
1708 	}
1709 
1710 	/*
1711 	 * Not found; try to join(SendOnlyNonMember) and attach.
1712 	 */
1713 	DPRINT(4, "ibd_async_mcache : not joined to group");
1714 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1715 	    NULL) {
1716 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1717 		return (mce);
1718 	}
1719 
1720 	/*
1721 	 * MCGroup not present; try to join the all-router group. If
1722 	 * any of the following steps succeed, we will be redirecting
1723 	 * to the all router group.
1724 	 */
1725 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1726 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1727 		return (NULL);
1728 	*redirect = B_TRUE;
1729 	ibd_n2h_gid(&routermac, &mgid);
1730 	bcopy(&routermac, mac, IPOIB_ADDRL);
1731 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1732 	    mgid.gid_prefix, mgid.gid_guid);
1733 
1734 	/*
1735 	 * Are we already joined to the router group?
1736 	 */
1737 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1738 		DPRINT(4, "ibd_async_mcache : using already joined router"
1739 		    "group\n");
1740 		return (mce);
1741 	}
1742 
1743 	/*
1744 	 * Can we join(SendOnlyNonMember) the router group?
1745 	 */
1746 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1747 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1748 	    NULL) {
1749 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1750 		return (mce);
1751 	}
1752 
1753 	return (NULL);
1754 }
1755 
1756 /*
1757  * Async path record lookup code.
1758  */
1759 static void
1760 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1761 {
1762 	ibd_ace_t *ce;
1763 	ibd_mce_t *mce = NULL;
1764 	ibt_path_attr_t path_attr;
1765 	ibt_path_info_t path_info;
1766 	ib_gid_t destgid;
1767 	char ret = IBD_OP_NOTSTARTED;
1768 
1769 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1770 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1771 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1772 	    htonl(mac->ipoib_gidsuff[1]));
1773 
1774 	/*
1775 	 * Check whether we are trying to transmit to a MCG.
1776 	 * In that case, we need to make sure we are a member of
1777 	 * the MCG.
1778 	 */
1779 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1780 		boolean_t redirected;
1781 
1782 		/*
1783 		 * If we can not find or join the group or even
1784 		 * redirect, error out.
1785 		 */
1786 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1787 		    NULL) {
1788 			state->id_ah_op = IBD_OP_ERRORED;
1789 			return;
1790 		}
1791 
1792 		/*
1793 		 * If we got redirected, we need to determine whether
1794 		 * the AH for the new mcg is already in the cache, in which
1795 		 * case we need not pull it in; otherwise proceed to get the
1796 		 * path for the new mcg. There is no guarantee that
1797 		 * if the AH is currently in the cache, it will still be
1798 		 * there when we look in ibd_acache_lookup(), but that's
1799 		 * okay, we will come back here.
1800 		 */
1801 		if (redirected) {
1802 			ret = IBD_OP_ROUTERED;
1803 			DPRINT(4, "ibd_async_acache :  redirected to "
1804 			    "%08X:%08X:%08X:%08X:%08X",
1805 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1806 			    htonl(mac->ipoib_gidpref[1]),
1807 			    htonl(mac->ipoib_gidsuff[0]),
1808 			    htonl(mac->ipoib_gidsuff[1]));
1809 
1810 			mutex_enter(&state->id_ac_mutex);
1811 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1812 				state->id_ah_op = IBD_OP_ROUTERED;
1813 				mutex_exit(&state->id_ac_mutex);
1814 				DPRINT(4, "ibd_async_acache : router AH found");
1815 				return;
1816 			}
1817 			mutex_exit(&state->id_ac_mutex);
1818 		}
1819 	}
1820 
1821 	/*
1822 	 * Get an AH from the free list.
1823 	 */
1824 	mutex_enter(&state->id_ac_mutex);
1825 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1826 		/*
1827 		 * No free ones; try to grab an unreferenced active
1828 		 * one. Maybe we need to make the active list LRU,
1829 		 * but that will create more work for Tx callbacks.
1830 		 * Is there a way of not having to pull out the
1831 		 * entry from the active list, but just indicate it
1832 		 * is being recycled? Yes, but that creates one more
1833 		 * check in the fast lookup path.
1834 		 */
1835 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1836 			/*
1837 			 * Pretty serious shortage now.
1838 			 */
1839 			state->id_ah_op = IBD_OP_NOTSTARTED;
1840 			mutex_exit(&state->id_ac_mutex);
1841 			DPRINT(10, "ibd_async_acache : failed to find AH "
1842 			    "slot\n");
1843 			return;
1844 		}
1845 		/*
1846 		 * We could check whether ac_mce points to a SendOnly
1847 		 * member and drop that membership now. Or do it lazily
1848 		 * at detach time.
1849 		 */
1850 		ce->ac_mce = NULL;
1851 	}
1852 	mutex_exit(&state->id_ac_mutex);
1853 	ASSERT(ce->ac_mce == NULL);
1854 
1855 	/*
1856 	 * Update the entry.
1857 	 */
1858 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1859 
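	/*
	 * Query the SA for a path from our source GID to the destination
	 * GID on our pkey, and program the UD destination handle with the
	 * resulting address vector and the broadcast group's qkey.
	 */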
1860 	bzero(&path_info, sizeof (path_info));
1861 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1862 	path_attr.pa_sgid = state->id_sgid;
1863 	path_attr.pa_num_dgids = 1;
1864 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1865 	path_attr.pa_dgids = &destgid;
1866 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1867 	path_attr.pa_pkey = state->id_pkey;
1868 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1869 	    &path_info, NULL) != IBT_SUCCESS) {
1870 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1871 		goto error;
1872 	}
1873 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1874 	    ntohl(ce->ac_mac.ipoib_qpn),
1875 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1876 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1877 		goto error;
1878 	}
1879 
1880 	/*
1881 	 * mce is set whenever an AH is being associated with a
1882 	 * MCG; this will come in handy when we leave the MCG. The
1883 	 * lock protects Tx fastpath from scanning the active list.
1884 	 */
1885 	if (mce != NULL)
1886 		ce->ac_mce = mce;
1887 
1888 	/*
1889 	 * Initiate an RC mode connection for a unicast address.
1890 	 */
1891 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1892 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1893 		ASSERT(ce->ac_chan == NULL);
1894 		DPRINT(10, "ibd_async_acache: call "
1895 		    "ibd_rc_try_connect(ace=%p)", ce);
1896 		ibd_rc_try_connect(state, ce, &path_info);
1897 		if (ce->ac_chan == NULL) {
1898 			DPRINT(10, "ibd_async_acache: fail to setup RC"
1899 			    " channel");
1900 			state->rc_conn_fail++;
1901 			goto error;
1902 		}
1903 	}
1904 
1905 	mutex_enter(&state->id_ac_mutex);
1906 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1907 	state->id_ah_op = ret;
1908 	mutex_exit(&state->id_ac_mutex);
1909 	return;
1910 error:
1911 	/*
1912 	 * We might want to drop SendOnly membership here if we
1913 	 * joined above. The lock protects Tx callbacks inserting
1914 	 * into the free list.
1915 	 */
1916 	mutex_enter(&state->id_ac_mutex);
1917 	state->id_ah_op = IBD_OP_ERRORED;
1918 	IBD_ACACHE_INSERT_FREE(state, ce);
1919 	mutex_exit(&state->id_ac_mutex);
1920 }
1921 
1922 /*
1923  * While restoring port's presence on the subnet on a port up, it is possible
1924  * that the port goes down again.
1925  */
1926 static void
1927 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1928 {
1929 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1930 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1931 	    LINK_STATE_UP;
1932 	ibd_mce_t *mce, *pmce;
1933 	ibd_ace_t *ace, *pace;
1934 
1935 	DPRINT(10, "ibd_async_link(): %d", opcode);
1936 
1937 	/*
1938 	 * On a link up, revalidate the link speed/width. No point doing
1939 	 * this on a link down, since we will be unable to do SA operations,
1940 	 * defaulting to the lowest speed. Also notice that we update our
1941 	 * notion of speed before calling mac_link_update(), which will do
1942 	 * necessary higher level notifications for speed changes.
1943 	 */
1944 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1945 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1946 		state->id_link_speed = ibd_get_portspeed(state);
1947 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1948 	}
1949 
1950 	/*
1951 	 * Do all the work required to establish our presence on
1952 	 * the subnet.
1953 	 */
1954 	if (opcode == IBD_LINK_UP_ABSENT) {
1955 		/*
1956 		 * If in promiscuous mode ...
1957 		 */
1958 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1959 			/*
1960 			 * Drop all nonmembership.
1961 			 */
1962 			ibd_async_unsetprom(state);
1963 
1964 			/*
1965 			 * Then, try to regain nonmembership to all mcg's.
1966 			 */
1967 			ibd_async_setprom(state);
1968 
1969 		}
1970 
1971 		/*
1972 		 * Drop all sendonly membership (which also gets rid of the
1973 		 * AHs); try to reacquire all full membership.
1974 		 */
1975 		mce = list_head(&state->id_mc_full);
1976 		while ((pmce = mce) != NULL) {
1977 			mce = list_next(&state->id_mc_full, mce);
1978 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1979 				ibd_leave_group(state,
1980 				    pmce->mc_info.mc_adds_vect.av_dgid,
1981 				    IB_MC_JSTATE_SEND_ONLY_NON);
1982 			else
1983 				ibd_reacquire_group(state, pmce);
1984 		}
1985 
1986 		/*
1987 		 * Recycle all active AHs to free list (and if there are
1988 		 * pending posts, make sure they will go into the free list
1989 		 * once the Tx's complete). Grab the lock to prevent
1990 		 * concurrent Tx's as well as Tx cleanups.
1991 		 */
1992 		mutex_enter(&state->id_ac_mutex);
1993 		ace = list_head(&state->id_ah_active);
1994 		while ((pace = ace) != NULL) {
1995 			boolean_t cycled;
1996 
1997 			ace = list_next(&state->id_ah_active, ace);
1998 			mce = pace->ac_mce;
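			/*
			 * An RC address handle has a connected channel
			 * instead of an mce. If the channel is established,
			 * take a reference, pull the ace off the active
			 * list and signal the RC code to close the channel;
			 * otherwise another thread is already closing it.
			 */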
1999 			if (pace->ac_chan != NULL) {
2000 				ASSERT(mce == NULL);
2001 				ASSERT(state->id_enable_rc == B_TRUE);
2002 				if (pace->ac_chan->chan_state ==
2003 				    IBD_RC_STATE_ACT_ESTAB) {
2004 					INC_REF(pace, 1);
2005 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2006 					pace->ac_chan->chan_state =
2007 					    IBD_RC_STATE_ACT_CLOSING;
2008 					ibd_rc_signal_act_close(state, pace);
2009 				} else {
2010 					state->rc_act_close_simultaneous++;
2011 					DPRINT(40, "ibd_async_link: other "
2012 					    "thread is closing it, ace=%p, "
2013 					    "ac_chan=%p, chan_state=%d",
2014 					    pace, pace->ac_chan,
2015 					    pace->ac_chan->chan_state);
2016 				}
2017 			} else {
2018 				cycled = ibd_acache_recycle(state,
2019 				    &pace->ac_mac, B_TRUE);
2020 			}
2021 			/*
2022 			 * If this is for an mcg, it must be for a fullmember,
2023 			 * since we got rid of send-only members above when
2024 			 * processing the mce list.
2025 			 */
2026 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2027 			    IB_MC_JSTATE_FULL)));
2028 
2029 			/*
2030 			 * Check if the fullmember mce needs to be torn down,
2031 			 * ie whether the DLPI disable has already been done.
2032 			 * If so, do some of the work of tx_cleanup, namely
2033 			 * causing leave (which will fail), detach and
2034 			 * mce-freeing. tx_cleanup will put the AH into free
2035 			 * list. The reason to duplicate some of this
2036 			 * tx_cleanup work is because we want to delete the
2037 			 * AH right now instead of waiting for tx_cleanup, to
2038 			 * force subsequent Tx's to reacquire an AH.
2039 			 */
2040 			if ((mce != NULL) && (mce->mc_fullreap))
2041 				ibd_async_reap_group(state, mce,
2042 				    mce->mc_info.mc_adds_vect.av_dgid,
2043 				    mce->mc_jstate);
2044 		}
2045 		mutex_exit(&state->id_ac_mutex);
2046 	}
2047 
2048 	/*
2049 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2050 	 * (which stops further events from being delivered) before
2051 	 * mac_unregister(). At this point, it is guaranteed that mac_register
2052 	 * has already been done.
2053 	 */
2054 	mutex_enter(&state->id_link_mutex);
2055 	state->id_link_state = lstate;
2056 	mac_link_update(state->id_mh, lstate);
2057 	mutex_exit(&state->id_link_mutex);
2058 
2059 	ibd_async_done(state);
2060 }
2061 
2062 /*
2063  * Check the pkey table to see if we can find the pkey we're looking for.
2064  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2065  * failure.
2066  */
2067 static int
2068 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2069     uint16_t *pkix)
2070 {
2071 	uint16_t ndx;
2072 
2073 	ASSERT(pkix != NULL);
2074 
2075 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2076 		if (pkey_tbl[ndx] == pkey) {
2077 			*pkix = ndx;
2078 			return (0);
2079 		}
2080 	}
2081 	return (-1);
2082 }
2083 
2084 /*
2085  * Late HCA Initialization:
2086  * If plumb had succeeded without the availability of an active port or the
2087  * pkey, and either of their availability is now being indicated via PORT_UP
2088  * or PORT_CHANGE respectively, try a start of the interface.
2089  *
2090  * Normal Operation:
2091  * When the link is notified up, we need to do a few things, based
2092  * on whether the port's current p_init_type_reply claims a reinit has
2093  * been done or not. The reinit steps are:
2094  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2095  *    the old Pkey and GID0 are correct.
2096  * 2. Register for mcg traps (already done by ibmf).
2097  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2098  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2099  * 4. Give up all sendonly memberships.
2100  * 5. Acquire all full memberships.
2101  * 6. In promiscuous mode, acquire all non memberships.
2102  * 7. Recycle all AHs to free list.
2103  */
2104 static void
2105 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2106 {
2107 	ibt_hca_portinfo_t *port_infop = NULL;
2108 	ibt_status_t ibt_status;
2109 	uint_t psize, port_infosz;
2110 	ibd_link_op_t opcode;
2111 	ibd_req_t *req;
2112 	link_state_t new_link_state = LINK_STATE_UP;
2113 	uint8_t itreply;
2114 	uint16_t pkix;
2115 	int ret;
2116 
2117 	/*
2118 	 * Let's not race with a plumb or an unplumb; if we detect a
2119 	 * pkey relocation event later on here, we may have to restart.
2120 	 */
2121 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2122 
2123 	mutex_enter(&state->id_link_mutex);
2124 
2125 	/*
2126 	 * If the link state is unknown, a plumb has not yet been attempted
2127 	 * on the interface. Nothing to do.
2128 	 */
2129 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2130 		mutex_exit(&state->id_link_mutex);
2131 		goto link_mod_return;
2132 	}
2133 
2134 	/*
2135 	 * If the link state is down because of a plumb failure, and we are
2136 	 * neither in late HCA init nor successfully started, nothing to do.
2137 	 */
2138 	if ((state->id_link_state == LINK_STATE_DOWN) &&
2139 	    ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2140 	    ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2141 		mutex_exit(&state->id_link_mutex);
2142 		goto link_mod_return;
2143 	}
2144 
2145 	/*
2146 	 * If this routine was called in response to a port down event,
2147 	 * we just need to see whether the state change should be reported.
2148 	 */
2149 	if (code == IBT_ERROR_PORT_DOWN) {
2150 		new_link_state = LINK_STATE_DOWN;
2151 		goto update_link_state;
2152 	}
2153 
2154 	/*
2155 	 * If it's not a port down event we've received, try to get the port
2156 	 * attributes first. If we fail here, the port is as good as down.
2157 	 * Otherwise, if the link went down by the time the handler gets
2158 	 * here, give up - we cannot even validate the pkey/gid since those
2159 	 * are not valid and this is as bad as a port down anyway.
2160 	 */
2161 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2162 	    &port_infop, &psize, &port_infosz);
2163 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2164 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2165 		new_link_state = LINK_STATE_DOWN;
2166 		goto update_link_state;
2167 	}
2168 
2169 	/*
2170 	 * If in the previous attempt the pkey was not found, either due to the
2171 	 * port state being down or due to its absence in the pkey table,
2172 	 * look for it now and try to start the interface.
2173 	 */
2174 	if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2175 		mutex_exit(&state->id_link_mutex);
2176 		if ((ret = ibd_start(state)) != 0) {
2177 			DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2178 			    "init, ret=%d", ret);
2179 		}
2180 		ibt_free_portinfo(port_infop, port_infosz);
2181 		goto link_mod_return;
2182 	}
2183 
2184 	/*
2185 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2186 	 * PreserveContentReply are 0, we don't know anything about the
2187 	 * data loaded into the port attributes, so we need to verify
2188 	 * if gid0 and pkey are still valid.
2189 	 */
2190 	itreply = port_infop->p_init_type_reply;
2191 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2192 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2193 		/*
2194 		 * Check to see if the subnet part of GID0 has changed. If
2195 		 * not, check the simple case first to see if the pkey
2196 		 * index is the same as before; finally check to see if the
2197 		 * pkey has been relocated to a different index in the table.
2198 		 */
2199 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2200 		if (bcmp(port_infop->p_sgid_tbl,
2201 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2202 
2203 			new_link_state = LINK_STATE_DOWN;
2204 
2205 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2206 		    state->id_pkey) {
2207 
2208 			new_link_state = LINK_STATE_UP;
2209 
2210 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2211 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2212 
2213 			ibt_free_portinfo(port_infop, port_infosz);
2214 			mutex_exit(&state->id_link_mutex);
2215 
2216 			/*
2217 			 * Currently a restart is required if our pkey has moved
2218 			 * in the pkey table. If we get the ibt_recycle_ud() to
2219 			 * work as documented (expected), we may be able to
2220 			 * avoid a complete restart.  Note that we've already
2221 			 * marked both the start and stop 'in-progress' flags,
2222 			 * so it is ok to go ahead and do this restart.
2223 			 */
2224 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2225 			if ((ret = ibd_start(state)) != 0) {
2226 				DPRINT(10, "ibd_restart: cannot restart, "
2227 				    "ret=%d", ret);
2228 			}
2229 
2230 			goto link_mod_return;
2231 		} else {
2232 			new_link_state = LINK_STATE_DOWN;
2233 		}
2234 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2235 	}
2236 
2237 update_link_state:
2238 	if (port_infop) {
2239 		ibt_free_portinfo(port_infop, port_infosz);
2240 	}
2241 
2242 	/*
2243 	 * If we're reporting a link up, check InitTypeReply to see if
2244 	 * the SM has ensured that the port's presence in mcg, traps,
2245 	 * etc. is intact.
2246 	 */
2247 	if (new_link_state == LINK_STATE_DOWN) {
2248 		opcode = IBD_LINK_DOWN;
2249 	} else {
2250 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2251 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2252 			opcode = IBD_LINK_UP;
2253 		} else {
2254 			opcode = IBD_LINK_UP_ABSENT;
2255 		}
2256 	}
2257 
2258 	/*
2259 	 * If the old state is the same as the new state, and the SM indicated
2260 	 * no change in the port parameters, nothing to do.
2261 	 */
2262 	if ((state->id_link_state == new_link_state) && (opcode !=
2263 	    IBD_LINK_UP_ABSENT)) {
2264 		mutex_exit(&state->id_link_mutex);
2265 		goto link_mod_return;
2266 	}
2267 
2268 	/*
2269 	 * Ok, so there was a link state change; see if it's safe to ask
2270 	 * the async thread to do the work
2271 	 */
2272 	if (!ibd_async_safe(state)) {
2273 		state->id_link_state = new_link_state;
2274 		mutex_exit(&state->id_link_mutex);
2275 		goto link_mod_return;
2276 	}
2277 
2278 	mutex_exit(&state->id_link_mutex);
2279 
2280 	/*
2281 	 * Queue up a request for ibd_async_link() to handle this link
2282 	 * state change event
2283 	 */
2284 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2285 	req->rq_ptr = (void *)opcode;
2286 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2287 
2288 link_mod_return:
2289 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2290 }
2291 
2292 /*
2293  * For the port up/down events, IBTL guarantees there will not be concurrent
2294  * invocations of the handler. IBTL might coalesce link transition events,
2295  * and not invoke the handler for _each_ up/down transition, but it will
2296  * invoke the handler with the last known state.
2297  */
2298 static void
2299 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2300     ibt_async_code_t code, ibt_async_event_t *event)
2301 {
2302 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2303 
2304 	switch (code) {
2305 	case IBT_ERROR_CATASTROPHIC_CHAN:
2306 		ibd_print_warn(state, "catastrophic channel error");
2307 		break;
2308 	case IBT_ERROR_CQ:
2309 		ibd_print_warn(state, "completion queue error");
2310 		break;
2311 	case IBT_PORT_CHANGE_EVENT:
2312 		/*
2313 		 * Events will be delivered to all instances that have
2314 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2315 		 * Only need to do work for our port; IBTF will deliver
2316 		 * events for other ports on the hca we have ibt_open_hca'ed
2317 		 * too. Note that id_port is initialized in ibd_attach()
2318 		 * before ibt_open_hca() is called there.
2319 		 */
2320 		ASSERT(state->id_hca_hdl == hca_hdl);
2321 		if (state->id_port != event->ev_port)
2322 			break;
2323 
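		/*
		 * Only a pkey table change is of interest here; it may mean
		 * our pkey has been relocated or has just become available
		 * (late HCA init), so let ibd_link_mod() revalidate it.
		 */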
2324 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2325 		    IBT_PORT_CHANGE_PKEY) {
2326 			ibd_link_mod(state, code);
2327 		}
2328 		break;
2329 	case IBT_ERROR_PORT_DOWN:
2330 	case IBT_CLNT_REREG_EVENT:
2331 	case IBT_EVENT_PORT_UP:
2332 		/*
2333 		 * Events will be delivered to all instances that have
2334 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2335 		 * Only need to do work for our port; IBTF will deliver
2336 		 * events for other ports on the hca we have ibt_open_hca'ed
2337 		 * too. Note that id_port is initialized in ibd_attach()
2338 		 * before ibt_open_hca() is called there.
2339 		 */
2340 		ASSERT(state->id_hca_hdl == hca_hdl);
2341 		if (state->id_port != event->ev_port)
2342 			break;
2343 
2344 		ibd_link_mod(state, code);
2345 		break;
2346 
2347 	case IBT_HCA_ATTACH_EVENT:
2348 	case IBT_HCA_DETACH_EVENT:
2349 		/*
2350 		 * When a new card is plugged into the system, attach_event is
2351 		 * invoked. Additionally, a cfgadm needs to be run to make the
2352 		 * card known to the system, and an ifconfig needs to be run to
2353 		 * plumb up any ibd interfaces on the card. In the case of card
2354 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2355 		 * unplumb the ibd interfaces on the card; when the card is
2356 		 * actually unplugged, the detach_event is invoked;
2357 		 * additionally, if any ibd instances are still active on the
2358 		 * card (eg there were no associated RCM scripts), the
2359 		 * driver's detach routine is invoked.
2360 		 */
2361 		break;
2362 	default:
2363 		break;
2364 	}
2365 }
2366 
2367 static int
2368 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2369 {
2370 	mac_register_t *macp;
2371 	int ret;
2372 
2373 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2374 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2375 		return (DDI_FAILURE);
2376 	}
2377 
2378 	/*
2379 	 * Note that when we register with mac during attach, we don't
2380 	 * have the id_macaddr yet, so we'll simply be registering a
2381 	 * zero macaddr that we'll overwrite later during plumb (in
2382 	 * ibd_m_start()). The same is true of id_mtu - we'll
2383 	 * update the mac layer with the correct mtu during plumb.
2384 	 */
2385 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2386 	macp->m_driver = state;
2387 	macp->m_dip = dip;
2388 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2389 	macp->m_callbacks = &ibd_m_callbacks;
2390 	macp->m_min_sdu = 0;
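	/*
	 * The maximum SDU depends on how the instance will be used: the
	 * port driver advertises the RC maximum, an RC-enabled partition
	 * uses its RC MTU less the IPoIB header, and everything else uses
	 * the default UD maximum.
	 */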
2391 	if (state->id_type == IBD_PORT_DRIVER) {
2392 		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2393 	} else if (state->id_enable_rc) {
2394 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2395 	} else {
2396 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2397 	}
2398 	macp->m_priv_props = ibd_priv_props;
2399 
2400 	/*
2401 	 *  Register ourselves with the GLDv3 interface
2402 	 */
2403 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2404 		mac_free(macp);
2405 		DPRINT(10,
2406 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2407 		return (DDI_FAILURE);
2408 	}
2409 
2410 	mac_free(macp);
2411 	return (DDI_SUCCESS);
2412 }
2413 
2414 static int
2415 ibd_record_capab(ibd_state_t *state)
2416 {
2417 	ibt_hca_attr_t hca_attrs;
2418 	ibt_status_t ibt_status;
2419 
2420 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2421 
2422 	/*
2423 	 * Query the HCA and fetch its attributes
2424 	 */
2425 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2426 	ASSERT(ibt_status == IBT_SUCCESS);
2427 
2428 	/*
2429 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2430 	 *    full checksum offload.
2431 	 */
2432 	if (state->id_enable_rc) {
2433 		state->id_hwcksum_capab = 0;
2434 	} else {
2435 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2436 		    == IBT_HCA_CKSUM_FULL) {
2437 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2438 		}
2439 	}
2440 
2441 	/*
2442 	 * 2. Set LSO policy, capability and maximum length
2443 	 */
2444 	if (state->id_enable_rc) {
2445 		state->id_lso_capable = B_FALSE;
2446 		state->id_lso_maxlen = 0;
2447 	} else {
2448 		if (hca_attrs.hca_max_lso_size > 0) {
2449 			state->id_lso_capable = B_TRUE;
2450 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2451 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2452 			else
2453 				state->id_lso_maxlen =
2454 				    hca_attrs.hca_max_lso_size;
2455 		} else {
2456 			state->id_lso_capable = B_FALSE;
2457 			state->id_lso_maxlen = 0;
2458 		}
2459 	}
2460 
2461 	/*
2462 	 * 3. Set Reserved L_Key capability
2463 	 */
2464 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2465 		state->id_hca_res_lkey_capab = 1;
2466 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2467 		state->rc_enable_iov_map = B_TRUE;
2468 	} else {
2469 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2470 		state->rc_enable_iov_map = B_FALSE;
2471 	}
2472 
2473 	/*
2474 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2475 	 *    size information is provided by the hca
2476 	 */
2477 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2478 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2479 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2480 	} else {
2481 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2482 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2483 	}
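	/*
	 * Clamp both the UD and RC SGL sizes to the driver maximum, and
	 * warn if the HCA supports fewer segments than the default.
	 */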
2484 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2485 		state->id_max_sqseg = IBD_MAX_SQSEG;
2486 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2487 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2488 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2489 	}
2490 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2491 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2492 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2493 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2494 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2495 	}
2496 
2497 	/*
2498 	 * Translating the virtual address regions into physical regions
2499 	 * for using the Reserved LKey feature results in a wr sgl that
2500 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2501 	 * we'll fix a high-water mark (65%) for when we should stop.
2502 	 */
2503 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2504 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2505 
2506 	/*
2507 	 * 5. Set number of recv and send wqes after checking hca maximum
2508 	 *    channel size. Store the max channel size in the state so that it
2509 	 *    can be referred to when the swqe/rwqe change is requested via
2510 	 *    dladm.
2511 	 */
2512 
2513 	state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2514 
2515 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2516 		state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2517 
2518 	state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2519 	    IBD_RWQE_MIN;
2520 
2521 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2522 		state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2523 
2524 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2525 
2526 	return (DDI_SUCCESS);
2527 }
2528 
2529 static int
2530 ibd_part_busy(ibd_state_t *state)
2531 {
2532 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2533 		DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2534 		return (DDI_FAILURE);
2535 	}
2536 
2537 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2538 		DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2539 		return (DDI_FAILURE);
2540 	}
2541 
2542 	/*
2543 	 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2544 	 * connecting to a remote IPoIB port. We can't remove this port.
2545 	 */
2546 	if (state->id_ah_op == IBD_OP_ONGOING) {
2547 		DPRINT(10, "ibd_part_busy: failed: connecting\n");
2548 		return (DDI_FAILURE);
2549 	}
2550 
2551 	return (DDI_SUCCESS);
2552 }
2553 
2554 
2555 static void
2556 ibd_part_unattach(ibd_state_t *state)
2557 {
2558 	uint32_t progress = state->id_mac_state;
2559 	ibt_status_t ret;
2560 
2561 	/* make sure rx resources are freed */
2562 	ibd_free_rx_rsrcs(state);
2563 
2564 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2565 		ASSERT(state->id_enable_rc);
2566 		ibd_rc_fini_srq_list(state);
2567 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2568 	}
2569 
2570 	if (progress & IBD_DRV_MAC_REGISTERED) {
2571 		(void) mac_unregister(state->id_mh);
2572 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2573 	}
2574 
2575 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2576 		/*
2577 		 * No new async requests will be posted since the device
2578 		 * link state has been marked as unknown; completion handlers
2579 		 * have been turned off, so Tx handler will not cause any
2580 		 * more IBD_ASYNC_REAP requests.
2581 		 *
2582 		 * Queue a request for the async thread to exit, which will
2583 		 * be serviced after any pending ones. This can take a while,
2584 		 * specially if the SM is unreachable, since IBMF will slowly
2585 		 * timeout each SM request issued by the async thread.  Reap
2586 		 * the thread before continuing on, we do not want it to be
2587 		 * lingering in modunloaded code.
2588 		 */
2589 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2590 		thread_join(state->id_async_thrid);
2591 
2592 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2593 	}
2594 
2595 	if (progress & IBD_DRV_REQ_LIST_INITED) {
2596 		list_destroy(&state->id_req_list);
2597 		mutex_destroy(&state->id_acache_req_lock);
2598 		cv_destroy(&state->id_acache_req_cv);
2599 		state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2600 	}
2601 
2602 	if (progress & IBD_DRV_PD_ALLOCD) {
2603 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2604 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2605 			ibd_print_warn(state, "failed to free "
2606 			    "protection domain, ret=%d", ret);
2607 		}
2608 		state->id_pd_hdl = NULL;
2609 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2610 	}
2611 
2612 	if (progress & IBD_DRV_HCA_OPENED) {
2613 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2614 		    IBT_SUCCESS) {
2615 			ibd_print_warn(state, "failed to close "
2616 			    "HCA device, ret=%d", ret);
2617 		}
2618 		state->id_hca_hdl = NULL;
2619 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2620 	}
2621 
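	/*
	 * Detach this instance from IBTL; if this was the last instance,
	 * also release the driver-wide IBTL handle held in ibd_gstate.
	 */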
2622 	mutex_enter(&ibd_gstate.ig_mutex);
2623 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2624 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2625 		    IBT_SUCCESS) {
2626 			ibd_print_warn(state,
2627 			    "ibt_detach() failed, ret=%d", ret);
2628 		}
2629 		state->id_ibt_hdl = NULL;
2630 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2631 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2632 	}
2633 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2634 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2635 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2636 		    IBT_SUCCESS) {
2637 			ibd_print_warn(state, "ibt_detach(): global "
2638 			    "failed, ret=%d", ret);
2639 		}
2640 		ibd_gstate.ig_ibt_hdl = NULL;
2641 	}
2642 	mutex_exit(&ibd_gstate.ig_mutex);
2643 
2644 	if (progress & IBD_DRV_TXINTR_ADDED) {
2645 		ddi_remove_softintr(state->id_tx);
2646 		state->id_tx = NULL;
2647 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2648 	}
2649 
2650 	if (progress & IBD_DRV_RXINTR_ADDED) {
2651 		ddi_remove_softintr(state->id_rx);
2652 		state->id_rx = NULL;
2653 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2654 	}
2655 
2656 #ifdef DEBUG
2657 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2658 		kstat_delete(state->rc_ksp);
2659 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2660 	}
2661 #endif
2662 
2663 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2664 		ibd_state_fini(state);
2665 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2666 	}
2667 }
2668 
2669 int
2670 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2671 {
2672 	ibt_status_t ret;
2673 	int rv;
2674 	kthread_t *kht;
2675 
2676 	/*
2677 	 * Initialize mutexes and condition variables
2678 	 */
2679 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2680 		DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2681 		return (DDI_FAILURE);
2682 	}
2683 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2684 
2685 	/*
2686 	 * Allocate rx,tx softintr
2687 	 */
2688 	if (ibd_rx_softintr == 1) {
2689 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2690 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2691 			DPRINT(10, "ibd_part_attach: failed in "
2692 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2693 			return (DDI_FAILURE);
2694 		}
2695 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2696 	}
2697 	if (ibd_tx_softintr == 1) {
2698 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2699 		    NULL, NULL, ibd_tx_recycle,
2700 		    (caddr_t)state)) != DDI_SUCCESS) {
2701 			DPRINT(10, "ibd_part_attach: failed in "
2702 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2703 			return (DDI_FAILURE);
2704 		}
2705 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2706 	}
2707 
2708 	/*
2709 	 * Attach to IBTL
2710 	 */
2711 	mutex_enter(&ibd_gstate.ig_mutex);
2712 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2713 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2714 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2715 			DPRINT(10, "ibd_part_attach: global: failed in "
2716 			    "ibt_attach(), ret=%d", ret);
2717 			mutex_exit(&ibd_gstate.ig_mutex);
2718 			return (DDI_FAILURE);
2719 		}
2720 	}
2721 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2722 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2723 		DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2724 		    ret);
2725 		mutex_exit(&ibd_gstate.ig_mutex);
2726 		return (DDI_FAILURE);
2727 	}
2728 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2729 	mutex_exit(&ibd_gstate.ig_mutex);
2730 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2731 
2732 	/*
2733 	 * Open the HCA
2734 	 */
2735 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2736 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2737 		DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2738 		    ret);
2739 		return (DDI_FAILURE);
2740 	}
2741 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2742 
2743 #ifdef DEBUG
2744 	/* Initialize Driver Counters for Reliable Connected Mode */
2745 	if (state->id_enable_rc) {
2746 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2747 			DPRINT(10, "ibd_part_attach: failed in "
2748 			    "ibd_rc_init_stats");
2749 			return (DDI_FAILURE);
2750 		}
2751 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2752 	}
2753 #endif
2754 
2755 	/*
2756 	 * Record capabilities
2757 	 */
2758 	(void) ibd_record_capab(state);
2759 
2760 	/*
2761 	 * Allocate a protection domain on the HCA
2762 	 */
2763 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2764 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2765 		DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2766 		    ret);
2767 		return (DDI_FAILURE);
2768 	}
2769 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2770 
2771 
2772 	/*
2773 	 * We need to initialize the req_list that is required for the
2774 	 * operation of the async_thread.
2775 	 */
2776 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2777 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2778 	list_create(&state->id_req_list, sizeof (ibd_req_t),
2779 	    offsetof(ibd_req_t, rq_list));
2780 	state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2781 
2782 	/*
2783 	 * Create the async thread; thread_create never fails.
2784 	 */
2785 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2786 	    TS_RUN, minclsyspri);
2787 	state->id_async_thrid = kht->t_did;
2788 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2789 
2790 	return (DDI_SUCCESS);
2791 }
2792 
2793 /*
2794  * Attach device to the IO framework.
2795  */
2796 static int
2797 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2798 {
2799 	int ret;
2800 
2801 	switch (cmd) {
2802 		case DDI_ATTACH:
2803 			ret = ibd_port_attach(dip);
2804 			break;
2805 		default:
2806 			ret = DDI_FAILURE;
2807 			break;
2808 	}
2809 	return (ret);
2810 }
2811 
2812 /*
2813  * Detach device from the IO framework.
2814  */
2815 static int
2816 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2817 {
2818 	ibd_state_t *state;
2819 	int instance;
2820 
2821 	/*
2822 	 * IBD doesn't support suspend/resume
2823 	 */
2824 	if (cmd != DDI_DETACH)
2825 		return (DDI_FAILURE);
2826 
2827 	/*
2828 	 * Get the instance softstate
2829 	 */
2830 	instance = ddi_get_instance(dip);
2831 	state = ddi_get_soft_state(ibd_list, instance);
2832 
2833 	/*
2834 	 * Release all resources we're still holding.  Note that if we'd
2835 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2836 	 * so far, we should find all the flags we need in id_mac_state.
2837 	 */
2838 	return (ibd_port_unattach(state, dip));
2839 }
2840 
2841 /*
2842  * Pre ibt_attach() driver initialization
2843  */
2844 static int
2845 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2846 {
2847 	char buf[64];
2848 
2849 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2850 	state->id_link_state = LINK_STATE_UNKNOWN;
2851 
2852 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2853 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2854 	state->id_trap_stop = B_TRUE;
2855 	state->id_trap_inprog = 0;
2856 
2857 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2858 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2859 	state->id_dip = dip;
2860 
2861 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2862 
2863 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2864 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2865 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2866 	state->id_tx_busy = 0;
2867 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2868 
2869 	state->id_rx_list.dl_bufs_outstanding = 0;
2870 	state->id_rx_list.dl_cnt = 0;
2871 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2872 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
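	/*
	 * Create the kmem cache used for async work requests; the cache
	 * name embeds the instance number and pkey so that each instance
	 * gets its own cache.
	 */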
2873 	(void) sprintf(buf, "ibd_req%d_%x", ddi_get_instance(dip),
2874 	    state->id_pkey);
2875 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2876 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2877 
2878 	/* For Reliable Connected Mode */
2879 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2880 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2881 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2882 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2883 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2884 	    MUTEX_DRIVER, NULL);
2885 	mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2886 
2887 	/*
2888 	 * Make RC the default link mode. If this fails during connection
2889 	 * setup, the link mode is automatically transitioned to UD.
2890 	 * Also set the RC MTU.
2891 	 */
2892 	state->id_enable_rc = IBD_DEF_LINK_MODE;
2893 	state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2894 	state->id_mtu = IBD_DEF_MAX_MTU;
2895 
2896 	/* Initialize all tunables to their defaults */
2897 	state->id_lso_policy = IBD_DEF_LSO_POLICY;
2898 	state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2899 	state->id_num_ah = IBD_DEF_NUM_AH;
2900 	state->id_hash_size = IBD_DEF_HASH_SIZE;
2901 	state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2902 	state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2903 	state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2904 	state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2905 	state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2906 	state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2907 	state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2908 	state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2909 	state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2910 	state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2911 	state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2912 	state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2913 	state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2914 	state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2915 	state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2916 	state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2917 	state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2918 	state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2919 	state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2920 	state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2921 
2922 	return (DDI_SUCCESS);
2923 }
2924 
2925 /*
2926  * Post ibt_detach() driver deconstruction
2927  */
2928 static void
2929 ibd_state_fini(ibd_state_t *state)
2930 {
2931 	kmem_cache_destroy(state->id_req_kmc);
2932 
2933 	mutex_destroy(&state->id_rx_list.dl_mutex);
2934 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2935 
2936 	mutex_destroy(&state->id_txpost_lock);
2937 	mutex_destroy(&state->id_tx_list.dl_mutex);
2938 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2939 	mutex_destroy(&state->id_lso_lock);
2940 
2941 	mutex_destroy(&state->id_sched_lock);
2942 	mutex_destroy(&state->id_scq_poll_lock);
2943 	mutex_destroy(&state->id_rcq_poll_lock);
2944 
2945 	cv_destroy(&state->id_trap_cv);
2946 	mutex_destroy(&state->id_trap_lock);
2947 	mutex_destroy(&state->id_link_mutex);
2948 
2949 	/* For Reliable Connected Mode */
2950 	mutex_destroy(&state->rc_timeout_lock);
2951 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2952 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2953 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2954 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2955 	mutex_destroy(&state->rc_rx_lock);
2956 }
2957 
2958 /*
2959  * Fetch link speed from SA for snmp ifspeed reporting.
2960  */
2961 static uint64_t
2962 ibd_get_portspeed(ibd_state_t *state)
2963 {
2964 	int			ret;
2965 	ibt_path_info_t		path;
2966 	ibt_path_attr_t		path_attr;
2967 	uint8_t			num_paths;
2968 	uint64_t		ifspeed;
2969 
2970 	/*
2971 	 * Due to serdes 8b/10b encoding, the 2.5 Gbps signaling rate on the
2972 	 * wire translates to a 2 Gbps data rate. Thus, 1X single data rate
2973 	 * is 2000000000. Start with that as the default.
2974 	 */
2975 	ifspeed = 2000000000;
2976 
2977 	bzero(&path_attr, sizeof (path_attr));
2978 
2979 	/*
2980 	 * Get the port speed from Loopback path information.
2981 	 */
2982 	path_attr.pa_dgids = &state->id_sgid;
2983 	path_attr.pa_num_dgids = 1;
2984 	path_attr.pa_sgid = state->id_sgid;
2985 
2986 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2987 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2988 		goto earlydone;
2989 
2990 	if (num_paths < 1)
2991 		goto earlydone;
2992 
2993 	/*
2994 	 * In case SA does not return an expected value, report the default
2995 	 * speed as 1X.
2996 	 */
2997 	ret = 1;
2998 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2999 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
3000 			ret = 1;
3001 			break;
3002 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
3003 			ret = 4;
3004 			break;
3005 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
3006 			ret = 12;
3007 			break;
3008 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
3009 			ret = 2;
3010 			break;
3011 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
3012 			ret = 8;
3013 			break;
3014 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
3015 			ret = 16;
3016 			break;
3017 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
3018 			ret = 24;
3019 			break;
3020 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
3021 			ret = 32;
3022 			break;
3023 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
3024 			ret = 48;
3025 			break;
3026 	}
3027 
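	/* Scale the 1X SDR baseline by the link width/speed multiplier. */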
3028 	ifspeed *= ret;
3029 
3030 earlydone:
3031 	return (ifspeed);
3032 }
3033 
3034 /*
3035  * Search input mcg list (id_mc_full or id_mc_non) for an entry
3036  * representing the input mcg mgid.
3037  */
3038 static ibd_mce_t *
3039 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3040 {
3041 	ibd_mce_t *ptr = list_head(mlist);
3042 
3043 	/*
3044 	 * Do plain linear search.
3045 	 */
3046 	while (ptr != NULL) {
3047 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3048 		    sizeof (ib_gid_t)) == 0)
3049 			return (ptr);
3050 		ptr = list_next(mlist, ptr);
3051 	}
3052 	return (NULL);
3053 }
3054 
3055 /*
3056  * Execute IBA JOIN.
3057  */
3058 static ibt_status_t
3059 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3060 {
3061 	ibt_mcg_attr_t mcg_attr;
3062 
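	/*
	 * Build the MCG attributes from the broadcast group's parameters
	 * (qkey, flow label, SL, traffic class) plus our pkey, scope and
	 * the requested join state, then issue the SA join.
	 */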
3063 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3064 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3065 	mcg_attr.mc_mgid = mgid;
3066 	mcg_attr.mc_join_state = mce->mc_jstate;
3067 	mcg_attr.mc_scope = state->id_scope;
3068 	mcg_attr.mc_pkey = state->id_pkey;
3069 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3070 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3071 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3072 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3073 	    NULL, NULL));
3074 }
3075 
3076 /*
3077  * This code JOINs the port in the proper way (depending on the join
3078  * state) so that IBA fabric will forward mcg packets to/from the port.
3079  * It also attaches the QPN to the mcg so it can receive those mcg
3080  * packets. This code makes sure not to attach the mcg to the QP if
3081  * that has been previously done due to the mcg being joined with a
3082  * different join state, even though this is not required by SWG_0216,
3083  * refid 3610.
3084  */
3085 static ibd_mce_t *
3086 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3087 {
3088 	ibt_status_t ibt_status;
3089 	ibd_mce_t *mce, *tmce, *omce = NULL;
3090 	boolean_t do_attach = B_TRUE;
3091 
3092 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3093 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3094 
3095 	/*
3096 	 * For enable_multicast Full member joins, we need to do some
3097 	 * extra work. If there is already an mce on the list that
3098 	 * indicates full membership, that means the membership has
3099 	 * not yet been dropped (since the disable_multicast was issued)
3100 	 * because there are pending Tx's to the mcg; in that case, just
3101 	 * mark the mce not to be reaped when the Tx completion queues
3102 	 * an async reap operation.
3103 	 *
3104 	 * If there is already an mce on the list indicating sendonly
3105 	 * membership, try to promote to full membership. Be careful
3106 	 * not to deallocate the old mce, since there might be an AH
3107 	 * pointing to it; instead, update the old mce with new data
3108 	 * that tracks the full membership.
3109 	 */
3110 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3111 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3112 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3113 			ASSERT(omce->mc_fullreap);
3114 			omce->mc_fullreap = B_FALSE;
3115 			return (omce);
3116 		} else {
3117 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3118 		}
3119 	}
3120 
3121 	/*
3122 	 * Allocate the ibd_mce_t to track this JOIN.
3123 	 */
3124 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3125 	mce->mc_fullreap = B_FALSE;
3126 	mce->mc_jstate = jstate;
3127 
3128 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3129 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3130 		    ibt_status);
3131 		kmem_free(mce, sizeof (ibd_mce_t));
3132 		return (NULL);
3133 	}
3134 
3135 	/*
3136 	 * Is an IBA attach required? Not if the interface is already joined
3137 	 * to the mcg in a different appropriate join state.
3138 	 */
3139 	if (jstate == IB_MC_JSTATE_NON) {
3140 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3141 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3142 			do_attach = B_FALSE;
3143 	} else if (jstate == IB_MC_JSTATE_FULL) {
3144 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3145 			do_attach = B_FALSE;
3146 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3147 		do_attach = B_FALSE;
3148 	}
3149 
3150 	if (do_attach) {
3151 		/*
3152 		 * Do the IBA attach.
3153 		 */
3154 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3155 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3156 		    &mce->mc_info)) != IBT_SUCCESS) {
3157 			DPRINT(10, "ibd_join_group : failed qp attachment "
3158 			    "%d\n", ibt_status);
3159 			/*
3160 			 * NOTE that we should probably preserve the join info
3161 			 * in the list and later try to leave again at detach
3162 			 * time.
3163 			 */
3164 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3165 			    state->id_sgid, jstate);
3166 			kmem_free(mce, sizeof (ibd_mce_t));
3167 			return (NULL);
3168 		}
3169 	}
3170 
3171 	/*
3172 	 * Insert the ibd_mce_t in the proper list.
3173 	 */
3174 	if (jstate == IB_MC_JSTATE_NON) {
3175 		IBD_MCACHE_INSERT_NON(state, mce);
3176 	} else {
3177 		/*
3178 		 * Set up the mc_req fields used for reaping the
3179 		 * mcg in case of delayed tx completion (see
3180 		 * ibd_tx_cleanup()). Also done for sendonly join in
3181 		 * case we are promoted to fullmembership later and
3182 		 * keep using the same mce.
3183 		 */
3184 		mce->mc_req.rq_gid = mgid;
3185 		mce->mc_req.rq_ptr = mce;
3186 		/*
3187 		 * Check whether this is the case of trying to join
3188 		 * full member, and we were already joined send only.
3189 		 * We try to drop our SendOnly membership, but it is
3190 		 * possible that the mcg does not exist anymore (and
3191 		 * the subnet trap never reached us), so the leave
3192 		 * operation might fail.
3193 		 */
3194 		if (omce != NULL) {
3195 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3196 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3197 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3198 			bcopy(&mce->mc_info, &omce->mc_info,
3199 			    sizeof (ibt_mcg_info_t));
3200 			kmem_free(mce, sizeof (ibd_mce_t));
3201 			return (omce);
3202 		}
3203 		mutex_enter(&state->id_mc_mutex);
3204 		IBD_MCACHE_INSERT_FULL(state, mce);
3205 		mutex_exit(&state->id_mc_mutex);
3206 	}
3207 
3208 	return (mce);
3209 }
3210 
3211 /*
3212  * Called during port up event handling to attempt to reacquire full
3213  * membership to an mcg. Stripped down version of ibd_join_group().
3214  * Note that it is possible that the mcg might have gone away, and
3215  * gets recreated at this point.
3216  */
3217 static void
3218 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3219 {
3220 	ib_gid_t mgid;
3221 
3222 	/*
3223 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3224 	 * reap/leave is going to try to leave the group. We could prevent
3225 	 * that by adding a boolean flag into ibd_mce_t, if required.
3226 	 */
3227 	if (mce->mc_fullreap)
3228 		return;
3229 
3230 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3231 
3232 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3233 	    mgid.gid_guid);
3234 
3235 	/* While reacquiring, leave and then join the MCG */
3236 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3237 	    mce->mc_jstate);
3238 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3239 		ibd_print_warn(state, "Failure on port up to rejoin "
3240 		    "multicast gid %016llx:%016llx",
3241 		    (u_longlong_t)mgid.gid_prefix,
3242 		    (u_longlong_t)mgid.gid_guid);
3243 }
3244 
3245 /*
3246  * This code handles delayed Tx completion cleanups for mcg's to which
3247  * disable_multicast has been issued, regular mcg related cleanups during
3248  * disable_multicast, disable_promiscuous and mcg traps, as well as
3249  * cleanups during driver detach time. Depending on the join state,
3250  * it deletes the mce from the appropriate list and issues the IBA
3251  * leave/detach; except in the disable_multicast case when the mce
3252  * is left on the active list for a subsequent Tx completion cleanup.
3253  */
3254 static void
3255 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3256     uint8_t jstate)
3257 {
3258 	ibd_mce_t *tmce;
3259 	boolean_t do_detach = B_TRUE;
3260 
3261 	/*
3262 	 * Before detaching, we must check whether the other list
3263 	 * contains the mcg; if we detach blindly, the consumer
3264 	 * who set up the other list will also stop receiving
3265 	 * traffic.
3266 	 */
3267 	if (jstate == IB_MC_JSTATE_FULL) {
3268 		/*
3269 		 * The following check is only relevant while coming
3270 		 * from the Tx completion path in the reap case.
3271 		 */
3272 		if (!mce->mc_fullreap)
3273 			return;
3274 		mutex_enter(&state->id_mc_mutex);
3275 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3276 		mutex_exit(&state->id_mc_mutex);
3277 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3278 			do_detach = B_FALSE;
3279 	} else if (jstate == IB_MC_JSTATE_NON) {
3280 		IBD_MCACHE_PULLOUT_NON(state, mce);
3281 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3282 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3283 			do_detach = B_FALSE;
3284 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3285 		mutex_enter(&state->id_mc_mutex);
3286 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3287 		mutex_exit(&state->id_mc_mutex);
3288 		do_detach = B_FALSE;
3289 	}
3290 
3291 	/*
3292 	 * If we are reacting to a mcg trap and leaving our sendonly or
3293 	 * non membership, the mcg is possibly already gone, so attempting
3294 	 * to leave might fail. On the other hand, we must try to leave
3295 	 * anyway, since this might be a trap from long ago, and we could
3296 	 * have potentially sendonly joined to a recent incarnation of
3297 	 * the mcg and are about to lose track of this information.
3298 	 */
3299 	if (do_detach) {
3300 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3301 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3302 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3303 	}
3304 
3305 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3306 	kmem_free(mce, sizeof (ibd_mce_t));
3307 }
3308 
3309 /*
3310  * Async code executed due to multicast and promiscuous disable requests
3311  * and mcg trap handling; also executed during driver detach. Mostly, a
3312  * leave and detach is done; except for the fullmember case when Tx
3313  * requests are pending, whence arrangements are made for subsequent
3314  * cleanup on Tx completion.
3315  */
3316 static void
3317 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3318 {
3319 	ipoib_mac_t mcmac;
3320 	boolean_t recycled;
3321 	ibd_mce_t *mce;
3322 
3323 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3324 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3325 
3326 	if (jstate == IB_MC_JSTATE_NON) {
3327 		recycled = B_TRUE;
3328 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3329 		/*
3330 		 * In case we are handling a mcg trap, we might not find
3331 		 * the mcg in the non list.
3332 		 */
3333 		if (mce == NULL) {
3334 			return;
3335 		}
3336 	} else {
3337 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3338 
3339 		/*
3340 		 * In case we are handling a mcg trap, make sure the trap
3341 		 * is not arriving late; if we have an mce that indicates
3342 		 * that we are already a fullmember, that would be a clear
3343 		 * indication that the trap arrived late (ie, is for a
3344 		 * previous incarnation of the mcg).
3345 		 */
3346 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3347 			if ((mce == NULL) || (mce->mc_jstate ==
3348 			    IB_MC_JSTATE_FULL)) {
3349 				return;
3350 			}
3351 		} else {
3352 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3353 
3354 			/*
3355 			 * If the earlier join group failed, mce will be NULL
3356 			 * here; the GLDv3 set multicast entry point always
3357 			 * returns success, so that failure only shows up now.
3358 			 */
3359 			if (mce == NULL) {
3360 				return;
3361 			}
3362 
3363 			mce->mc_fullreap = B_TRUE;
3364 		}
3365 
3366 		/*
3367 		 * If no pending Tx's remain that reference the AH
3368 		 * for the mcg, recycle it from active to free list.
3369 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3370 		 * so the last completing Tx will cause an async reap
3371 		 * operation to be invoked, at which time we will drop our
3372 		 * membership to the mcg so that the pending Tx's complete
3373 		 * successfully. Refer to comments on "AH and MCE active
3374 		 * list manipulation" at top of this file. The lock protects
3375 		 * against Tx fast path and Tx cleanup code.
3376 		 */
3377 		mutex_enter(&state->id_ac_mutex);
3378 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3379 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3380 		    IB_MC_JSTATE_SEND_ONLY_NON));
3381 		mutex_exit(&state->id_ac_mutex);
3382 	}
3383 
3384 	if (recycled) {
3385 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3386 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3387 		ibd_async_reap_group(state, mce, mgid, jstate);
3388 	}
3389 }
3390 
3391 /*
3392  * Find the broadcast address as defined by IPoIB; implicitly
3393  * determines the IBA scope, mtu, tclass etc of the link the
3394  * interface is going to be a member of.
3395  */
3396 static ibt_status_t
3397 ibd_find_bgroup(ibd_state_t *state)
3398 {
3399 	ibt_mcg_attr_t mcg_attr;
3400 	uint_t numg;
3401 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3402 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3403 	    IB_MC_SCOPE_GLOBAL };
3404 	int i, mcgmtu;
3405 	boolean_t found = B_FALSE;
3406 	int ret;
3407 	ibt_mcg_info_t mcg_info;
3408 
3409 	state->id_bgroup_created = B_FALSE;
3410 	state->id_bgroup_present = B_FALSE;
3411 
3412 query_bcast_grp:
3413 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3414 	mcg_attr.mc_pkey = state->id_pkey;
3415 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3416 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3417 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3418 
3419 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3420 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3421 
3422 		/*
3423 		 * Look for the IPoIB broadcast group.
3424 		 */
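		/*
		 * The 64-bit prefix built below combines the IPv4 IPoIB
		 * prefix, the scope being tried, and our pkey; with the guid
		 * set to IB_MGID_IPV4_LOWGRP_MASK above, this yields the
		 * well-known IPoIB broadcast MGID (e.g.
		 * ff12:401b:ffff::ffff:ffff for the default pkey at
		 * link-local scope).
		 */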
3425 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3426 		state->id_mgid.gid_prefix =
3427 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3428 		    ((uint64_t)state->id_scope << 48) |
3429 		    ((uint32_t)(state->id_pkey << 16)));
3430 		mcg_attr.mc_mgid = state->id_mgid;
3431 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3432 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3433 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3434 			found = B_TRUE;
3435 			break;
3436 		}
3437 	}
3438 
3439 	if (!found) {
3440 		if (state->id_create_broadcast_group) {
3441 			/*
3442 			 * If we created the broadcast group, but failed to
3443 			 * find it, we can't do anything except leave the
3444 			 * one we created and return failure.
3445 			 */
3446 			if (state->id_bgroup_created) {
3447 				ibd_print_warn(state, "IPoIB broadcast group "
3448 				    "absent. Unable to query after create.");
3449 				goto find_bgroup_fail;
3450 			}
3451 
3452 			/*
3453 			 * Create the ipoib broadcast group if it didn't exist
3454 			 */
3455 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3456 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3457 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3458 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3459 			mcg_attr.mc_pkey = state->id_pkey;
3460 			mcg_attr.mc_flow = 0;
3461 			mcg_attr.mc_sl = 0;
3462 			mcg_attr.mc_tclass = 0;
3463 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3464 			state->id_mgid.gid_prefix =
3465 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3466 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3467 			    ((uint32_t)(state->id_pkey << 16)));
3468 			mcg_attr.mc_mgid = state->id_mgid;
3469 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3470 
3471 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3472 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3473 				ibd_print_warn(state, "IPoIB broadcast group "
3474 				    "absent, create failed: ret = %d\n", ret);
3475 				state->id_bgroup_created = B_FALSE;
3476 				return (IBT_FAILURE);
3477 			}
3478 			state->id_bgroup_created = B_TRUE;
3479 			goto query_bcast_grp;
3480 		} else {
3481 			ibd_print_warn(state, "IPoIB broadcast group absent");
3482 			return (IBT_FAILURE);
3483 		}
3484 	}
3485 
3486 	/*
3487 	 * Verify that the mcg mtu <= id_mtu, then adopt the mcg mtu as id_mtu.
3488 	 */
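	/*
	 * mc_mtu is an IBA MTU code rather than a byte count; 128 << code
	 * converts it to bytes (e.g. a code of 4 corresponds to 2048).
	 */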
3489 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3490 	if (state->id_mtu < mcgmtu) {
3491 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3492 		    "greater than port's maximum MTU %d", mcgmtu,
3493 		    state->id_mtu);
3494 		ibt_free_mcg_info(state->id_mcinfo, 1);
3495 		goto find_bgroup_fail;
3496 	}
3497 	state->id_mtu = mcgmtu;
3498 	state->id_bgroup_present = B_TRUE;
3499 
3500 	return (IBT_SUCCESS);
3501 
3502 find_bgroup_fail:
3503 	if (state->id_bgroup_created) {
3504 		(void) ibt_leave_mcg(state->id_sgid,
3505 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3506 		    IB_MC_JSTATE_FULL);
3507 	}
3508 
3509 	return (IBT_FAILURE);
3510 }
3511 
3512 static int
3513 ibd_alloc_tx_copybufs(ibd_state_t *state)
3514 {
3515 	ibt_mr_attr_t mem_attr;
3516 
3517 	/*
3518 	 * Allocate one big chunk for all regular tx copy bufs
3519 	 */
3520 	state->id_tx_buf_sz = state->id_mtu;
3521 	if (state->id_lso_policy && state->id_lso_capable &&
3522 	    (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3523 		state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3524 	}
3525 
3526 	state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3527 	    state->id_tx_buf_sz, KM_SLEEP);
3528 
3529 	state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3530 	    sizeof (ibd_swqe_t), KM_SLEEP);
3531 
3532 	/*
3533 	 * Do one memory registration on the entire txbuf area
3534 	 */
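	/*
	 * A single MR over the whole chunk means every swqe can share the
	 * one resulting lkey (picked up later in ibd_init_txlist()), rather
	 * than registering each copy buffer separately.
	 */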
3535 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3536 	mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3537 	mem_attr.mr_as = NULL;
3538 	mem_attr.mr_flags = IBT_MR_SLEEP;
3539 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3540 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3541 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3542 		kmem_free(state->id_tx_wqes,
3543 		    state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3544 		kmem_free(state->id_tx_bufs,
3545 		    state->id_ud_num_swqe * state->id_tx_buf_sz);
3546 		state->id_tx_bufs = NULL;
3547 		return (DDI_FAILURE);
3548 	}
3549 
3550 	return (DDI_SUCCESS);
3551 }
3552 
3553 static int
3554 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3555 {
3556 	ibt_mr_attr_t mem_attr;
3557 	ibd_lsobuf_t *buflist;
3558 	ibd_lsobuf_t *lbufp;
3559 	ibd_lsobuf_t *tail;
3560 	ibd_lsobkt_t *bktp;
3561 	uint8_t *membase;
3562 	uint8_t *memp;
3563 	uint_t memsz;
3564 	int i;
3565 
3566 	/*
3567 	 * Allocate the lso bucket
3568 	 */
3569 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3570 
3571 	/*
3572 	 * Allocate the entire lso memory and register it
3573 	 */
3574 	memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3575 	membase = kmem_zalloc(memsz, KM_SLEEP);
3576 
3577 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3578 	mem_attr.mr_len = memsz;
3579 	mem_attr.mr_as = NULL;
3580 	mem_attr.mr_flags = IBT_MR_SLEEP;
3581 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3582 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3583 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3584 		kmem_free(membase, memsz);
3585 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3586 		return (DDI_FAILURE);
3587 	}
3588 
3589 	mutex_enter(&state->id_lso_lock);
3590 
3591 	/*
3592 	 * Now allocate the buflist.  Note that the elements in the buflist and
3593 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3594 	 * can always derive the address of a buflist entry from the address of
3595 	 * an lso buffer.
3596 	 */
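	/*
	 * Concretely, buflist[n] always describes the buffer at
	 * membase + n * IBD_LSO_BUFSZ; ibd_release_lsobufs() relies on this
	 * to recover the index from an sgl virtual address.
	 */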
3597 	buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3598 	    KM_SLEEP);
3599 
3600 	/*
3601 	 * Set up the lso buf chain
3602 	 */
3603 	memp = membase;
3604 	lbufp = buflist;
3605 	for (i = 0; i < state->id_num_lso_bufs; i++) {
3606 		lbufp->lb_isfree = 1;
3607 		lbufp->lb_buf = memp;
3608 		lbufp->lb_next = lbufp + 1;
3609 
3610 		tail = lbufp;
3611 
3612 		memp += IBD_LSO_BUFSZ;
3613 		lbufp++;
3614 	}
3615 	tail->lb_next = NULL;
3616 
3617 	/*
3618 	 * Set up the LSO buffer information in ibd state
3619 	 */
3620 	bktp->bkt_bufl = buflist;
3621 	bktp->bkt_free_head = buflist;
3622 	bktp->bkt_mem = membase;
3623 	bktp->bkt_nelem = state->id_num_lso_bufs;
3624 	bktp->bkt_nfree = bktp->bkt_nelem;
3625 
3626 	state->id_lso = bktp;
3627 	mutex_exit(&state->id_lso_lock);
3628 
3629 	return (DDI_SUCCESS);
3630 }
3631 
3632 /*
3633  * Statically allocate Tx buffer list(s).
3634  */
3635 static int
3636 ibd_init_txlist(ibd_state_t *state)
3637 {
3638 	ibd_swqe_t *swqe;
3639 	ibt_lkey_t lkey;
3640 	int i;
3641 	uint_t len;
3642 	uint8_t *bufaddr;
3643 
3644 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3645 		return (DDI_FAILURE);
3646 
3647 	if (state->id_lso_policy && state->id_lso_capable) {
3648 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3649 			state->id_lso_capable = B_FALSE;
3650 	}
3651 
3652 	mutex_enter(&state->id_tx_list.dl_mutex);
3653 	state->id_tx_list.dl_head = NULL;
3654 	state->id_tx_list.dl_pending_sends = B_FALSE;
3655 	state->id_tx_list.dl_cnt = 0;
3656 	mutex_exit(&state->id_tx_list.dl_mutex);
3657 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3658 	state->id_tx_rel_list.dl_head = NULL;
3659 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3660 	state->id_tx_rel_list.dl_cnt = 0;
3661 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3662 
3663 	/*
3664 	 * Allocate and setup the swqe list
3665 	 */
3666 	lkey = state->id_tx_mr_desc.md_lkey;
3667 	bufaddr = state->id_tx_bufs;
3668 	len = state->id_tx_buf_sz;
3669 	swqe = state->id_tx_wqes;
3670 	mutex_enter(&state->id_tx_list.dl_mutex);
3671 	for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3672 		swqe->swqe_next = NULL;
3673 		swqe->swqe_im_mblk = NULL;
3674 
3675 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3676 		    bufaddr;
3677 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3678 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3679 
3680 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3681 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3682 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3683 
3684 		/* These are set in send */
3685 		swqe->w_swr.wr_nds = 0;
3686 		swqe->w_swr.wr_sgl = NULL;
3687 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3688 
3689 		/* add to list */
3690 		state->id_tx_list.dl_cnt++;
3691 		swqe->swqe_next = state->id_tx_list.dl_head;
3692 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3693 	}
3694 	mutex_exit(&state->id_tx_list.dl_mutex);
3695 
3696 	return (DDI_SUCCESS);
3697 }
3698 
3699 static int
3700 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3701     uint32_t *nds_p)
3702 {
3703 	ibd_lsobkt_t *bktp;
3704 	ibd_lsobuf_t *lbufp;
3705 	ibd_lsobuf_t *nextp;
3706 	ibt_lkey_t lso_lkey;
3707 	uint_t frag_sz;
3708 	uint_t num_needed;
3709 	int i;
3710 
3711 	ASSERT(sgl_p != NULL);
3712 	ASSERT(nds_p != NULL);
3713 	ASSERT(req_sz != 0);
3714 
3715 	/*
3716 	 * Determine how many bufs we'd need for the size requested
3717 	 */
3718 	num_needed = req_sz / IBD_LSO_BUFSZ;
3719 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3720 		num_needed++;
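	/*
	 * This is just a ceiling division: e.g. a req_sz of two and a half
	 * buffers needs three lso bufs, with frag_sz (the remainder) used
	 * further below to trim the last sgl entry.
	 */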
3721 
3722 	mutex_enter(&state->id_lso_lock);
3723 
3724 	/*
3725 	 * If we don't have enough lso bufs, return failure
3726 	 */
3727 	ASSERT(state->id_lso != NULL);
3728 	bktp = state->id_lso;
3729 	if (bktp->bkt_nfree < num_needed) {
3730 		mutex_exit(&state->id_lso_lock);
3731 		return (-1);
3732 	}
3733 
3734 	/*
3735 	 * Pick the first 'num_needed' bufs from the free list
3736 	 */
3737 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3738 	lbufp = bktp->bkt_free_head;
3739 	for (i = 0; i < num_needed; i++) {
3740 		ASSERT(lbufp->lb_isfree != 0);
3741 		ASSERT(lbufp->lb_buf != NULL);
3742 
3743 		nextp = lbufp->lb_next;
3744 
3745 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3746 		sgl_p[i].ds_key = lso_lkey;
3747 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3748 
3749 		lbufp->lb_isfree = 0;
3750 		lbufp->lb_next = NULL;
3751 
3752 		lbufp = nextp;
3753 	}
3754 	bktp->bkt_free_head = lbufp;
3755 
3756 	/*
3757 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3758 	 * to adjust the last sgl entry's length. Since we know we need atleast
3759 	 * one, the i-1 use below is ok.
3760 	 */
3761 	if (frag_sz) {
3762 		sgl_p[i-1].ds_len = frag_sz;
3763 	}
3764 
3765 	/*
3766 	 * Update nfree count and return
3767 	 */
3768 	bktp->bkt_nfree -= num_needed;
3769 
3770 	mutex_exit(&state->id_lso_lock);
3771 
3772 	*nds_p = num_needed;
3773 
3774 	return (0);
3775 }
3776 
3777 static void
3778 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3779 {
3780 	ibd_lsobkt_t *bktp;
3781 	ibd_lsobuf_t *lbufp;
3782 	uint8_t *lso_mem_end;
3783 	uint_t ndx;
3784 	int i;
3785 
3786 	mutex_enter(&state->id_lso_lock);
3787 
3788 	bktp = state->id_lso;
3789 	ASSERT(bktp != NULL);
3790 
3791 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3792 	for (i = 0; i < nds; i++) {
3793 		uint8_t *va;
3794 
3795 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3796 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3797 
3798 		/*
3799 		 * Figure out the buflist element this sgl buffer corresponds
3800 		 * to and put it back at the head
3801 		 */
3802 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3803 		lbufp = bktp->bkt_bufl + ndx;
3804 
3805 		ASSERT(lbufp->lb_isfree == 0);
3806 		ASSERT(lbufp->lb_buf == va);
3807 
3808 		lbufp->lb_isfree = 1;
3809 		lbufp->lb_next = bktp->bkt_free_head;
3810 		bktp->bkt_free_head = lbufp;
3811 	}
3812 	bktp->bkt_nfree += nds;
3813 
3814 	mutex_exit(&state->id_lso_lock);
3815 }
3816 
3817 static void
3818 ibd_free_tx_copybufs(ibd_state_t *state)
3819 {
3820 	/*
3821 	 * Unregister txbuf mr
3822 	 */
3823 	if (ibt_deregister_mr(state->id_hca_hdl,
3824 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3825 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3826 	}
3827 	state->id_tx_mr_hdl = NULL;
3828 
3829 	/*
3830 	 * Free txbuf memory
3831 	 */
3832 	kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3833 	    sizeof (ibd_swqe_t));
3834 	kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3835 	    state->id_tx_buf_sz);
3836 	state->id_tx_wqes = NULL;
3837 	state->id_tx_bufs = NULL;
3838 }
3839 
3840 static void
3841 ibd_free_tx_lsobufs(ibd_state_t *state)
3842 {
3843 	ibd_lsobkt_t *bktp;
3844 
3845 	mutex_enter(&state->id_lso_lock);
3846 
3847 	if ((bktp = state->id_lso) == NULL) {
3848 		mutex_exit(&state->id_lso_lock);
3849 		return;
3850 	}
3851 
3852 	/*
3853 	 * First, free the buflist
3854 	 */
3855 	ASSERT(bktp->bkt_bufl != NULL);
3856 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3857 
3858 	/*
3859 	 * Unregister the LSO memory and free it
3860 	 */
3861 	ASSERT(bktp->bkt_mr_hdl != NULL);
3862 	if (ibt_deregister_mr(state->id_hca_hdl,
3863 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3864 		DPRINT(10,
3865 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3866 	}
3867 	ASSERT(bktp->bkt_mem);
3868 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3869 
3870 	/*
3871 	 * Finally free the bucket
3872 	 */
3873 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3874 	state->id_lso = NULL;
3875 
3876 	mutex_exit(&state->id_lso_lock);
3877 }
3878 
3879 /*
3880  * Free the statically allocated Tx buffer list.
3881  */
3882 static void
3883 ibd_fini_txlist(ibd_state_t *state)
3884 {
3885 	/*
3886 	 * Free the allocated swqes
3887 	 */
3888 	mutex_enter(&state->id_tx_list.dl_mutex);
3889 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3890 	state->id_tx_list.dl_head = NULL;
3891 	state->id_tx_list.dl_pending_sends = B_FALSE;
3892 	state->id_tx_list.dl_cnt = 0;
3893 	state->id_tx_rel_list.dl_head = NULL;
3894 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3895 	state->id_tx_rel_list.dl_cnt = 0;
3896 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3897 	mutex_exit(&state->id_tx_list.dl_mutex);
3898 
3899 	ibd_free_tx_lsobufs(state);
3900 	ibd_free_tx_copybufs(state);
3901 }
3902 
3903 /*
3904  * Post a list of rwqes, NULL terminated.
3905  */
3906 static void
3907 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3908 {
3909 	uint_t		i;
3910 	uint_t		num_posted;
3911 	ibt_status_t	ibt_status;
3912 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3913 
3914 	while (rwqe) {
3915 		/* Post up to IBD_RX_POST_CNT receive work requests */
3916 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3917 			wrs[i] = rwqe->w_rwr;
3918 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3919 			if (rwqe == NULL) {
3920 				i++;
3921 				break;
3922 			}
3923 		}
3924 
3925 		/*
3926 		 * If posting fails for some reason, we'll never receive
3927 		 * completion intimation, so we'll need to cleanup. But
3928 		 * we need to make sure we don't clean up nodes whose
3929 		 * wrs have been successfully posted. We assume that the
3930 		 * hca driver returns on the first failure to post and
3931 		 * therefore the first 'num_posted' entries don't need
3932 		 * cleanup here.
3933 		 */
3934 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3935 
3936 		num_posted = 0;
3937 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3938 		    &num_posted);
3939 		if (ibt_status != IBT_SUCCESS) {
3940 			/* This cannot happen unless the device has an error. */
3941 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3942 			    "posting multiple wrs failed: "
3943 			    "requested=%d, done=%d, ret=%d",
3944 			    IBD_RX_POST_CNT, num_posted, ibt_status);
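			/*
			 * num_posted - i is zero or negative here; this
			 * backs out the optimistic dl_cnt credit for the
			 * wrs that were never actually posted.
			 */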
3945 			atomic_add_32(&state->id_rx_list.dl_cnt,
3946 			    num_posted - i);
3947 		}
3948 	}
3949 }
3950 
3951 /*
3952  * Grab a list of rwqes from the array of lists, and post the list.
3953  */
3954 static void
3955 ibd_post_recv_intr(ibd_state_t *state)
3956 {
3957 	ibd_rx_queue_t	*rxp;
3958 	ibd_rwqe_t *list;
3959 
3960 	/* rotate through the rx_queue array, expecting an adequate number */
3961 	state->id_rx_post_queue_index =
3962 	    (state->id_rx_post_queue_index + 1) &
3963 	    (state->id_rx_nqueues - 1);
3964 
3965 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3966 	mutex_enter(&rxp->rx_post_lock);
3967 	list = WQE_TO_RWQE(rxp->rx_head);
3968 	rxp->rx_head = NULL;
3969 	rxp->rx_cnt = 0;
3970 	mutex_exit(&rxp->rx_post_lock);
3971 	ibd_post_recv_list(state, list);
3972 }
3973 
3974 /* macro explained below */
3975 #define	RX_QUEUE_HASH(rwqe) \
3976 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3977 
3978 /*
3979  * Add a rwqe to one of the Rx lists.  If the list is large enough
3980  * (close to IBD_RX_POST_CNT), post the list to the hardware.
3981  *
3982  * Note: one of 2^N lists is chosen via a hash.  This is done
3983  * because using one list is contentious.  If the first list is busy
3984  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3985  *
3986  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3987  * even distribution of mapping rwqes to the 2^N queues.
3988  */
3989 static void
3990 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3991 {
3992 	ibd_rx_queue_t	*rxp;
3993 
3994 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3995 
3996 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
3997 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
3998 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3999 		mutex_enter(&rxp->rx_post_lock);
4000 	}
4001 	rwqe->rwqe_next = rxp->rx_head;
4002 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4003 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4004 
4005 		/* only call ibt_post_recv() every Nth time through here */
4006 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
4007 			rxp->rx_head = NULL;
4008 			rxp->rx_cnt = 0;
4009 			mutex_exit(&rxp->rx_post_lock);
4010 			ibd_post_recv_list(state, rwqe);
4011 			return;
4012 		}
4013 	}
4014 	rxp->rx_head = RWQE_TO_WQE(rwqe);
4015 	mutex_exit(&rxp->rx_post_lock);
4016 }
4017 
4018 static int
4019 ibd_alloc_rx_copybufs(ibd_state_t *state)
4020 {
4021 	ibt_mr_attr_t mem_attr;
4022 	int i;
4023 
4024 	/*
4025 	 * Allocate one big chunk for all regular rx copy bufs
4026 	 */
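	/*
	 * Each UD receive carries a 40-byte GRH ahead of the payload, so
	 * size every copy buffer for the mtu plus IPOIB_GRH_SIZE.
	 */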
4027 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
4028 
4029 	state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4030 	    state->id_rx_buf_sz, KM_SLEEP);
4031 
4032 	state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4033 	    sizeof (ibd_rwqe_t), KM_SLEEP);
4034 
4035 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
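	/* must stay a power of two: RX_QUEUE_HASH masks with (nqueues - 1) */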
4036 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4037 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
4038 	for (i = 0; i < state->id_rx_nqueues; i++) {
4039 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4040 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4041 	}
4042 
4043 	/*
4044 	 * Do one memory registration on the entire rxbuf area
4045 	 */
4046 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4047 	mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4048 	mem_attr.mr_as = NULL;
4049 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4050 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4051 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4052 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4053 		kmem_free(state->id_rx_wqes,
4054 		    state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4055 		kmem_free(state->id_rx_bufs,
4056 		    state->id_ud_num_rwqe * state->id_rx_buf_sz);
4057 		state->id_rx_bufs = NULL;
4058 		state->id_rx_wqes = NULL;
4059 		return (DDI_FAILURE);
4060 	}
4061 
4062 	return (DDI_SUCCESS);
4063 }
4064 
4065 /*
4066  * Initialize the statically allocated Rx buffer list.
4067  */
4068 static int
4069 ibd_init_rxlist(ibd_state_t *state)
4070 {
4071 	ibd_rwqe_t *rwqe, *next;
4072 	ibd_wqe_t *list;
4073 	ibt_lkey_t lkey;
4074 	int i;
4075 	uint_t len;
4076 	uint8_t *bufaddr;
4077 
4078 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4079 	if (state->id_rx_free_list.dl_head != NULL) {
4080 		/* rx rsrcs were never freed.  Just repost them */
4081 		len = state->id_rx_buf_sz;
4082 		list = state->id_rx_free_list.dl_head;
4083 		state->id_rx_free_list.dl_head = NULL;
4084 		state->id_rx_free_list.dl_cnt = 0;
4085 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4086 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4087 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4088 			if ((rwqe->rwqe_im_mblk = desballoc(
4089 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4090 			    &rwqe->w_freemsg_cb)) == NULL) {
4091 				/* allow freemsg_cb to free the rwqes */
4092 				if (atomic_dec_32_nv(&state->id_running) != 0) {
4093 					cmn_err(CE_WARN, "ibd_init_rxlist: "
4094 					    "id_running was not 1\n");
4095 				}
4096 				DPRINT(10, "ibd_init_rxlist : "
4097 				    "failed in desballoc()");
4098 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4099 				    rwqe = next) {
4100 					next = WQE_TO_RWQE(rwqe->rwqe_next);
4101 					if (rwqe->rwqe_im_mblk) {
4102 						atomic_inc_32(&state->
4103 						    id_rx_list.
4104 						    dl_bufs_outstanding);
4105 						freemsg(rwqe->rwqe_im_mblk);
4106 					} else
4107 						ibd_free_rwqe(state, rwqe);
4108 				}
4109 				atomic_inc_32(&state->id_running);
4110 				return (DDI_FAILURE);
4111 			}
4112 		}
4113 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
4114 		return (DDI_SUCCESS);
4115 	}
4116 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4117 
4118 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4119 		return (DDI_FAILURE);
4120 
4121 	/*
4122 	 * Allocate and setup the rwqe list
4123 	 */
4124 	len = state->id_rx_buf_sz;
4125 	lkey = state->id_rx_mr_desc.md_lkey;
4126 	rwqe = state->id_rx_wqes;
4127 	bufaddr = state->id_rx_bufs;
4128 	list = NULL;
4129 	for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4130 		rwqe->w_state = state;
4131 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4132 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4133 
4134 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4135 
4136 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4137 		    &rwqe->w_freemsg_cb)) == NULL) {
4138 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4139 			/* allow freemsg_cb to free the rwqes */
4140 			if (atomic_dec_32_nv(&state->id_running) != 0) {
4141 				cmn_err(CE_WARN, "ibd_init_rxlist: "
4142 				    "id_running was not 1\n");
4143 			}
4144 			DPRINT(10, "ibd_init_rxlist : "
4145 			    "failed in desballoc()");
4146 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4147 			    rwqe = next) {
4148 				next = WQE_TO_RWQE(rwqe->rwqe_next);
4149 				freemsg(rwqe->rwqe_im_mblk);
4150 			}
4151 			atomic_inc_32(&state->id_running);
4152 
4153 			/* remove reference to free'd rwqes */
4154 			mutex_enter(&state->id_rx_free_list.dl_mutex);
4155 			state->id_rx_free_list.dl_head = NULL;
4156 			state->id_rx_free_list.dl_cnt = 0;
4157 			mutex_exit(&state->id_rx_free_list.dl_mutex);
4158 
4159 			ibd_fini_rxlist(state);
4160 			return (DDI_FAILURE);
4161 		}
4162 
4163 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4164 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
4165 		    (ib_vaddr_t)(uintptr_t)bufaddr;
4166 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4167 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4168 		rwqe->w_rwr.wr_nds = 1;
4169 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4170 
4171 		rwqe->rwqe_next = list;
4172 		list = RWQE_TO_WQE(rwqe);
4173 	}
4174 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
4175 
4176 	return (DDI_SUCCESS);
4177 }
4178 
4179 static void
4180 ibd_free_rx_copybufs(ibd_state_t *state)
4181 {
4182 	int i;
4183 
4184 	/*
4185 	 * Unregister rxbuf mr
4186 	 */
4187 	if (ibt_deregister_mr(state->id_hca_hdl,
4188 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
4189 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4190 	}
4191 	state->id_rx_mr_hdl = NULL;
4192 
4193 	/*
4194 	 * Free rxbuf memory
4195 	 */
4196 	for (i = 0; i < state->id_rx_nqueues; i++) {
4197 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4198 		mutex_destroy(&rxp->rx_post_lock);
4199 	}
4200 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4201 	    sizeof (ibd_rx_queue_t));
4202 	kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4203 	    sizeof (ibd_rwqe_t));
4204 	kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4205 	    state->id_rx_buf_sz);
4206 	state->id_rx_queues = NULL;
4207 	state->id_rx_wqes = NULL;
4208 	state->id_rx_bufs = NULL;
4209 }
4210 
4211 static void
4212 ibd_free_rx_rsrcs(ibd_state_t *state)
4213 {
4214 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4215 	if (state->id_rx_free_list.dl_head == NULL) {
4216 		/* already freed */
4217 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4218 		return;
4219 	}
4220 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4221 	ibd_free_rx_copybufs(state);
4222 	state->id_rx_free_list.dl_cnt = 0;
4223 	state->id_rx_free_list.dl_head = NULL;
4224 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4225 }
4226 
4227 /*
4228  * Free the statically allocated Rx buffer list.
4229  */
4230 static void
4231 ibd_fini_rxlist(ibd_state_t *state)
4232 {
4233 	ibd_rwqe_t *rwqe;
4234 	int i;
4235 
4236 	/* run through the rx_queue's, calling freemsg() */
4237 	for (i = 0; i < state->id_rx_nqueues; i++) {
4238 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4239 		mutex_enter(&rxp->rx_post_lock);
4240 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4241 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4242 			freemsg(rwqe->rwqe_im_mblk);
4243 			rxp->rx_cnt--;
4244 		}
4245 		rxp->rx_head = NULL;
4246 		mutex_exit(&rxp->rx_post_lock);
4247 	}
4248 
4249 	/* cannot free rx resources unless gld returned everything */
4250 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4251 		ibd_free_rx_rsrcs(state);
4252 }
4253 
4254 /*
4255  * Free an allocated recv wqe.
4256  */
4257 /* ARGSUSED */
4258 static void
4259 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4260 {
4261 	/*
4262 	 * desballoc() failed (no memory).
4263 	 *
4264 	 * This rwqe is placed on a free list so that it
4265 	 * can be reinstated when memory is available.
4266 	 *
4267 	 * NOTE: no code currently exists to reinstate
4268 	 * these "lost" rwqes.
4269 	 */
4270 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4271 	state->id_rx_free_list.dl_cnt++;
4272 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4273 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4274 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4275 }
4276 
4277 /*
4278  * IBA Rx completion queue handler. Guaranteed to be single
4279  * threaded and nonreentrant for this CQ.
4280  */
4281 /* ARGSUSED */
4282 static void
4283 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4284 {
4285 	ibd_state_t *state = (ibd_state_t *)arg;
4286 
4287 	atomic_inc_64(&state->id_num_intrs);
4288 
4289 	if (ibd_rx_softintr == 1) {
4290 		mutex_enter(&state->id_rcq_poll_lock);
4291 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4292 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
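			/*
			 * The poller already running is expected to notice
			 * this flag and make another pass, so no completion
			 * should be lost.
			 */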
4293 			mutex_exit(&state->id_rcq_poll_lock);
4294 			return;
4295 		} else {
4296 			mutex_exit(&state->id_rcq_poll_lock);
4297 			ddi_trigger_softintr(state->id_rx);
4298 		}
4299 	} else
4300 		(void) ibd_intr((caddr_t)state);
4301 }
4302 
4303 /*
4304  * CQ handler for Tx completions, when the Tx CQ is in
4305  * interrupt driven mode.
4306  */
4307 /* ARGSUSED */
4308 static void
4309 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4310 {
4311 	ibd_state_t *state = (ibd_state_t *)arg;
4312 
4313 	atomic_inc_64(&state->id_num_intrs);
4314 
4315 	if (ibd_tx_softintr == 1) {
4316 		mutex_enter(&state->id_scq_poll_lock);
4317 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4318 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4319 			mutex_exit(&state->id_scq_poll_lock);
4320 			return;
4321 		} else {
4322 			mutex_exit(&state->id_scq_poll_lock);
4323 			ddi_trigger_softintr(state->id_tx);
4324 		}
4325 	} else
4326 		(void) ibd_tx_recycle((caddr_t)state);
4327 }
4328 
4329 /*
4330  * Multicast group create/delete trap handler. These will be delivered
4331  * on a kernel thread (handling can thus block) and can be invoked
4332  * concurrently. The handler can be invoked anytime after it is
4333  * registered and before ibt_detach().
4334  */
4335 /* ARGSUSED */
4336 static void
4337 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4338     ibt_subnet_event_t *event)
4339 {
4340 	ibd_state_t *state = (ibd_state_t *)arg;
4341 	ibd_req_t *req;
4342 
4343 	/*
4344 	 * The trap handler will get invoked once for every event for
4345 	 * every port. The input "gid" is the GID0 of the port the
4346 	 * trap came in on; we just need to act on traps that came
4347 	 * to our port, meaning the port on which the ipoib interface
4348 	 * resides. Since ipoib uses GID0 of the port, we just match
4349 	 * the gids to check whether we need to handle the trap.
4350 	 */
4351 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4352 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4353 		return;
4354 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4355 
4356 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4357 
4358 	switch (code) {
4359 		case IBT_SM_EVENT_UNAVAILABLE:
4360 			/*
4361 			 * If we are in promiscuous mode or have
4362 			 * sendnonmembers, we need to print a warning
4363 			 * message right now. Else, just store the
4364 			 * information, print when we enter promiscuous
4365 			 * mode or attempt nonmember send. We might
4366 			 * also want to stop caching sendnonmember.
4367 			 */
4368 			ibd_print_warn(state, "IBA multicast support "
4369 			    "degraded due to unavailability of multicast "
4370 			    "traps");
4371 			break;
4372 		case IBT_SM_EVENT_AVAILABLE:
4373 			/*
4374 			 * If we printed a warning message above or
4375 			 * while trying to nonmember send or get into
4376 			 * promiscuous mode, print an okay message.
4377 			 */
4378 			ibd_print_warn(state, "IBA multicast support "
4379 			    "restored due to availability of multicast "
4380 			    "traps");
4381 			break;
4382 		case IBT_SM_EVENT_MCG_CREATED:
4383 		case IBT_SM_EVENT_MCG_DELETED:
4384 			/*
4385 			 * If it is a "deleted" event and we are in late hca
4386 			 * init, nothing to do.
4387 			 */
4388 			if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4389 			    IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4390 			    IBT_SM_EVENT_MCG_DELETED)) {
4391 				break;
4392 			}
4393 			/*
4394 			 * Common processing of creation/deletion traps.
4395 			 * First check if the instance is being
4396 			 * [de]initialized; back off then, without doing
4397 			 * anything more, since we are not sure if the
4398 			 * async thread is around, or whether we might
4399 			 * be racing with the detach code in ibd_m_stop()
4400 			 * that scans the mcg list.
4401 			 */
4402 			if (!ibd_async_safe(state))
4403 				return;
4404 
4405 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4406 			req->rq_gid = event->sm_notice_gid;
4407 			req->rq_ptr = (void *)code;
4408 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4409 			break;
4410 	}
4411 }
4412 
4413 static void
4414 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4415 {
4416 	ib_gid_t mgid = req->rq_gid;
4417 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4418 	int ret;
4419 	ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
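	/*
	 * The pkey can be recovered from the MGID prefix because
	 * ibd_find_bgroup() places it at bits 16..31 when constructing the
	 * IPoIB broadcast MGID.
	 */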
4420 
4421 	DPRINT(10, "ibd_async_trap : %d\n", code);
4422 
4423 	/*
4424 	 * Check if we have already joined the IPoIB broadcast group for our
4425 	 * PKEY. If joined, perform the rest of the operation.
4426 	 * Else, the interface is not initialised. Do the initialisation here
4427 	 * by calling ibd_start() and return.
4428 	 */
4429 
4430 	if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4431 	    IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4432 	    (code == IBT_SM_EVENT_MCG_CREATED)) {
4433 		/*
4434 		 * If we are in late HCA init and a notification for the
4435 		 * creation of a MCG came in, check if it is the IPoIB MCG for
4436 		 * this pkey. If not, return.
4437 		 */
4438 		if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4439 		    state->id_pkey)) {
4440 			ibd_async_done(state);
4441 			return;
4442 		}
4443 		ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4444 		/*
4445 		 * Check if there is still a necessity to start the interface.
4446 		 * It is possible that the user attempted unplumb at just about
4447 		 * the same time, and if unplumb succeeded, we have nothing to
4448 		 * do.
4449 		 */
4450 		if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4451 		    IBD_DRV_IN_LATE_HCA_INIT) &&
4452 		    ((ret = ibd_start(state)) != 0)) {
4453 			DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4454 			    "init, ret=%d", ret);
4455 		}
4456 		ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4457 		ibd_async_done(state);
4458 		return;
4459 	}
4460 
4461 	/*
4462 	 * Atomically search the nonmember and sendonlymember lists and
4463 	 * delete.
4464 	 */
4465 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4466 
4467 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4468 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4469 
4470 		/*
4471 		 * If in promiscuous mode, try to join/attach to the new
4472 		 * mcg. Given the unreliable out-of-order mode of trap
4473 		 * delivery, we can never be sure whether it is a problem
4474 		 * if the join fails. Thus, we warn the admin of a failure
4475 		 * if this was a creation trap. Note that the trap might
4476 		 * actually be reporting a long past event, and the mcg
4477 		 * might already have been deleted, thus we might be warning
4478 		 * in vain.
4479 		 */
4480 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4481 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4482 			ibd_print_warn(state, "IBA promiscuous mode missed "
4483 			    "new multicast gid %016llx:%016llx",
4484 			    (u_longlong_t)mgid.gid_prefix,
4485 			    (u_longlong_t)mgid.gid_guid);
4486 	}
4487 
4488 	/*
4489 	 * Free the request slot allocated by the subnet event thread.
4490 	 */
4491 	ibd_async_done(state);
4492 }
4493 
4494 /*
4495  * GLDv3 entry point to get capabilities.
4496  */
4497 static boolean_t
4498 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4499 {
4500 	ibd_state_t *state = arg;
4501 
4502 	if (state->id_type == IBD_PORT_DRIVER)
4503 		return (B_FALSE);
4504 
4505 	switch (cap) {
4506 	case MAC_CAPAB_HCKSUM: {
4507 		uint32_t *txflags = cap_data;
4508 
4509 		/*
4510 		 * We either do full checksum or no checksum at all
4511 		 */
4512 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4513 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4514 		else
4515 			return (B_FALSE);
4516 		break;
4517 	}
4518 
4519 	case MAC_CAPAB_LSO: {
4520 		mac_capab_lso_t *cap_lso = cap_data;
4521 
4522 		/*
4523 		 * In addition to the capability and policy, since LSO
4524 		 * relies on hw checksum, we'll not enable LSO if we
4525 		 * don't have hw checksum.  Of course, if the HCA doesn't
4526 		 * provide the reserved lkey capability, enabling LSO will
4527 		 * actually affect performance adversely, so we'll disable
4528 		 * LSO even for that case.
4529 		 */
4530 		if (!state->id_lso_policy || !state->id_lso_capable)
4531 			return (B_FALSE);
4532 
4533 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4534 			return (B_FALSE);
4535 
4536 		if (state->id_hca_res_lkey_capab == 0) {
4537 			ibd_print_warn(state, "no reserved-lkey capability, "
4538 			    "disabling LSO");
4539 			return (B_FALSE);
4540 		}
4541 
4542 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4543 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4544 		break;
4545 	}
4546 
4547 	default:
4548 		return (B_FALSE);
4549 	}
4550 
4551 	return (B_TRUE);
4552 }
4553 
4554 /*
4555  * callback function for set/get of properties
4556  */
4557 static int
4558 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4559     uint_t pr_valsize, const void *pr_val)
4560 {
4561 	ibd_state_t *state = arg;
4562 	int err = 0;
4563 	uint32_t link_mode;
4564 
4565 	/* Cannot set properties on a port driver */
4566 	if (state->id_type == IBD_PORT_DRIVER) {
4567 		return (ENOTSUP);
4568 	}
4569 
4570 	switch (pr_num) {
4571 		case MAC_PROP_IB_LINKMODE:
4572 			if (state->id_mac_state & IBD_DRV_STARTED) {
4573 				err = EBUSY;
4574 				break;
4575 			}
4576 			if (pr_val == NULL) {
4577 				err = EINVAL;
4578 				break;
4579 			}
4580 			bcopy(pr_val, &link_mode, sizeof (link_mode));
4581 			if (link_mode != IBD_LINK_MODE_UD &&
4582 			    link_mode != IBD_LINK_MODE_RC) {
4583 				err = EINVAL;
4584 			} else {
4585 				if (link_mode == IBD_LINK_MODE_RC) {
4586 					if (state->id_enable_rc) {
4587 						return (0);
4588 					}
4589 					state->id_enable_rc = 1;
4590 					/* inform MAC framework of new MTU */
4591 					err = mac_maxsdu_update(state->id_mh,
4592 					    state->rc_mtu - IPOIB_HDRSIZE);
4593 				} else {
4594 					if (!state->id_enable_rc) {
4595 						return (0);
4596 					}
4597 					state->id_enable_rc = 0;
4598 					err = mac_maxsdu_update(state->id_mh,
4599 					    state->id_mtu - IPOIB_HDRSIZE);
4600 				}
4601 				(void) ibd_record_capab(state);
4602 				mac_capab_update(state->id_mh);
4603 			}
4604 			break;
4605 		case MAC_PROP_PRIVATE:
4606 			err = ibd_set_priv_prop(state, pr_name,
4607 			    pr_valsize, pr_val);
4608 			break;
4609 		default:
4610 			err = ENOTSUP;
4611 			break;
4612 	}
4613 	return (err);
4614 }
4615 
4616 static int
4617 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4618     uint_t pr_valsize, void *pr_val)
4619 {
4620 	ibd_state_t *state = arg;
4621 	int err = 0;
4622 
4623 	switch (pr_num) {
4624 		case MAC_PROP_MTU:
4625 			break;
4626 		default:
4627 			if (state->id_type == IBD_PORT_DRIVER) {
4628 				return (ENOTSUP);
4629 			}
4630 			break;
4631 	}
4632 
4633 	switch (pr_num) {
4634 		case MAC_PROP_IB_LINKMODE:
4635 			*(uint_t *)pr_val = state->id_enable_rc;
4636 			break;
4637 		case MAC_PROP_PRIVATE:
4638 			err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4639 			    pr_val);
4640 			break;
4641 		default:
4642 			err = ENOTSUP;
4643 			break;
4644 	}
4645 	return (err);
4646 }
4647 
4648 static void
4649 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4650     mac_prop_info_handle_t prh)
4651 {
4652 	ibd_state_t *state = arg;
4653 
4654 	switch (pr_num) {
4655 	case MAC_PROP_IB_LINKMODE: {
4656 		mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4657 		break;
4658 	}
4659 	case MAC_PROP_MTU: {
4660 		uint32_t min, max;
4661 		if (state->id_type == IBD_PORT_DRIVER) {
4662 			min = 1500;
4663 			max = IBD_DEF_RC_MAX_SDU;
4664 		} else if (state->id_enable_rc) {
4665 			min = max = IBD_DEF_RC_MAX_SDU;
4666 		} else {
4667 			min = max = state->id_mtu - IPOIB_HDRSIZE;
4668 		}
4669 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4670 		mac_prop_info_set_range_uint32(prh, min, max);
4671 		break;
4672 	}
4673 	case MAC_PROP_PRIVATE: {
4674 		char valstr[64];
4675 		int value;
4676 
4677 		if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4678 			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4679 			return;
4680 		} else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4681 			value = IBD_DEF_COALESCE_COMPLETIONS;
4682 		} else if (strcmp(pr_name,
4683 		    "_ibd_create_broadcast_group") == 0) {
4684 			value = IBD_DEF_CREATE_BCAST_GROUP;
4685 		} else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4686 			value = IBD_DEF_HASH_SIZE;
4687 		} else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4688 			value = IBD_DEF_LSO_POLICY;
4689 		} else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4690 			value = IBD_DEF_NUM_AH;
4691 		} else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4692 			value = IBD_DEF_NUM_LSO_BUFS;
4693 		} else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4694 			value = IBD_DEF_RC_ENABLE_SRQ;
4695 		} else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4696 			value = IBD_DEF_RC_NUM_RWQE;
4697 		} else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4698 			value = IBD_DEF_RC_NUM_SRQ;
4699 		} else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4700 			value = IBD_DEF_RC_NUM_SWQE;
4701 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4702 			value = IBD_DEF_RC_RX_COMP_COUNT;
4703 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4704 			value = IBD_DEF_RC_RX_COMP_USEC;
4705 		} else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4706 			value = IBD_DEF_RC_RX_COPY_THRESH;
4707 		} else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4708 			value = IBD_DEF_RC_RX_RWQE_THRESH;
4709 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4710 			value = IBD_DEF_RC_TX_COMP_COUNT;
4711 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4712 			value = IBD_DEF_RC_TX_COMP_USEC;
4713 		} else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4714 			value = IBD_DEF_RC_TX_COPY_THRESH;
4715 		} else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4716 			value = IBD_DEF_UD_NUM_RWQE;
4717 		} else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4718 			value = IBD_DEF_UD_NUM_SWQE;
4719 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4720 			value = IBD_DEF_UD_RX_COMP_COUNT;
4721 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4722 			value = IBD_DEF_UD_RX_COMP_USEC;
4723 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4724 			value = IBD_DEF_UD_TX_COMP_COUNT;
4725 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4726 			value = IBD_DEF_UD_TX_COMP_USEC;
4727 		} else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4728 			value = IBD_DEF_UD_TX_COPY_THRESH;
4729 		} else {
4730 			return;
4731 		}
4732 
4733 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
4734 		mac_prop_info_set_default_str(prh, valstr);
4735 		break;
4736 	}
4737 	} /* switch (pr_num) */
4738 }
4739 
4740 /* ARGSUSED2 */
4741 static int
4742 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4743     uint_t pr_valsize, const void *pr_val)
4744 {
4745 	int err = 0;
4746 	long result;
4747 
4748 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4749 		if (pr_val == NULL) {
4750 			return (EINVAL);
4751 		}
4752 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4753 		if (result < 0 || result > 1) {
4754 			err = EINVAL;
4755 		} else {
4756 			state->id_allow_coalesce_comp_tuning = (result == 1) ?
4757 			    B_TRUE: B_FALSE;
4758 		}
4759 		return (err);
4760 	}
4761 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4762 		if (state->id_mac_state & IBD_DRV_STARTED) {
4763 			return (EBUSY);
4764 		}
4765 		if (pr_val == NULL) {
4766 			return (EINVAL);
4767 		}
4768 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4769 		if (result < 0 || result > 1) {
4770 			err = EINVAL;
4771 		} else {
4772 			state->id_create_broadcast_group = (result == 1) ?
4773 			    B_TRUE: B_FALSE;
4774 		}
4775 		return (err);
4776 	}
4777 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4778 		if (state->id_mac_state & IBD_DRV_STARTED) {
4779 			return (EBUSY);
4780 		}
4781 		if (pr_val == NULL) {
4782 			return (EINVAL);
4783 		}
4784 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4785 		if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4786 			err = EINVAL;
4787 		} else {
4788 			state->id_hash_size = (uint32_t)result;
4789 		}
4790 		return (err);
4791 	}
4792 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4793 		if (state->id_mac_state & IBD_DRV_STARTED) {
4794 			return (EBUSY);
4795 		}
4796 		if (pr_val == NULL) {
4797 			return (EINVAL);
4798 		}
4799 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4800 		if (result < 0 || result > 1) {
4801 			err = EINVAL;
4802 		} else {
4803 			state->id_lso_policy = (result == 1) ?
4804 			    B_TRUE: B_FALSE;
4805 		}
4806 		mac_capab_update(state->id_mh);
4807 		return (err);
4808 	}
4809 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4810 		if (state->id_mac_state & IBD_DRV_STARTED) {
4811 			return (EBUSY);
4812 		}
4813 		if (pr_val == NULL) {
4814 			return (EINVAL);
4815 		}
4816 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4817 		if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4818 			err = EINVAL;
4819 		} else {
4820 			state->id_num_ah = (uint32_t)result;
4821 		}
4822 		return (err);
4823 	}
4824 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4825 		if (state->id_mac_state & IBD_DRV_STARTED) {
4826 			return (EBUSY);
4827 		}
4828 		if (!state->id_lso_policy || !state->id_lso_capable) {
4829 			return (EINVAL);
4830 		}
4831 		if (pr_val == NULL) {
4832 			return (EINVAL);
4833 		}
4834 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4835 		if (result < IBD_MIN_NUM_LSO_BUFS ||
4836 		    result > IBD_MAX_NUM_LSO_BUFS) {
4837 			err = EINVAL;
4838 		} else {
4839 			state->id_num_lso_bufs = (uint32_t)result;
4840 		}
4841 		return (err);
4842 	}
4843 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4844 		if (state->id_mac_state & IBD_DRV_STARTED) {
4845 			return (EBUSY);
4846 		}
4847 		if (pr_val == NULL) {
4848 			return (EINVAL);
4849 		}
4850 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4851 		if (result < 0 || result > 1) {
4852 			err = EINVAL;
4853 		} else {
4854 			state->rc_enable_srq = (result == 1) ?
4855 			    B_TRUE: B_FALSE;
4856 		}
4857 		if (!state->rc_enable_srq) {
4858 			state->id_rc_num_srq = 0;
4859 		}
4860 		return (err);
4861 	}
4862 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4863 		if (state->id_mac_state & IBD_DRV_STARTED) {
4864 			return (EBUSY);
4865 		}
4866 		if (pr_val == NULL) {
4867 			return (EINVAL);
4868 		}
4869 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4870 		if (result < IBD_MIN_RC_NUM_RWQE ||
4871 		    result > IBD_MAX_RC_NUM_RWQE) {
4872 			err = EINVAL;
4873 		} else {
4874 			state->id_rc_num_rwqe = (uint32_t)result;
4875 			if (state->id_allow_coalesce_comp_tuning &&
4876 			    state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4877 				state->id_rc_rx_comp_count =
4878 				    state->id_rc_num_rwqe;
4879 			if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4880 				state->id_rc_num_srq =
4881 				    state->id_rc_num_rwqe - 1;
4882 			/*
4883 			 * If rx_rwqe_threshold is greater than the number of
4884 			 * rwqes, pull it back to 25% of number of rwqes.
4885 			 */
4886 			if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4887 				state->id_rc_rx_rwqe_thresh =
4888 				    (state->id_rc_num_rwqe >> 2);
4889 
4890 		}
4891 		return (err);
4892 	}
4893 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4894 		if (state->id_mac_state & IBD_DRV_STARTED) {
4895 			return (EBUSY);
4896 		}
4897 		if (pr_val == NULL) {
4898 			return (EINVAL);
4899 		}
4900 		if (!state->rc_enable_srq)
4901 			return (EINVAL);
4902 
4903 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4904 		if (result < IBD_MIN_RC_NUM_SRQ ||
4905 		    result >= state->id_rc_num_rwqe) {
4906 			err = EINVAL;
4907 		} else
4908 			state->id_rc_num_srq = (uint32_t)result;
4909 		return (err);
4910 	}
4911 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4912 		if (state->id_mac_state & IBD_DRV_STARTED) {
4913 			return (EBUSY);
4914 		}
4915 		if (pr_val == NULL) {
4916 			return (EINVAL);
4917 		}
4918 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4919 		if (result < IBD_MIN_RC_NUM_SWQE ||
4920 		    result > IBD_MAX_RC_NUM_SWQE) {
4921 			err = EINVAL;
4922 		} else {
4923 			state->id_rc_num_swqe = (uint32_t)result;
4924 			if (state->id_allow_coalesce_comp_tuning &&
4925 			    state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4926 				state->id_rc_tx_comp_count =
4927 				    state->id_rc_num_swqe;
4928 		}
4929 		return (err);
4930 	}
4931 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4932 		if (!state->id_allow_coalesce_comp_tuning) {
4933 			return (ENOTSUP);
4934 		}
4935 		if (pr_val == NULL) {
4936 			return (EINVAL);
4937 		}
4938 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4939 		if (result < 1 || result > state->id_rc_num_rwqe) {
4940 			err = EINVAL;
4941 		} else {
4942 			state->id_rc_rx_comp_count = (uint32_t)result;
4943 		}
4944 		return (err);
4945 	}
4946 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4947 		if (!state->id_allow_coalesce_comp_tuning) {
4948 			return (ENOTSUP);
4949 		}
4950 		if (pr_val == NULL) {
4951 			return (EINVAL);
4952 		}
4953 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4954 		if (result < 1) {
4955 			err = EINVAL;
4956 		} else {
4957 			state->id_rc_rx_comp_usec = (uint32_t)result;
4958 		}
4959 		return (err);
4960 	}
4961 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4962 		if (state->id_mac_state & IBD_DRV_STARTED) {
4963 			return (EBUSY);
4964 		}
4965 		if (pr_val == NULL) {
4966 			return (EINVAL);
4967 		}
4968 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4969 		if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4970 		    result > state->rc_mtu) {
4971 			err = EINVAL;
4972 		} else {
4973 			state->id_rc_rx_copy_thresh = (uint32_t)result;
4974 		}
4975 		return (err);
4976 	}
4977 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4978 		if (state->id_mac_state & IBD_DRV_STARTED) {
4979 			return (EBUSY);
4980 		}
4981 		if (pr_val == NULL) {
4982 			return (EINVAL);
4983 		}
4984 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4985 		if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4986 		    result >= state->id_rc_num_rwqe) {
4987 			err = EINVAL;
4988 		} else {
4989 			state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4990 		}
4991 		return (err);
4992 	}
4993 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4994 		if (!state->id_allow_coalesce_comp_tuning) {
4995 			return (ENOTSUP);
4996 		}
4997 		if (pr_val == NULL) {
4998 			return (EINVAL);
4999 		}
5000 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5001 		if (result < 1 || result > state->id_rc_num_swqe) {
5002 			err = EINVAL;
5003 		} else {
5004 			state->id_rc_tx_comp_count = (uint32_t)result;
5005 		}
5006 		return (err);
5007 	}
5008 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5009 		if (!state->id_allow_coalesce_comp_tuning) {
5010 			return (ENOTSUP);
5011 		}
5012 		if (pr_val == NULL) {
5013 			return (EINVAL);
5014 		}
5015 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5016 		if (result < 1)
5017 			err = EINVAL;
5018 		else {
5019 			state->id_rc_tx_comp_usec = (uint32_t)result;
5020 		}
5021 		return (err);
5022 	}
5023 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5024 		if (state->id_mac_state & IBD_DRV_STARTED) {
5025 			return (EBUSY);
5026 		}
5027 		if (pr_val == NULL) {
5028 			return (EINVAL);
5029 		}
5030 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5031 		if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5032 		    result > state->rc_mtu) {
5033 			err = EINVAL;
5034 		} else {
5035 			state->id_rc_tx_copy_thresh = (uint32_t)result;
5036 		}
5037 		return (err);
5038 	}
5039 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5040 		if (state->id_mac_state & IBD_DRV_STARTED) {
5041 			return (EBUSY);
5042 		}
5043 		if (pr_val == NULL) {
5044 			return (EINVAL);
5045 		}
5046 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5047 		if (result < IBD_MIN_UD_NUM_RWQE ||
5048 		    result > IBD_MAX_UD_NUM_RWQE) {
5049 			err = EINVAL;
5050 		} else {
5051 			if (result > state->id_hca_max_chan_sz) {
5052 				state->id_ud_num_rwqe =
5053 				    state->id_hca_max_chan_sz;
5054 			} else {
5055 				state->id_ud_num_rwqe = (uint32_t)result;
5056 			}
5057 			if (state->id_allow_coalesce_comp_tuning &&
5058 			    state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5059 				state->id_ud_rx_comp_count =
5060 				    state->id_ud_num_rwqe;
5061 		}
5062 		return (err);
5063 	}
5064 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5065 		if (state->id_mac_state & IBD_DRV_STARTED) {
5066 			return (EBUSY);
5067 		}
5068 		if (pr_val == NULL) {
5069 			return (EINVAL);
5070 		}
5071 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5072 		if (result < IBD_MIN_UD_NUM_SWQE ||
5073 		    result > IBD_MAX_UD_NUM_SWQE) {
5074 			err = EINVAL;
5075 		} else {
5076 			if (result > state->id_hca_max_chan_sz) {
5077 				state->id_ud_num_swqe =
5078 				    state->id_hca_max_chan_sz;
5079 			} else {
5080 				state->id_ud_num_swqe = (uint32_t)result;
5081 			}
5082 			if (state->id_allow_coalesce_comp_tuning &&
5083 			    state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5084 				state->id_ud_tx_comp_count =
5085 				    state->id_ud_num_swqe;
5086 		}
5087 		return (err);
5088 	}
5089 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5090 		if (!state->id_allow_coalesce_comp_tuning) {
5091 			return (ENOTSUP);
5092 		}
5093 		if (pr_val == NULL) {
5094 			return (EINVAL);
5095 		}
5096 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5097 		if (result < 1 || result > state->id_ud_num_rwqe) {
5098 			err = EINVAL;
5099 		} else {
5100 			state->id_ud_rx_comp_count = (uint32_t)result;
5101 		}
5102 		return (err);
5103 	}
5104 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5105 		if (!state->id_allow_coalesce_comp_tuning) {
5106 			return (ENOTSUP);
5107 		}
5108 		if (pr_val == NULL) {
5109 			return (EINVAL);
5110 		}
5111 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5112 		if (result < 1) {
5113 			err = EINVAL;
5114 		} else {
5115 			state->id_ud_rx_comp_usec = (uint32_t)result;
5116 		}
5117 		return (err);
5118 	}
5119 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5120 		if (!state->id_allow_coalesce_comp_tuning) {
5121 			return (ENOTSUP);
5122 		}
5123 		if (pr_val == NULL) {
5124 			return (EINVAL);
5125 		}
5126 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5127 		if (result < 1 || result > state->id_ud_num_swqe) {
5128 			err = EINVAL;
5129 		} else {
5130 			state->id_ud_tx_comp_count = (uint32_t)result;
5131 		}
5132 		return (err);
5133 	}
5134 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5135 		if (!state->id_allow_coalesce_comp_tuning) {
5136 			return (ENOTSUP);
5137 		}
5138 		if (pr_val == NULL) {
5139 			return (EINVAL);
5140 		}
5141 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5142 		if (result < 1) {
5143 			err = EINVAL;
5144 		} else {
5145 			state->id_ud_tx_comp_usec = (uint32_t)result;
5146 		}
5147 		return (err);
5148 	}
5149 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5150 		if (state->id_mac_state & IBD_DRV_STARTED) {
5151 			return (EBUSY);
5152 		}
5153 		if (pr_val == NULL) {
5154 			return (EINVAL);
5155 		}
5156 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5157 		if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5158 		    result > IBD_MAX_UD_TX_COPY_THRESH) {
5159 			err = EINVAL;
5160 		} else {
5161 			state->id_ud_tx_copy_thresh = (uint32_t)result;
5162 		}
5163 		return (err);
5164 	}
5165 	return (ENOTSUP);
5166 }
5167 
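/*
 * Retrieve the current value of a driver-private property and render
 * it into the caller's buffer as a decimal string; returns ENOTSUP
 * for unrecognized property names.
 */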
5168 static int
5169 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5170     void *pr_val)
5171 {
5172 	int err = ENOTSUP;
5173 	int value;
5174 
5175 	if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5176 		value = state->id_bgroup_present;
5177 		err = 0;
5178 		goto done;
5179 	}
5180 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5181 		value = state->id_allow_coalesce_comp_tuning;
5182 		err = 0;
5183 		goto done;
5184 	}
5185 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5186 		value = state->id_create_broadcast_group;
5187 		err = 0;
5188 		goto done;
5189 	}
5190 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5191 		value = state->id_hash_size;
5192 		err = 0;
5193 		goto done;
5194 	}
5195 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5196 		value = state->id_lso_policy;
5197 		err = 0;
5198 		goto done;
5199 	}
5200 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5201 		value = state->id_num_ah;
5202 		err = 0;
5203 		goto done;
5204 	}
5205 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5206 		value = state->id_num_lso_bufs;
5207 		err = 0;
5208 		goto done;
5209 	}
5210 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5211 		value = state->rc_enable_srq;
5212 		err = 0;
5213 		goto done;
5214 	}
5215 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5216 		value = state->id_rc_num_rwqe;
5217 		err = 0;
5218 		goto done;
5219 	}
5220 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5221 		value = state->id_rc_num_srq;
5222 		err = 0;
5223 		goto done;
5224 	}
5225 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5226 		value = state->id_rc_num_swqe;
5227 		err = 0;
5228 		goto done;
5229 	}
5230 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5231 		value = state->id_rc_rx_comp_count;
5232 		err = 0;
5233 		goto done;
5234 	}
5235 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5236 		value = state->id_rc_rx_comp_usec;
5237 		err = 0;
5238 		goto done;
5239 	}
5240 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5241 		value = state->id_rc_rx_copy_thresh;
5242 		err = 0;
5243 		goto done;
5244 	}
5245 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5246 		value = state->id_rc_rx_rwqe_thresh;
5247 		err = 0;
5248 		goto done;
5249 	}
5250 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5251 		value = state->id_rc_tx_comp_count;
5252 		err = 0;
5253 		goto done;
5254 	}
5255 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5256 		value = state->id_rc_tx_comp_usec;
5257 		err = 0;
5258 		goto done;
5259 	}
5260 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5261 		value = state->id_rc_tx_copy_thresh;
5262 		err = 0;
5263 		goto done;
5264 	}
5265 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5266 		value = state->id_ud_num_rwqe;
5267 		err = 0;
5268 		goto done;
5269 	}
5270 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5271 		value = state->id_ud_num_swqe;
5272 		err = 0;
5273 		goto done;
5274 	}
5275 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5276 		value = state->id_ud_rx_comp_count;
5277 		err = 0;
5278 		goto done;
5279 	}
5280 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5281 		value = state->id_ud_rx_comp_usec;
5282 		err = 0;
5283 		goto done;
5284 	}
5285 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5286 		value = state->id_ud_tx_comp_count;
5287 		err = 0;
5288 		goto done;
5289 	}
5290 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5291 		value = state->id_ud_tx_comp_usec;
5292 		err = 0;
5293 		goto done;
5294 	}
5295 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5296 		value = state->id_ud_tx_copy_thresh;
5297 		err = 0;
5298 		goto done;
5299 	}
5300 done:
5301 	if (err == 0) {
5302 		(void) snprintf(pr_val, pr_valsize, "%d", value);
5303 	}
5304 	return (err);
5305 }
5306 
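/*
 * Query the HCA for the current state of our port. If the link is
 * active, record the pkey index, MTU, source gid and link speed;
 * otherwise mark the link down so that these are picked up later on
 * a PORT_UP/CHANGE event.
 */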
5307 static int
5308 ibd_get_port_details(ibd_state_t *state)
5309 {
5310 	ibt_hca_portinfo_t *port_infop;
5311 	ibt_status_t ret;
5312 	uint_t psize, port_infosz;
5313 
5314 	mutex_enter(&state->id_link_mutex);
5315 
5316 	/*
5317 	 * Query for port information
5318 	 */
5319 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5320 	    &port_infop, &psize, &port_infosz);
5321 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
5322 		mutex_exit(&state->id_link_mutex);
5323 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5324 		    "failed, ret=%d", ret);
5325 		return (ENETDOWN);
5326 	}
5327 
5328 	/*
5329 	 * If the link is active, verify the pkey
5330 	 */
5331 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5332 		if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5333 		    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5334 			state->id_link_state = LINK_STATE_DOWN;
5335 		} else {
5336 			state->id_link_state = LINK_STATE_UP;
5337 		}
5338 		state->id_mtu = (128 << port_infop->p_mtu);
5339 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5340 		state->id_sgid = *port_infop->p_sgid_tbl;
5341 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5342 		/*
5343 		 * Now that the port is active, record the port speed
5344 		 */
5345 		state->id_link_speed = ibd_get_portspeed(state);
5346 	} else {
5347 		/* Make sure that these are handled in PORT_UP/CHANGE */
5348 		state->id_mtu = 0;
5349 		state->id_link_state = LINK_STATE_DOWN;
5350 		state->id_link_speed = 0;
5351 	}
5352 	mutex_exit(&state->id_link_mutex);
5353 	ibt_free_portinfo(port_infop, port_infosz);
5354 
5355 	return (0);
5356 }
5357 
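/*
 * Allocate the receive and send completion queues for the UD channel,
 * clamping their sizes (and the rwqe/swqe counts) to the HCA's maximum
 * CQ size, set the interrupt moderation parameters via ibt_modify_cq(),
 * and allocate the work-completion arrays used when polling them.
 */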
5358 static int
5359 ibd_alloc_cqs(ibd_state_t *state)
5360 {
5361 	ibt_hca_attr_t hca_attrs;
5362 	ibt_cq_attr_t cq_attr;
5363 	ibt_status_t ret;
5364 	uint32_t real_size;
5365 	uint_t num_rwqe_change = 0;
5366 	uint_t num_swqe_change = 0;
5367 
5368 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5369 	ASSERT(ret == IBT_SUCCESS);
5370 
5371 	/*
5372 	 * Allocate Rx/combined CQ:
5373 	 * Theoretically, there is no point in having more than #rwqe
5374 	 * plus #swqe cqe's, except that the CQ will be signaled for
5375 	 * overflow when the last wqe completes, if none of the previous
5376 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
5377 	 * to make sure such overflow does not occur.
5378 	 */
5379 	cq_attr.cq_sched = NULL;
5380 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5381 
5382 	/*
5383 	 * Allocate Receive CQ.
5384 	 */
5385 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5386 		cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5387 	} else {
5388 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5389 		num_rwqe_change = state->id_ud_num_rwqe;
5390 		state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5391 	}
5392 
5393 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5394 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5395 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5396 		    "failed, ret=%d\n", ret);
5397 		return (DDI_FAILURE);
5398 	}
5399 
5400 	if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5401 	    state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5402 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5403 		    "moderation failed, ret=%d\n", ret);
5404 	}
5405 
5406 	/* make the #rx wc's the same as max rx chain size */
5407 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5408 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5409 	    state->id_rxwcs_size, KM_SLEEP);
5410 
5411 	/*
5412 	 * Allocate Send CQ.
5413 	 */
5414 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5415 		cq_attr.cq_size = state->id_ud_num_swqe + 1;
5416 	} else {
5417 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5418 		num_swqe_change = state->id_ud_num_swqe;
5419 		state->id_ud_num_swqe = cq_attr.cq_size - 1;
5420 	}
5421 
5422 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5423 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5424 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5425 		    "failed, ret=%d\n", ret);
5426 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5427 		    state->id_rxwcs_size);
5428 		(void) ibt_free_cq(state->id_rcq_hdl);
5429 		return (DDI_FAILURE);
5430 	}
5431 	if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5432 	    state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5433 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5434 		    "moderation failed, ret=%d\n", ret);
5435 	}
5436 
5437 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
5438 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5439 	    state->id_txwcs_size, KM_SLEEP);
5440 
5441 	/*
5442 	 * Print a message in case we could not allocate as many wqes
5443 	 * as were requested.
5444 	 */
5445 	if (num_rwqe_change) {
5446 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5447 		    "%d", state->id_ud_num_rwqe, num_rwqe_change);
5448 	}
5449 	if (num_swqe_change) {
5450 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
5451 		    "%d", state->id_ud_num_swqe, num_swqe_change);
5452 	}
5453 
5454 	return (DDI_SUCCESS);
5455 }
5456 
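/*
 * Allocate the UD channel used for all unreliable datagram traffic,
 * binding it to the send/receive CQs, PD and pkey index set up
 * earlier, and record the assigned QP number.
 */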
5457 static int
5458 ibd_setup_ud_channel(ibd_state_t *state)
5459 {
5460 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
5461 	ibt_ud_chan_query_attr_t ud_chan_attr;
5462 	ibt_status_t ret;
5463 
5464 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5465 	if (state->id_hca_res_lkey_capab)
5466 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5467 	if (state->id_lso_policy && state->id_lso_capable)
5468 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5469 
5470 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
5471 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5472 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5473 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5474 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5475 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
5476 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
5477 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
5478 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
5479 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
5480 	ud_alloc_attr.ud_clone_chan	= NULL;
5481 
5482 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5483 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5484 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5485 		    "failed, ret=%d\n", ret);
5486 		return (DDI_FAILURE);
5487 	}
5488 
5489 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5490 	    &ud_chan_attr)) != IBT_SUCCESS) {
5491 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5492 		    "failed, ret=%d\n", ret);
5493 		(void) ibt_free_channel(state->id_chnl_hdl);
5494 		return (DDI_FAILURE);
5495 	}
5496 
5497 	state->id_qpnum = ud_chan_attr.ud_qpn;
5498 
5499 	return (DDI_SUCCESS);
5500 }
5501 
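/*
 * Undo what ibd_start() has done so far, using the IBD_DRV_* bits
 * recorded in id_mac_state to determine which resources actually
 * need to be torn down.
 */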
5502 static int
5503 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5504 {
5505 	uint32_t progress = state->id_mac_state;
5506 	uint_t attempts;
5507 	ibt_status_t ret;
5508 	ib_gid_t mgid;
5509 	ibd_mce_t *mce;
5510 	uint8_t jstate;
5511 	timeout_id_t tid;
5512 
5513 	if (atomic_dec_32_nv(&state->id_running) != 0)
5514 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5515 
5516 	/*
5517 	 * Before we try to stop/undo whatever we did in ibd_start(),
5518 	 * we need to mark the link state appropriately to prevent the
5519 	 * ip layer from using this instance for any new transfers. Note
5520 	 * that if the original state of the link was "up" when we're
5521 	 * here, we'll set the final link state to "unknown", to behave
5522 	 * in the same fashion as other ethernet drivers.
5523 	 */
5524 	mutex_enter(&state->id_link_mutex);
5525 	if (cur_link_state == LINK_STATE_DOWN) {
5526 		state->id_link_state = cur_link_state;
5527 	} else {
5528 		state->id_link_state = LINK_STATE_UNKNOWN;
5529 	}
5530 	mutex_exit(&state->id_link_mutex);
5531 	bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5532 	mac_link_update(state->id_mh, state->id_link_state);
5533 
5534 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5535 	if (progress & IBD_DRV_STARTED) {
5536 		state->id_mac_state &= (~IBD_DRV_STARTED);
5537 	}
5538 
5539 	if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5540 		state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5541 	}
5542 
5543 	/* Stop listen under Reliable Connected Mode */
5544 	if (progress & IBD_DRV_RC_LISTEN) {
5545 		ASSERT(state->id_enable_rc);
5546 		if (state->rc_listen_hdl != NULL) {
5547 			ibd_rc_stop_listen(state);
5548 		}
5549 		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5550 	}
5551 
5552 	/* Stop timeout routine */
5553 	if (progress & IBD_DRV_RC_TIMEOUT) {
5554 		ASSERT(state->id_enable_rc);
5555 		mutex_enter(&state->rc_timeout_lock);
5556 		state->rc_timeout_start = B_FALSE;
5557 		tid = state->rc_timeout;
5558 		state->rc_timeout = 0;
5559 		mutex_exit(&state->rc_timeout_lock);
5560 		if (tid != 0)
5561 			(void) untimeout(tid);
5562 		state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5563 	}
5564 
5565 	if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5566 		attempts = 100;
5567 		while (state->id_ah_op == IBD_OP_ONGOING) {
5568 			/*
5569 			 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5570 			 * port is connecting to a remote IPoIB port. Wait for
5571 			 * the end of this connecting operation.
5572 			 */
5573 			delay(drv_usectohz(100000));
5574 			if (--attempts == 0) {
5575 				state->rc_stop_connect++;
5576 				DPRINT(40, "ibd_undo_start: connecting");
5577 				break;
5578 			}
5579 		}
5580 		mutex_enter(&state->id_sched_lock);
5581 		state->id_sched_needed = 0;
5582 		mutex_exit(&state->id_sched_lock);
5583 		(void) ibd_rc_close_all_chan(state);
5584 	}
5585 
5586 	/*
5587 	 * First, stop receive interrupts; this stops the driver from
5588 	 * handing up buffers to higher layers.  Wait for receive buffers
5589 	 * to be returned and give up after 1 second.
5590 	 */
5591 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5592 		attempts = 10;
5593 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5594 		    0) > 0) {
5595 			delay(drv_usectohz(100000));
5596 			if (--attempts == 0) {
5597 				/*
5598 				 * There are pending bufs with the network
5599 				 * layer and we have no choice but to wait
5600 				 * until it is done with them. Reap all the
5601 				 * Tx/Rx completions that were posted since
5602 				 * we turned off the notification and
5603 				 * return failure.
5604 				 */
5605 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5606 				DPRINT(2, "ibd_undo_start: "
5607 				    "reclaiming failed");
5608 				break;
5609 			}
5610 		}
5611 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5612 	}
5613 
5614 	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5615 		ibd_rc_fini_tx_largebuf_list(state);
5616 		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5617 	}
5618 
5619 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5620 		ASSERT(state->id_enable_rc);
5621 		if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5622 			if (state->id_ah_op == IBD_OP_ONGOING) {
5623 				delay(drv_usectohz(10000));
5624 				if (state->id_ah_op == IBD_OP_ONGOING) {
5625 					/*
5626 					 * "state->id_ah_op == IBD_OP_ONGOING"
5627 					 * means this IPoIB port is connecting
5628 					 * to a remote IPoIB port. We can't
5629 					 * delete SRQ here.
5630 					 */
5631 					state->rc_stop_connect++;
5632 					DPRINT(40, "ibd_undo_start: "
5633 					    "connecting");
5634 				} else {
5635 					ibd_rc_fini_srq_list(state);
5636 					state->id_mac_state &=
5637 					    (~IBD_DRV_RC_SRQ_ALLOCD);
5638 				}
5639 			} else {
5640 				ibd_rc_fini_srq_list(state);
5641 				state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5642 			}
5643 		} else {
5644 			DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5645 		}
5646 	}
5647 
5648 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5649 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5650 
5651 		mutex_enter(&state->id_trap_lock);
5652 		state->id_trap_stop = B_TRUE;
5653 		while (state->id_trap_inprog > 0)
5654 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5655 		mutex_exit(&state->id_trap_lock);
5656 
5657 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5658 	}
5659 
5660 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5661 		/*
5662 		 * Flushing the channel ensures that all pending WQE's
5663 		 * are marked with flush_error and handed to the CQ. It
5664 		 * does not guarantee the invocation of the CQ handler.
5665 		 * This call is guaranteed to return successfully for
5666 		 * UD QPNs.
5667 		 */
5668 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5669 		    IBT_SUCCESS) {
5670 			DPRINT(10, "ibd_undo_start: flush_channel "
5671 			    "failed, ret=%d", ret);
5672 		}
5673 
5674 		/*
5675 		 * Give some time for the TX CQ handler to process the
5676 		 * completions.
5677 		 */
5678 		attempts = 10;
5679 		mutex_enter(&state->id_tx_list.dl_mutex);
5680 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5681 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5682 		    != state->id_ud_num_swqe) {
5683 			if (--attempts == 0)
5684 				break;
5685 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
5686 			mutex_exit(&state->id_tx_list.dl_mutex);
5687 			delay(drv_usectohz(100000));
5688 			mutex_enter(&state->id_tx_list.dl_mutex);
5689 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
5690 		}
5691 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5692 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5693 		    state->id_ud_num_swqe) {
5694 			cmn_err(CE_WARN, "tx resources not freed\n");
5695 		}
5696 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5697 		mutex_exit(&state->id_tx_list.dl_mutex);
5698 
5699 		attempts = 10;
5700 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5701 			if (--attempts == 0)
5702 				break;
5703 			delay(drv_usectohz(100000));
5704 		}
5705 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5706 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5707 			cmn_err(CE_WARN, "rx resources not freed\n");
5708 		}
5709 
5710 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5711 	}
5712 
5713 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5714 		/*
5715 		 * Drop all residual full/non membership. This includes full
5716 		 * membership to the broadcast group, and any nonmembership
5717 		 * acquired during transmits. We do this after the Tx completion
5718 		 * handlers are done, since those might result in some late
5719 		 * leaves; this also eliminates a potential race with that
5720 		 * path wrt the mc full list insert/delete. Trap handling
5721 		 * has also been suppressed at this point. Thus, no locks
5722 		 * are required while traversing the mc full list.
5723 		 */
5724 		DPRINT(2, "ibd_undo_start: clear full cache entries");
5725 		mce = list_head(&state->id_mc_full);
5726 		while (mce != NULL) {
5727 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
5728 			jstate = mce->mc_jstate;
5729 			mce = list_next(&state->id_mc_full, mce);
5730 			ibd_leave_group(state, mgid, jstate);
5731 		}
5732 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5733 	}
5734 
5735 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
5736 		ibd_fini_rxlist(state);
5737 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5738 	}
5739 
5740 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
5741 		ibd_fini_txlist(state);
5742 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5743 	}
5744 
5745 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5746 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5747 		    IBT_SUCCESS) {
5748 			DPRINT(10, "ibd_undo_start: free_channel "
5749 			    "failed, ret=%d", ret);
5750 		}
5751 
5752 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5753 	}
5754 
5755 	if (progress & IBD_DRV_CQS_ALLOCD) {
5756 		kmem_free(state->id_txwcs,
5757 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
5758 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5759 		    IBT_SUCCESS) {
5760 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
5761 			    "failed, ret=%d", ret);
5762 		}
5763 
5764 		kmem_free(state->id_rxwcs,
5765 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
5766 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5767 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5768 			    "ret=%d", ret);
5769 		}
5770 
5771 		state->id_txwcs = NULL;
5772 		state->id_rxwcs = NULL;
5773 		state->id_scq_hdl = NULL;
5774 		state->id_rcq_hdl = NULL;
5775 
5776 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5777 	}
5778 
5779 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5780 		mutex_enter(&state->id_ac_mutex);
5781 		mod_hash_destroy_hash(state->id_ah_active_hash);
5782 		mutex_exit(&state->id_ac_mutex);
5783 		ibd_acache_fini(state);
5784 
5785 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5786 	}
5787 
5788 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5789 		/*
5790 		 * If we'd created the ipoib broadcast group and had
5791 		 * successfully joined it, leave it now
5792 		 */
5793 		if (state->id_bgroup_created) {
5794 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5795 			jstate = IB_MC_JSTATE_FULL;
5796 			(void) ibt_leave_mcg(state->id_sgid, mgid,
5797 			    state->id_sgid, jstate);
5798 		}
5799 		ibt_free_mcg_info(state->id_mcinfo, 1);
5800 
5801 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5802 	}
5803 
5804 	return (DDI_SUCCESS);
5805 }
5806 
5807 /*
5808  * This pair of routines is used to set/clear the condition that
5809  * the caller is likely to do something to change the id_mac_state.
5810  * If there's already someone doing either a start or a stop (possibly
5811  * due to the async handler detecting a pkey relocation event, a plumb
5812  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5813  * that's done.
5814  */
5815 static void
5816 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5817 {
5818 	mutex_enter(&state->id_macst_lock);
5819 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5820 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5821 
5822 	state->id_mac_state |= flag;
5823 	mutex_exit(&state->id_macst_lock);
5824 }
5825 
5826 static void
5827 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5828 {
5829 	mutex_enter(&state->id_macst_lock);
5830 	state->id_mac_state &= (~flag);
5831 	cv_signal(&state->id_macst_cv);
5832 	mutex_exit(&state->id_macst_lock);
5833 }
5834 
5835 /*
5836  * GLDv3 entry point to start hardware.
5837  */
5838 /*ARGSUSED*/
5839 static int
5840 ibd_m_start(void *arg)
5841 {
5842 	ibd_state_t *state = arg;
5843 	int	ret;
5844 
5845 	if (state->id_type == IBD_PORT_DRIVER)
5846 		return (EINVAL);
5847 
5848 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5849 	if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5850 		ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5851 		return (EIO);
5852 	}
5853 
5854 	ret = ibd_start(state);
5855 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5856 	return (ret);
5857 }
5858 
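/*
 * Bring the interface up: obtain port details, locate and join the
 * IPoIB broadcast group, and set up the address cache, CQs, UD
 * channel, buffer lists and (when enabled) the RC resources. If the
 * port or pkey is not yet usable, fall back to late HCA
 * initialization and let the subnet notices handler start us later.
 */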
5859 static int
5860 ibd_start(ibd_state_t *state)
5861 {
5862 	int err;
5863 	ibt_status_t ret;
5864 	int late_hca_init = 0;
5865 
5866 	if (state->id_mac_state & IBD_DRV_STARTED)
5867 		return (DDI_SUCCESS);
5868 
5869 	/*
5870 	 * We do not increment the running flag when calling ibd_start() as
5871 	 * a result of some event which moves the state away from late HCA
5872 	 * initialization, viz. MCG_CREATED, PORT_CHANGE or link availability.
5873 	 */
5874 	if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5875 	    (atomic_inc_32_nv(&state->id_running) != 1)) {
5876 		DPRINT(10, "ibd_start: id_running is non-zero");
5877 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5878 		atomic_dec_32(&state->id_running);
5879 		return (EINVAL);
5880 	}
5881 
5882 	/*
5883 	 * Get port details; if we fail here, something bad happened.
5884 	 * Fail plumb.
5885 	 */
5886 	if ((err = ibd_get_port_details(state)) != 0) {
5887 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5888 		goto start_fail;
5889 	}
5890 	/*
5891 	 * If state->id_link_state is DOWN, it indicates that either the port
5892 	 * is down, or the pkey is not available. In both cases, resort to late
5893 	 * initialization. Register for subnet notices, and return success.
5894 	 */
5895 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5896 	if (state->id_link_state == LINK_STATE_DOWN) {
5897 		late_hca_init = 1;
5898 		goto late_hca_init_return;
5899 	}
5900 
5901 	/*
5902 	 * Find the IPoIB broadcast group
5903 	 */
5904 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5905 		/* Resort to late initialization */
5906 		late_hca_init = 1;
5907 		goto reg_snet_notices;
5908 	}
5909 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5910 
5911 	/*
5912 	 * Initialize per-interface caches and lists; if we fail here,
5913 	 * it is most likely due to a lack of resources
5914 	 */
5915 	if (ibd_acache_init(state) != DDI_SUCCESS) {
5916 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
5917 		err = ENOMEM;
5918 		goto start_fail;
5919 	}
5920 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5921 
5922 	/*
5923 	 * Allocate send and receive completion queues
5924 	 */
5925 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5926 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5927 		err = ENOMEM;
5928 		goto start_fail;
5929 	}
5930 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5931 
5932 	/*
5933 	 * Setup a UD channel
5934 	 */
5935 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5936 		err = ENOMEM;
5937 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5938 		goto start_fail;
5939 	}
5940 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5941 
5942 	/*
5943 	 * Allocate and initialize the tx buffer list
5944 	 */
5945 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
5946 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5947 		err = ENOMEM;
5948 		goto start_fail;
5949 	}
5950 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5951 
5952 	/*
5953 	 * Create the send cq handler here
5954 	 */
5955 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5956 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5957 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5958 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5959 		    "failed, ret=%d", ret);
5960 		err = EINVAL;
5961 		goto start_fail;
5962 	}
5963 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5964 
5965 	/*
5966 	 * Allocate and initialize the rx buffer list
5967 	 */
5968 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5969 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5970 		err = ENOMEM;
5971 		goto start_fail;
5972 	}
5973 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5974 
5975 	/*
5976 	 * Join IPoIB broadcast group
5977 	 */
5978 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5979 		DPRINT(10, "ibd_start: ibd_join_group() failed");
5980 		err = ENOTACTIVE;
5981 		goto start_fail;
5982 	}
5983 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5984 
5985 	/*
5986 	 * When we did mac_register() in ibd_attach(), we didn't register
5987 	 * the real macaddr and we didn't have the true port mtu. Now that
5988 	 * we're almost ready, set the local mac address and broadcast
5989 	 * addresses and update gldv3 about the real values of these
5990 	 * parameters.
5991 	 */
5992 	if (state->id_enable_rc) {
5993 		ibd_h2n_mac(&state->id_macaddr,
5994 		    IBD_MAC_ADDR_RC + state->id_qpnum,
5995 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5996 		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
5997 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5998 	} else {
5999 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6000 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6001 	}
6002 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6003 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6004 
6005 	if (!state->id_enable_rc) {
6006 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
6007 		    - IPOIB_HDRSIZE);
6008 	}
6009 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6010 
6011 	/*
6012 	 * Setup the receive cq handler
6013 	 */
6014 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6015 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6016 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6017 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6018 		    "failed, ret=%d", ret);
6019 		err = EINVAL;
6020 		goto start_fail;
6021 	}
6022 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6023 
6024 reg_snet_notices:
6025 	/*
6026 	 * In the normal initialization sequence, we set up the subnet
6027 	 * notices handler after we've initialized the acache/
6028 	 * mcache and started the async thread, both of which are required for
6029 	 * the trap handler to function properly.
6030 	 *
6031 	 * Now that the async thread has been started (and we've already done
6032 	 * a mac_register() during attach so mac_tx_update() can be called
6033 	 * if necessary without any problem), we can enable the trap handler
6034 	 * to queue requests to the async thread.
6035 	 *
6036 	 * In the late hca initialization case, the subnet notices handler
6037 	 * only handles the MCG created/deleted events, whose handling simply
6038 	 * starts the interface. So the acache/mcache initialization is not a
6039 	 * prerequisite for registering the subnet notices handler in that
6040 	 * case. Also, if we are in ibd_start() as a result of, say, some
6041 	 * event handling after entering the late hca initialization phase,
6042 	 * there is no need to register again.
6043 	 */
6044 	if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6045 		ibt_register_subnet_notices(state->id_ibt_hdl,
6046 		    ibd_snet_notices_handler, state);
6047 		mutex_enter(&state->id_trap_lock);
6048 		state->id_trap_stop = B_FALSE;
6049 		mutex_exit(&state->id_trap_lock);
6050 		state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6051 	}
6052 
6053 late_hca_init_return:
6054 	if (late_hca_init == 1) {
6055 		state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6056 		/*
6057 		 * In the late initialization case, mark the link state as down
6058 		 * regardless of the actual link state reported in the
6059 		 * port_info.
6060 		 */
6061 		state->id_link_state = LINK_STATE_DOWN;
6062 		mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6063 		mac_link_update(state->id_mh, state->id_link_state);
6064 		return (DDI_SUCCESS);
6065 	}
6066 
6067 	if (state->id_enable_rc) {
6068 		if (state->rc_enable_srq) {
6069 			if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6070 				if (ibd_rc_repost_srq_free_list(state) !=
6071 				    IBT_SUCCESS) {
6072 					err = ENOMEM;
6073 					goto start_fail;
6074 				}
6075 			} else {
6076 				/* Allocate SRQ resource */
6077 				if (ibd_rc_init_srq_list(state) !=
6078 				    IBT_SUCCESS) {
6079 					err = ENOMEM;
6080 					goto start_fail;
6081 				}
6082 				state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6083 			}
6084 		}
6085 
6086 		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6087 			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6088 			    "failed");
6089 			err = ENOMEM;
6090 			goto start_fail;
6091 		}
6092 		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6093 
6094 		/* RC: begin to listen only after everything is available */
6095 		if (ibd_rc_listen(state) != IBT_SUCCESS) {
6096 			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6097 			err = EINVAL;
6098 			goto start_fail;
6099 		}
6100 		state->id_mac_state |= IBD_DRV_RC_LISTEN;
6101 	}
6102 
6103 	/*
6104 	 * Indicate link status to GLDv3 and higher layers. By default,
6105 	 * we assume we are in up state (which must have been true at
6106 	 * least at the time the broadcast mcg's were probed); if there
6107 	 * were any up/down transitions till the time we come here, the
6108 	 * async handler will have updated last known state, which we
6109 	 * use to tell GLDv3. The async handler will not send any
6110 	 * notifications to GLDv3 till we reach here in the initialization
6111 	 * sequence.
6112 	 */
6113 	mac_link_update(state->id_mh, state->id_link_state);
6114 	state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6115 	state->id_mac_state |= IBD_DRV_STARTED;
6116 
6117 	/* Start timer after everything is ready */
6118 	if (state->id_enable_rc) {
6119 		mutex_enter(&state->rc_timeout_lock);
6120 		state->rc_timeout_start = B_TRUE;
6121 		state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
6122 		    SEC_TO_TICK(ibd_rc_conn_timeout));
6123 		mutex_exit(&state->rc_timeout_lock);
6124 		state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
6125 	}
6126 
6127 	return (DDI_SUCCESS);
6128 
6129 start_fail:
6130 	/*
6131 	 * If we ran into a problem during ibd_start() and then hit
6132 	 * some other problem while undoing our partial work, there is
6133 	 * nothing we can do about it.  Ignore any errors we might get
6134 	 * from ibd_undo_start() and just return the original error.
6135 	 */
6136 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
6137 	return (err);
6138 }
6139 
6140 /*
6141  * GLDv3 entry point to stop hardware from receiving packets.
6142  */
6143 /*ARGSUSED*/
6144 static void
6145 ibd_m_stop(void *arg)
6146 {
6147 	ibd_state_t *state = (ibd_state_t *)arg;
6148 
6149 	if (state->id_type == IBD_PORT_DRIVER)
6150 		return;
6151 
6152 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6153 
6154 	(void) ibd_undo_start(state, state->id_link_state);
6155 
6156 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6157 }
6158 
6159 /*
6160  * GLDv3 entry point to modify device's mac address. We do not
6161  * allow address modifications.
6162  */
6163 static int
6164 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6165 {
6166 	ibd_state_t *state = arg;
6167 
6168 	if (state->id_type == IBD_PORT_DRIVER)
6169 		return (EINVAL);
6170 
6171 	/*
6172 	 * Don't bother even comparing the macaddr if we haven't
6173 	 * completed ibd_m_start().
6174 	 */
6175 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6176 		return (0);
6177 
6178 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6179 		return (0);
6180 	else
6181 		return (EINVAL);
6182 }
6183 
6184 /*
6185  * The blocking part of the IBA join/leave operations is done out
6186  * of here on the async thread.
6187  */
6188 static void
6189 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6190 {
6191 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6192 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6193 
6194 	if (op == IBD_ASYNC_JOIN) {
6195 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6196 			ibd_print_warn(state, "Join multicast group failed :"
6197 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6198 		}
6199 	} else {
6200 		/*
6201 		 * Here, we must search for the proper mcg_info and
6202 		 * use that to leave the group.
6203 		 */
6204 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6205 	}
6206 }
6207 
6208 /*
6209  * GLDv3 entry point for multicast enable/disable requests.
6210  * This function queues the operation to the async thread and
6211  * returns success for a valid multicast address.
6212  */
6213 static int
6214 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6215 {
6216 	ibd_state_t *state = (ibd_state_t *)arg;
6217 	ipoib_mac_t maddr, *mcast;
6218 	ib_gid_t mgid;
6219 	ibd_req_t *req;
6220 
6221 	if (state->id_type == IBD_PORT_DRIVER)
6222 		return (EINVAL);
6223 
6224 	/*
6225 	 * If we haven't completed ibd_m_start(), the async thread won't
6226 	 * have been started and id_bcaddr won't be set, so there's
6227 	 * no point in continuing.
6228 	 */
6229 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6230 		return (0);
6231 
6232 	/*
6233 	 * The incoming multicast address might not be aligned properly
6234 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6235 	 * it to look like one though, to get the offsets of the mc gid,
6236 	 * since we know we are not going to dereference any values with
6237 	 * the ipoib_mac_t pointer.
6238 	 */
6239 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6240 	mcast = &maddr;
6241 
6242 	/*
6243 	 * Check validity of MCG address. We could additionally check
6244 	 * that an enable/disable is not being issued on the "broadcast"
6245 	 * mcg, but since this operation is only invokable by privileged
6246 	 * programs anyway, we allow the flexibility to those dlpi apps.
6247 	 * Note that we do not validate the "scope" of the IBA mcg.
6248 	 */
6249 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6250 		return (EINVAL);
6251 
6252 	/*
6253 	 * fill in multicast pkey and scope
6254 	 */
6255 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6256 
6257 	/*
6258 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6259 	 * nothing (i.e. we stay JOINed to the broadcast group join done in
6260 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6261 	 * requires us to be joined to broadcast groups at all times.
6262 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6263 	 * depends on this.
6264 	 */
6265 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6266 		return (0);
6267 
6268 	ibd_n2h_gid(mcast, &mgid);
6269 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6270 	if (req == NULL)
6271 		return (ENOMEM);
6272 
6273 	req->rq_gid = mgid;
6274 
6275 	if (add) {
6276 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6277 		    mgid.gid_prefix, mgid.gid_guid);
6278 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6279 	} else {
6280 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
6281 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6282 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6283 	}
6284 	return (0);
6285 }
6286 
6287 /*
6288  * The blocking part of the IBA promiscuous operations are done
6289  * out of here on the async thread. The dlpireq parameter indicates
6290  * whether this invocation is due to a dlpi request or due to
6291  * a port up/down event.
6292  */
6293 static void
6294 ibd_async_unsetprom(ibd_state_t *state)
6295 {
6296 	ibd_mce_t *mce = list_head(&state->id_mc_non);
6297 	ib_gid_t mgid;
6298 
6299 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6300 
6301 	while (mce != NULL) {
6302 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
6303 		mce = list_next(&state->id_mc_non, mce);
6304 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6305 	}
6306 	state->id_prom_op = IBD_OP_NOTSTARTED;
6307 }
6308 
6309 /*
6310  * The blocking part of the IBA promiscuous operations is done
6311  * out of here on the async thread. The dlpireq parameter indicates
6312  * whether this invocation is due to a dlpi request or due to
6313  * a port up/down event.
6314  */
6315 static void
6316 ibd_async_setprom(ibd_state_t *state)
6317 {
6318 	ibt_mcg_attr_t mcg_attr;
6319 	ibt_mcg_info_t *mcg_info;
6320 	ib_gid_t mgid;
6321 	uint_t numg;
6322 	int i;
6323 	char ret = IBD_OP_COMPLETED;
6324 
6325 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
6326 
6327 	/*
6328 	 * Obtain all active MC groups on the IB fabric with
6329 	 * specified criteria (scope + Pkey + Qkey + mtu).
6330 	 */
6331 	bzero(&mcg_attr, sizeof (mcg_attr));
6332 	mcg_attr.mc_pkey = state->id_pkey;
6333 	mcg_attr.mc_scope = state->id_scope;
6334 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6335 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6336 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6337 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6338 	    IBT_SUCCESS) {
6339 		ibd_print_warn(state, "Could not get list of IBA multicast "
6340 		    "groups");
6341 		ret = IBD_OP_ERRORED;
6342 		goto done;
6343 	}
6344 
6345 	/*
6346 	 * Iterate over the returned mcg's and join as NonMember
6347 	 * to the IP mcg's.
6348 	 */
6349 	for (i = 0; i < numg; i++) {
6350 		/*
6351 		 * Do a NonMember JOIN on the MC group.
6352 		 */
6353 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
6354 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6355 			ibd_print_warn(state, "IBA promiscuous mode missed "
6356 			    "multicast gid %016llx:%016llx",
6357 			    (u_longlong_t)mgid.gid_prefix,
6358 			    (u_longlong_t)mgid.gid_guid);
6359 	}
6360 
6361 	ibt_free_mcg_info(mcg_info, numg);
6362 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6363 done:
6364 	state->id_prom_op = ret;
6365 }
6366 
6367 /*
6368  * GLDv3 entry point for multicast promiscuous enable/disable requests.
6369  * GLDv3 assumes phys state receives more packets than multi state,
6370  * which is not true for IPoIB. Thus, treat the multi and phys
6371  * promiscuous states the same way to work with GLDv3's assumption.
6372  */
6373 static int
6374 ibd_m_promisc(void *arg, boolean_t on)
6375 {
6376 	ibd_state_t *state = (ibd_state_t *)arg;
6377 	ibd_req_t *req;
6378 
6379 	if (state->id_type == IBD_PORT_DRIVER)
6380 		return (EINVAL);
6381 
6382 	/*
6383 	 * The async thread won't have been started if we haven't
6384 	 * completed ibd_m_start()
6385 	 */
6386 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6387 		return (0);
6388 
6389 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6390 	if (req == NULL)
6391 		return (ENOMEM);
6392 	if (on) {
6393 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6394 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6395 	} else {
6396 		DPRINT(1, "ibd_m_promisc : unset_promisc");
6397 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6398 	}
6399 
6400 	return (0);
6401 }
6402 
6403 /*
6404  * GLDv3 entry point for gathering statistics.
6405  */
6406 static int
6407 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6408 {
6409 	ibd_state_t *state = (ibd_state_t *)arg;
6410 
6411 	switch (stat) {
6412 	case MAC_STAT_IFSPEED:
6413 		*val = state->id_link_speed;
6414 		break;
6415 	case MAC_STAT_MULTIRCV:
6416 		*val = state->id_multi_rcv;
6417 		break;
6418 	case MAC_STAT_BRDCSTRCV:
6419 		*val = state->id_brd_rcv;
6420 		break;
6421 	case MAC_STAT_MULTIXMT:
6422 		*val = state->id_multi_xmt;
6423 		break;
6424 	case MAC_STAT_BRDCSTXMT:
6425 		*val = state->id_brd_xmt;
6426 		break;
6427 	case MAC_STAT_RBYTES:
6428 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6429 		    + state->rc_rcv_copy_byte;
6430 		break;
6431 	case MAC_STAT_IPACKETS:
6432 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6433 		    + state->rc_rcv_copy_pkt;
6434 		break;
6435 	case MAC_STAT_OBYTES:
6436 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
6437 		break;
6438 	case MAC_STAT_OPACKETS:
6439 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6440 		    state->rc_xmt_fragmented_pkt +
6441 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6442 		break;
6443 	case MAC_STAT_OERRORS:
6444 		*val = state->id_ah_error;	/* failed AH translation */
6445 		break;
6446 	case MAC_STAT_IERRORS:
6447 		*val = 0;
6448 		break;
6449 	case MAC_STAT_NOXMTBUF:
6450 		*val = state->id_tx_short + state->rc_swqe_short +
6451 		    state->rc_xmt_buf_short;
6452 		break;
6453 	case MAC_STAT_NORCVBUF:
6454 	default:
6455 		return (ENOTSUP);
6456 	}
6457 
6458 	return (0);
6459 }
6460 
6461 static void
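/*
 * Handle a queued tx reschedule request by attempting to resume
 * transmission.
 */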
6462 ibd_async_txsched(ibd_state_t *state)
6463 {
6464 	ibd_resume_transmission(state);
6465 }
6466 
6467 static void
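/*
 * If the resource we previously ran short of (free swqes or LSO
 * buffers) is back above its threshold, clear the corresponding
 * flag in id_sched_needed and call mac_tx_update() so that GLDv3
 * retries the blocked transmits.
 */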
6468 ibd_resume_transmission(ibd_state_t *state)
6469 {
6470 	int flag;
6471 	int met_thresh = 0;
6472 	int thresh = 0;
6473 	int ret = -1;
6474 
6475 	mutex_enter(&state->id_sched_lock);
6476 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
6477 		mutex_enter(&state->id_tx_list.dl_mutex);
6478 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
6479 		met_thresh = state->id_tx_list.dl_cnt +
6480 		    state->id_tx_rel_list.dl_cnt;
6481 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6482 		mutex_exit(&state->id_tx_list.dl_mutex);
6483 		thresh = IBD_FREE_SWQES_THRESH;
6484 		flag = IBD_RSRC_SWQE;
6485 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6486 		ASSERT(state->id_lso != NULL);
6487 		mutex_enter(&state->id_lso_lock);
6488 		met_thresh = state->id_lso->bkt_nfree;
6489 		thresh = IBD_FREE_LSOS_THRESH;
6490 		mutex_exit(&state->id_lso_lock);
6491 		flag = IBD_RSRC_LSOBUF;
6492 		if (met_thresh > thresh)
6493 			state->id_sched_lso_cnt++;
6494 	}
6495 	if (met_thresh > thresh) {
6496 		state->id_sched_needed &= ~flag;
6497 		state->id_sched_cnt++;
6498 		ret = 0;
6499 	}
6500 	mutex_exit(&state->id_sched_lock);
6501 
6502 	if (ret == 0)
6503 		mac_tx_update(state->id_mh);
6504 }
6505 
6506 /*
6507  * Release a chain of send wqes back onto the free list.
6508  */
6509 static void
6510 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6511 {
6512 	/*
6513 	 * Add back on Tx list for reuse.
6514 	 */
6515 	ASSERT(tail->swqe_next == NULL);
6516 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6517 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6518 	tail->swqe_next = state->id_tx_rel_list.dl_head;
6519 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6520 	state->id_tx_rel_list.dl_cnt += n;
6521 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
6522 }
6523 
6524 /*
6525  * Acquire a send wqe from the free list.
6526  * Returns a pointer to the acquired swqe, or NULL if none is available.
6527  */
6528 static ibd_swqe_t *
6529 ibd_acquire_swqe(ibd_state_t *state)
6530 {
6531 	ibd_swqe_t *wqe;
6532 
6533 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6534 	if (state->id_tx_rel_list.dl_head != NULL) {
6535 		/* transfer id_tx_rel_list to id_tx_list */
6536 		state->id_tx_list.dl_head =
6537 		    state->id_tx_rel_list.dl_head;
6538 		state->id_tx_list.dl_cnt =
6539 		    state->id_tx_rel_list.dl_cnt;
6540 		state->id_tx_list.dl_pending_sends = B_FALSE;
6541 
6542 		/* clear id_tx_rel_list */
6543 		state->id_tx_rel_list.dl_head = NULL;
6544 		state->id_tx_rel_list.dl_cnt = 0;
6545 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6546 
6547 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6548 		state->id_tx_list.dl_cnt -= 1;
6549 		state->id_tx_list.dl_head = wqe->swqe_next;
6550 	} else {	/* no free swqe */
6551 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6552 		state->id_tx_list.dl_pending_sends = B_TRUE;
6553 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6554 		state->id_tx_short++;
6555 		wqe = NULL;
6556 	}
6557 	return (wqe);
6558 }
6559 
6560 static int
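/*
 * Fill in the LSO portion of the send work request: record the mss
 * and UD destination, compute the IPoIB + IP + TCP header length,
 * and point lso_hdr at the headers, copying them into a separate
 * allocation only when they span more than the first mblk.
 */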
6561 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6562     ibt_ud_dest_hdl_t ud_dest)
6563 {
6564 	mblk_t	*nmp;
6565 	int iph_len, tcph_len;
6566 	ibt_wr_lso_t *lso;
6567 	uintptr_t ip_start, tcp_start;
6568 	uint8_t *dst;
6569 	uint_t pending, mblen;
6570 
6571 	/*
6572 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6573 	 * we need to adjust it here for lso.
6574 	 */
6575 	lso = &(node->w_swr.wr.ud_lso);
6576 	lso->lso_ud_dest = ud_dest;
6577 	lso->lso_mss = mss;
6578 
6579 	/*
6580 	 * Calculate the LSO header size and set it in the UD LSO structure.
6581 	 * Note that the only assumption we make is that each of the IPoIB,
6582 	 * IP and TCP headers will be contained in a single mblk fragment;
6583 	 * together, the headers may span multiple mblk fragments.
6584 	 */
6585 	nmp = mp;
6586 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6587 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6588 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
6589 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
6590 		nmp = nmp->b_cont;
6591 
6592 	}
6593 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6594 
6595 	tcp_start = ip_start + iph_len;
6596 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6597 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6598 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
6599 		nmp = nmp->b_cont;
6600 	}
6601 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6602 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6603 
6604 	/*
6605 	 * If the lso header fits entirely within a single mblk fragment,
6606 	 * we'll avoid an additional copy of the lso header here and just
6607 	 * pass the b_rptr of the mblk directly.
6608 	 *
6609 	 * If this isn't true, we'd have to allocate for it explicitly.
6610 	 */
6611 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
6612 		lso->lso_hdr = mp->b_rptr;
6613 	} else {
6614 		/* On work completion, remember to free this allocated hdr */
6615 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6616 		if (lso->lso_hdr == NULL) {
6617 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6618 			    "sz = %d", lso->lso_hdr_sz);
6619 			lso->lso_hdr_sz = 0;
6620 			lso->lso_mss = 0;
6621 			return (-1);
6622 		}
6623 	}
6624 
6625 	/*
6626 	 * Copy in the lso header only if we need to
6627 	 */
6628 	if (lso->lso_hdr != mp->b_rptr) {
6629 		dst = lso->lso_hdr;
6630 		pending = lso->lso_hdr_sz;
6631 
6632 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6633 			mblen = MBLKL(nmp);
6634 			if (pending > mblen) {
6635 				bcopy(nmp->b_rptr, dst, mblen);
6636 				dst += mblen;
6637 				pending -= mblen;
6638 			} else {
6639 				bcopy(nmp->b_rptr, dst, pending);
6640 				break;
6641 			}
6642 		}
6643 	}
6644 
6645 	return (0);
6646 }
6647 
6648 static void
6649 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6650 {
6651 	ibt_wr_lso_t *lso;
6652 
6653 	if ((!node) || (!mp))
6654 		return;
6655 
6656 	/*
6657 	 * Free any header space that we might've allocated if we
6658 	 * did an LSO
6659 	 */
6660 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6661 		lso = &(node->w_swr.wr.ud_lso);
6662 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6663 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6664 			lso->lso_hdr = NULL;
6665 			lso->lso_hdr_sz = 0;
6666 		}
6667 	}
6668 }
6669 
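/*
 * Post the given swqe and then keep draining the id_tx_head chain,
 * posting up to IBD_MAX_TX_POST_MULTIPLE work requests per
 * ibt_post_send() call; any wrs that fail to post are cleaned up
 * via ibd_tx_cleanup().
 */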
6670 static void
6671 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6672 {
6673 	uint_t		i;
6674 	uint_t		num_posted;
6675 	uint_t		n_wrs;
6676 	ibt_status_t	ibt_status;
6677 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
6678 	ibd_swqe_t	*tx_head, *elem;
6679 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
6680 
6681 	/* post the one request, then check for more */
6682 	ibt_status = ibt_post_send(state->id_chnl_hdl,
6683 	    &node->w_swr, 1, NULL);
6684 	if (ibt_status != IBT_SUCCESS) {
6685 		ibd_print_warn(state, "ibd_post_send: "
6686 		    "posting one wr failed: ret=%d", ibt_status);
6687 		ibd_tx_cleanup(state, node);
6688 	}
6689 
6690 	tx_head = NULL;
6691 	for (;;) {
6692 		if (tx_head == NULL) {
6693 			mutex_enter(&state->id_txpost_lock);
6694 			tx_head = state->id_tx_head;
6695 			if (tx_head == NULL) {
6696 				state->id_tx_busy = 0;
6697 				mutex_exit(&state->id_txpost_lock);
6698 				return;
6699 			}
6700 			state->id_tx_head = NULL;
6701 			mutex_exit(&state->id_txpost_lock);
6702 		}
6703 
6704 		/*
6705 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6706 		 * at a time if possible, and keep posting them.
6707 		 */
6708 		for (n_wrs = 0, elem = tx_head;
6709 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6710 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6711 			nodes[n_wrs] = elem;
6712 			wrs[n_wrs] = elem->w_swr;
6713 		}
6714 		tx_head = elem;
6715 
6716 		ASSERT(n_wrs != 0);
6717 
6718 		/*
6719 		 * If posting fails for some reason, we'll never receive
6720 		 * completion notification, so we'll need to clean up. But
6721 		 * we need to make sure we don't clean up nodes whose
6722 		 * wrs have been successfully posted. We assume that the
6723 		 * hca driver returns on the first failure to post and
6724 		 * therefore the first 'num_posted' entries don't need
6725 		 * cleanup here.
6726 		 */
6727 		num_posted = 0;
6728 		ibt_status = ibt_post_send(state->id_chnl_hdl,
6729 		    wrs, n_wrs, &num_posted);
6730 		if (ibt_status != IBT_SUCCESS) {
6731 			ibd_print_warn(state, "ibd_post_send: "
6732 			    "posting multiple wrs failed: "
6733 			    "requested=%d, done=%d, ret=%d",
6734 			    n_wrs, num_posted, ibt_status);
6735 
6736 			for (i = num_posted; i < n_wrs; i++)
6737 				ibd_tx_cleanup(state, nodes[i]);
6738 		}
6739 	}
6740 }
6741 
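/*
 * Build the sgl for an outgoing packet: messages above the copy
 * threshold with few enough fragments are DMA-bound directly with
 * ibt_map_mem_iov(); otherwise the data is copied either into the
 * swqe's pre-mapped copy buffer or, for larger packets, into
 * buffers from the LSO pool.
 */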
6742 static int
6743 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6744     uint_t lsohdr_sz)
6745 {
6746 	ibt_wr_ds_t *sgl;
6747 	ibt_status_t ibt_status;
6748 	mblk_t *nmp;
6749 	mblk_t *data_mp;
6750 	uchar_t *bufp;
6751 	size_t blksize;
6752 	size_t skip;
6753 	size_t avail;
6754 	uint_t pktsize;
6755 	uint_t frag_len;
6756 	uint_t pending_hdr;
6757 	int nmblks;
6758 	int i;
6759 
6760 	/*
6761 	 * Let's skip ahead to the data if this is LSO
6762 	 */
6763 	data_mp = mp;
6764 	pending_hdr = 0;
6765 	if (lsohdr_sz) {
6766 		pending_hdr = lsohdr_sz;
6767 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
6768 			frag_len = nmp->b_wptr - nmp->b_rptr;
6769 			if (frag_len > pending_hdr)
6770 				break;
6771 			pending_hdr -= frag_len;
6772 		}
6773 		data_mp = nmp;	/* start of data past lso header */
6774 		ASSERT(data_mp != NULL);
6775 	}
6776 
6777 	/*
6778 	 * Calculate the size of message data and number of msg blocks
6779 	 */
6780 	pktsize = 0;
6781 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
6782 	    nmp = nmp->b_cont, nmblks++) {
6783 		pktsize += MBLKL(nmp);
6784 	}
6785 	pktsize -= pending_hdr;
6786 
6787 	/*
6788 	 * We only do ibt_map_mem_iov() if the pktsize is above the
6789 	 * "copy-threshold", and if the number of mp fragments is less than
6790 	 * the maximum acceptable.
6791 	 */
6792 	if ((state->id_hca_res_lkey_capab) &&
6793 	    (pktsize > state->id_ud_tx_copy_thresh) &&
6794 	    (nmblks < state->id_max_sqseg_hiwm)) {
6795 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6796 		ibt_iov_attr_t iov_attr;
6797 
6798 		iov_attr.iov_as = NULL;
6799 		iov_attr.iov = iov_arr;
6800 		iov_attr.iov_buf = NULL;
6801 		iov_attr.iov_list_len = nmblks;
6802 		iov_attr.iov_wr_nds = state->id_max_sqseg;
6803 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6804 		iov_attr.iov_flags = IBT_IOV_SLEEP;
6805 
6806 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
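		/*
		 * Build one iov entry per mblk fragment; the first fragment
		 * may still carry part of the LSO header, which is skipped.
		 */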
6807 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6808 			iov_arr[i].iov_len = MBLKL(nmp);
6809 			if (i == 0) {
6810 				iov_arr[i].iov_addr += pending_hdr;
6811 				iov_arr[i].iov_len -= pending_hdr;
6812 			}
6813 		}
6814 
6815 		node->w_buftype = IBD_WQE_MAPPED;
6816 		node->w_swr.wr_sgl = node->w_sgl;
6817 
6818 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6819 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6820 		if (ibt_status != IBT_SUCCESS) {
6821 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6822 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6823 			goto ibd_copy_path;
6824 		}
6825 
6826 		return (0);
6827 	}
6828 
6829 ibd_copy_path:
6830 	if (pktsize <= state->id_tx_buf_sz) {
6831 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6832 		node->w_swr.wr_nds = 1;
6833 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6834 		node->w_buftype = IBD_WQE_TXBUF;
6835 
6836 		/*
6837 		 * Even though this is the copy path for transfers less than
6838 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6839 		 * is possible the first data mblk fragment (data_mp) still
6840 		 * contains part of the LSO header that we need to skip.
6841 		 */
6842 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6843 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6844 			blksize = MBLKL(nmp) - pending_hdr;
6845 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6846 			bufp += blksize;
6847 			pending_hdr = 0;
6848 		}
6849 
6850 		return (0);
6851 	}
6852 
6853 	/*
6854 	 * Copy path for transfers greater than id_tx_buf_sz
6855 	 */
6856 	node->w_swr.wr_sgl = node->w_sgl;
6857 	if (ibd_acquire_lsobufs(state, pktsize,
6858 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6859 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6860 		return (-1);
6861 	}
6862 	node->w_buftype = IBD_WQE_LSOBUF;
6863 
6864 	/*
6865 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
6866 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
6867 	 * need to skip part of the LSO header in the first fragment
6868 	 * as before.
6869 	 */
6870 	nmp = data_mp;
6871 	skip = pending_hdr;
6872 	for (i = 0; i < node->w_swr.wr_nds; i++) {
6873 		sgl = node->w_swr.wr_sgl + i;
6874 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6875 		avail = IBD_LSO_BUFSZ;
6876 		while (nmp && avail) {
6877 			blksize = MBLKL(nmp) - skip;
6878 			if (blksize > avail) {
6879 				bcopy(nmp->b_rptr + skip, bufp, avail);
6880 				skip += avail;
6881 				avail = 0;
6882 			} else {
6883 				bcopy(nmp->b_rptr + skip, bufp, blksize);
6884 				skip = 0;
6885 				avail -= blksize;
6886 				bufp += blksize;
6887 				nmp = nmp->b_cont;
6888 			}
6889 		}
6890 	}
6891 
6892 	return (0);
6893 }
6894 
6895 /*
6896  * Schedule completion queue polling to reap the resource we're
6897  * short on.  If we implement the change to reap tx completions
6898  * in a separate thread, we'll need to wake up that thread here.
6899  */
6900 static int
6901 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6902 {
6903 	ibd_req_t *req;
6904 
6905 	mutex_enter(&state->id_sched_lock);
6906 	state->id_sched_needed |= resource_type;
6907 	mutex_exit(&state->id_sched_lock);
6908 
6909 	/*
6910 	 * If we are asked to queue a work entry, we need to do it
6911 	 */
6912 	if (q_flag) {
6913 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6914 		if (req == NULL)
6915 			return (-1);
6916 
6917 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6918 	}
6919 
6920 	return (0);
6921 }
6922 
6923 /*
6924  * The passed in packet has this format:
6925  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6926  */
6927 static boolean_t
6928 ibd_send(ibd_state_t *state, mblk_t *mp)
6929 {
6930 	ibd_ace_t *ace;
6931 	ibd_swqe_t *node;
6932 	ipoib_mac_t *dest;
6933 	ib_header_info_t *ipibp;
6934 	ip6_t *ip6h;
6935 	uint_t pktsize;
6936 	uint32_t mss;
6937 	uint32_t hckflags;
6938 	uint32_t lsoflags = 0;
6939 	uint_t lsohdr_sz = 0;
6940 	int ret, len;
6941 	boolean_t dofree = B_FALSE;
6942 	boolean_t rc;
6943 	/* if (rc_chan == NULL) send by UD; else send by RC; */
6944 	ibd_rc_chan_t *rc_chan;
6945 	int nmblks;
6946 	mblk_t *nmp;
6947 
6948 	/*
6949 	 * If we aren't done with the device initialization and start,
6950 	 * we shouldn't be here.
6951 	 */
6952 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6953 		return (B_FALSE);
6954 
6955 	/*
6956 	 * Obtain an address handle for the destination.
6957 	 */
6958 	ipibp = (ib_header_info_t *)mp->b_rptr;
6959 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
6960 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6961 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6962 
6963 	rc_chan = NULL;
6964 	ace = ibd_acache_lookup(state, dest, &ret, 1);
6965 	if (state->id_enable_rc && (ace != NULL) &&
6966 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6967 		if (ace->ac_chan == NULL) {
6968 			state->rc_null_conn++;
6969 		} else {
6970 			if (ace->ac_chan->chan_state ==
6971 			    IBD_RC_STATE_ACT_ESTAB) {
6972 				rc_chan = ace->ac_chan;
6973 				rc_chan->is_used = B_TRUE;
6974 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6975 				node = WQE_TO_SWQE(
6976 				    rc_chan->tx_wqe_list.dl_head);
6977 				if (node != NULL) {
6978 					rc_chan->tx_wqe_list.dl_cnt -= 1;
6979 					rc_chan->tx_wqe_list.dl_head =
6980 					    node->swqe_next;
6981 				} else {
6982 					node = ibd_rc_acquire_swqes(rc_chan);
6983 				}
6984 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6985 
6986 				if (node == NULL) {
6987 					state->rc_swqe_short++;
6988 					mutex_enter(&state->id_sched_lock);
6989 					state->id_sched_needed |=
6990 					    IBD_RSRC_RC_SWQE;
6991 					mutex_exit(&state->id_sched_lock);
6992 					ibd_dec_ref_ace(state, ace);
6993 					return (B_FALSE);
6994 				}
6995 			} else {
6996 				state->rc_no_estab_conn++;
6997 			}
6998 		}
6999 	}
7000 
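	/*
	 * UD path (no established RC channel): take a send wqe off the
	 * free list, replenishing via ibd_acquire_swqe() if the list is
	 * empty.
	 */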
7001 	if (rc_chan == NULL) {
7002 		mutex_enter(&state->id_tx_list.dl_mutex);
7003 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
7004 		if (node != NULL) {
7005 			state->id_tx_list.dl_cnt -= 1;
7006 			state->id_tx_list.dl_head = node->swqe_next;
7007 		} else {
7008 			node = ibd_acquire_swqe(state);
7009 		}
7010 		mutex_exit(&state->id_tx_list.dl_mutex);
7011 		if (node == NULL) {
7012 			/*
7013 			 * If we don't have an swqe available, schedule a
7014 			 * transmit completion queue cleanup and hold off on
7015 			 * sending more packets until we have some free swqes
7016 			 */
7017 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
7018 				if (ace != NULL) {
7019 					ibd_dec_ref_ace(state, ace);
7020 				}
7021 				return (B_FALSE);
7022 			}
7023 
7024 			/*
7025 			 * If a poll cannot be scheduled, we have no choice but
7026 			 * to drop this packet
7027 			 */
7028 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7029 			if (ace != NULL) {
7030 				ibd_dec_ref_ace(state, ace);
7031 			}
7032 			return (B_TRUE);
7033 		}
7034 	}
7035 
7036 	/*
7037 	 * Initialize the commonly used fields in swqe to NULL to protect
7038 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
7039 	 * failure.
7040 	 */
7041 	node->swqe_im_mblk = NULL;
7042 	node->w_swr.wr_nds = 0;
7043 	node->w_swr.wr_sgl = NULL;
7044 	node->w_swr.wr_opcode = IBT_WRC_SEND;
7045 
7046 	/*
7047 	 * Calculate the size of message data and number of msg blocks
7048 	 */
7049 	pktsize = 0;
7050 	for (nmblks = 0, nmp = mp; nmp != NULL;
7051 	    nmp = nmp->b_cont, nmblks++) {
7052 		pktsize += MBLKL(nmp);
7053 	}
7054 
7055 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7056 		atomic_inc_64(&state->id_brd_xmt);
7057 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7058 		atomic_inc_64(&state->id_multi_xmt);
7059 
7060 	if (ace != NULL) {
7061 		node->w_ahandle = ace;
7062 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7063 	} else {
7064 		DPRINT(5,
7065 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7066 		    ((ret == EFAULT) ? "failed" : "queued"),
7067 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7068 		    htonl(dest->ipoib_gidpref[1]),
7069 		    htonl(dest->ipoib_gidsuff[0]),
7070 		    htonl(dest->ipoib_gidsuff[1]));
7071 		state->rc_ace_not_found++;
7072 		node->w_ahandle = NULL;
7073 
7074 		/*
7075 		 * If ibd_acache_lookup() returns EFAULT, it means ibd
7076 		 * cannot find a path for the specified dest address, so we
7077 		 * should drop this kind of packet.  We should also drop
7078 		 * the packet if we cannot schedule a poll via the
7079 		 * async thread.  In the normal case, ibd returns the
7080 		 * packet to the upper layer and waits for the AH to be created.
7081 		 *
7082 		 * Note that we always queue a work slot entry for the async
7083 		 * thread when we fail AH lookup (even in intr mode); this is
7084 		 * due to the convoluted way the code currently looks for AH.
7085 		 */
7086 		if (ret == EFAULT) {
7087 			dofree = B_TRUE;
7088 			rc = B_TRUE;
7089 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7090 			dofree = B_TRUE;
7091 			rc = B_TRUE;
7092 		} else {
7093 			dofree = B_FALSE;
7094 			rc = B_FALSE;
7095 		}
7096 		goto ibd_send_fail;
7097 	}
7098 
7099 	/*
7100 	 * For ND6 packets, the padding goes at the front of the source
7101 	 * lladdr, so insert it there before transmitting.
7102 	 */
7103 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7104 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7105 			if (!pullupmsg(mp, IPV6_HDR_LEN +
7106 			    sizeof (ib_header_info_t))) {
7107 				DPRINT(10, "ibd_send: pullupmsg failure ");
7108 				dofree = B_TRUE;
7109 				rc = B_TRUE;
7110 				goto ibd_send_fail;
7111 			}
7112 			ipibp = (ib_header_info_t *)mp->b_rptr;
7113 		}
7114 		ip6h = (ip6_t *)((uchar_t *)ipibp +
7115 		    sizeof (ib_header_info_t));
7116 		len = ntohs(ip6h->ip6_plen);
7117 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7118 			mblk_t	*pad;
7119 
7120 			pad = allocb(4, 0);
7121 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7122 			linkb(mp, pad);
7123 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
7124 			    IPV6_HDR_LEN + len + 4) {
7125 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7126 				    IPV6_HDR_LEN + len + 4)) {
7127 					DPRINT(10, "ibd_send: pullupmsg "
7128 					    "failure ");
7129 					dofree = B_TRUE;
7130 					rc = B_TRUE;
7131 					goto ibd_send_fail;
7132 				}
7133 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7134 				    sizeof (ib_header_info_t));
7135 			}
7136 
7137 			/* LINTED: E_CONSTANT_CONDITION */
7138 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7139 		}
7140 	}
7141 
7142 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7143 	mp->b_rptr += sizeof (ib_addrs_t);
7144 	pktsize -= sizeof (ib_addrs_t);
7145 
7146 	if (rc_chan) {	/* send in RC mode */
7147 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7148 		ibt_iov_attr_t iov_attr;
7149 		uint_t		i;
7150 		size_t	blksize;
7151 		uchar_t *bufp;
7152 		ibd_rc_tx_largebuf_t *lbufp;
7153 
7154 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
7155 
7156 		/*
7157 		 * The upper layer does the Tx checksum, so we don't need to
7158 		 * do any checksum work here.
7159 		 */
7160 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7161 
7162 		/*
7163 		 * We only do ibt_map_mem_iov() if the pktsize is above
7164 		 * the "copy-threshold", and if the number of mp
7165 		 * fragments is less than the maximum acceptable.
7166 		 */
7167 		if (pktsize <= state->id_rc_tx_copy_thresh) {
7168 			atomic_inc_64(&state->rc_xmt_small_pkt);
7169 			/*
7170 			 * Only process unicast packets in Reliable Connected
7171 			 * mode.
7172 			 */
7173 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7174 			node->w_swr.wr_nds = 1;
7175 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7176 			node->w_buftype = IBD_WQE_TXBUF;
7177 
7178 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7179 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7180 				blksize = MBLKL(nmp);
7181 				bcopy(nmp->b_rptr, bufp, blksize);
7182 				bufp += blksize;
7183 			}
7184 			freemsg(mp);
7185 			ASSERT(node->swqe_im_mblk == NULL);
7186 		} else {
7187 			if ((state->rc_enable_iov_map) &&
7188 			    (nmblks < state->rc_max_sqseg_hiwm)) {
7189 
7190 				/* do ibt_map_mem_iov() */
7191 				iov_attr.iov_as = NULL;
7192 				iov_attr.iov = iov_arr;
7193 				iov_attr.iov_buf = NULL;
7194 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7195 				iov_attr.iov_lso_hdr_sz = 0;
7196 				iov_attr.iov_flags = IBT_IOV_SLEEP;
7197 
7198 				i = 0;
7199 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7200 					iov_arr[i].iov_len = MBLKL(nmp);
7201 					if (iov_arr[i].iov_len != 0) {
7202 						iov_arr[i].iov_addr = (caddr_t)
7203 						    (void *)nmp->b_rptr;
7204 						i++;
7205 					}
7206 				}
7207 				iov_attr.iov_list_len = i;
7208 				node->w_swr.wr_sgl = node->w_sgl;
7209 
7210 				ret = ibt_map_mem_iov(state->id_hca_hdl,
7211 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7212 				    &node->w_mi_hdl);
7213 				if (ret != IBT_SUCCESS) {
7214 					atomic_inc_64(
7215 					    &state->rc_xmt_map_fail_pkt);
7216 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
7217 					    ") failed, nmblks=%d, real_nmblks"
7218 					    "=%d, ret=0x%x", nmblks, i, ret);
7219 					goto ibd_rc_large_copy;
7220 				}
7221 
7222 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7223 				node->w_buftype = IBD_WQE_MAPPED;
7224 				node->swqe_im_mblk = mp;
7225 			} else {
7226 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7227 ibd_rc_large_copy:
7228 				mutex_enter(&state->rc_tx_large_bufs_lock);
7229 				if (state->rc_tx_largebuf_nfree == 0) {
7230 					state->rc_xmt_buf_short++;
7231 					mutex_exit
7232 					    (&state->rc_tx_large_bufs_lock);
7233 					mutex_enter(&state->id_sched_lock);
7234 					state->id_sched_needed |=
7235 					    IBD_RSRC_RC_TX_LARGEBUF;
7236 					mutex_exit(&state->id_sched_lock);
7237 					dofree = B_FALSE;
7238 					rc = B_FALSE;
7239 					/*
7240 					 * If we don't have Tx large bufs,
7241 					 * return failure. node->w_buftype
7242 					 * should not be IBD_WQE_RC_COPYBUF,
7243 					 * otherwise it will cause problems
7244 					 * in ibd_rc_tx_cleanup().
7245 					 */
7246 					node->w_buftype = IBD_WQE_TXBUF;
7247 					goto ibd_send_fail;
7248 				}
7249 
7250 				lbufp = state->rc_tx_largebuf_free_head;
7251 				ASSERT(lbufp->lb_buf != NULL);
7252 				state->rc_tx_largebuf_free_head =
7253 				    lbufp->lb_next;
7254 				lbufp->lb_next = NULL;
7255 				/* Update nfree count */
7256 				state->rc_tx_largebuf_nfree --;
7257 				mutex_exit(&state->rc_tx_large_bufs_lock);
7258 				bufp = lbufp->lb_buf;
7259 				node->w_sgl[0].ds_va =
7260 				    (ib_vaddr_t)(uintptr_t)bufp;
7261 				node->w_sgl[0].ds_key =
7262 				    state->rc_tx_mr_desc.md_lkey;
7263 				node->w_sgl[0].ds_len = pktsize;
7264 				node->w_swr.wr_sgl = node->w_sgl;
7265 				node->w_swr.wr_nds = 1;
7266 				node->w_buftype = IBD_WQE_RC_COPYBUF;
7267 				node->w_rc_tx_largebuf = lbufp;
7268 
7269 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7270 					blksize = MBLKL(nmp);
7271 					if (blksize != 0) {
7272 						bcopy(nmp->b_rptr, bufp,
7273 						    blksize);
7274 						bufp += blksize;
7275 					}
7276 				}
7277 				freemsg(mp);
7278 				ASSERT(node->swqe_im_mblk == NULL);
7279 			}
7280 		}
7281 
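		/*
		 * Hand the wqe to whichever thread is posting on this
		 * channel: if another thread holds tx_busy, chain the wqe
		 * onto the channel's tx list for that thread to pick up;
		 * otherwise become the poster and call ibd_rc_post_send().
		 */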
7282 		node->swqe_next = NULL;
7283 		mutex_enter(&rc_chan->tx_post_lock);
7284 		if (rc_chan->tx_busy) {
7285 			if (rc_chan->tx_head) {
7286 				rc_chan->tx_tail->swqe_next =
7287 				    SWQE_TO_WQE(node);
7288 			} else {
7289 				rc_chan->tx_head = node;
7290 			}
7291 			rc_chan->tx_tail = node;
7292 			mutex_exit(&rc_chan->tx_post_lock);
7293 		} else {
7294 			rc_chan->tx_busy = 1;
7295 			mutex_exit(&rc_chan->tx_post_lock);
7296 			ibd_rc_post_send(rc_chan, node);
7297 		}
7298 
7299 		return (B_TRUE);
7300 	} /* send by RC */
7301 
7302 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7303 		/*
7304 		 * The packet is too long. The packet size from GLD should be
7305 		 * <= state->id_mtu + sizeof (ib_addrs_t).
7306 		 */
7307 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7308 			ibd_req_t *req;
7309 
7310 			mutex_enter(&ace->tx_too_big_mutex);
7311 			if (ace->tx_too_big_ongoing) {
7312 				mutex_exit(&ace->tx_too_big_mutex);
7313 				state->rc_xmt_reenter_too_long_pkt++;
7314 				dofree = B_TRUE;
7315 			} else {
7316 				ace->tx_too_big_ongoing = B_TRUE;
7317 				mutex_exit(&ace->tx_too_big_mutex);
7318 				state->rc_xmt_icmp_too_long_pkt++;
7319 
7320 				req = kmem_cache_alloc(state->id_req_kmc,
7321 				    KM_NOSLEEP);
7322 				if (req == NULL) {
7323 					ibd_print_warn(state, "ibd_send: alloc "
7324 					    "ibd_req_t fail");
7325 					/* Drop it. */
7326 					dofree = B_TRUE;
7327 				} else {
7328 					req->rq_ptr = mp;
7329 					req->rq_ptr2 = ace;
7330 					ibd_queue_work_slot(state, req,
7331 					    IBD_ASYNC_RC_TOO_BIG);
7332 					dofree = B_FALSE;
7333 				}
7334 			}
7335 		} else {
7336 			ibd_print_warn(state, "Reliable Connected mode is on. "
7337 			    "Multicast packet length %d > %d is too long to "
7338 			    "send, drop it",
7339 			    pktsize, state->id_mtu);
7340 			state->rc_xmt_drop_too_long_pkt++;
7341 			/* Drop it. */
7342 			dofree = B_TRUE;
7343 		}
7344 		rc = B_TRUE;
7345 		goto ibd_send_fail;
7346 	}
7347 
7348 	atomic_add_64(&state->id_xmt_bytes, pktsize);
7349 	atomic_inc_64(&state->id_xmt_pkt);
7350 
7351 	/*
7352 	 * Do LSO and checksum related work here.  For an LSO send, set the
7353 	 * ud destination, the opcode and the LSO header information in the
7354 	 * work request.
7355 	 */
7356 	mac_lso_get(mp, &mss, &lsoflags);
7357 	if ((lsoflags & HW_LSO) != HW_LSO) {
7358 		node->w_swr.wr_opcode = IBT_WRC_SEND;
7359 		lsohdr_sz = 0;
7360 	} else {
7361 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7362 			/*
7363 			 * The routine can only fail if there's no memory; we
7364 			 * can only drop the packet if this happens
7365 			 */
7366 			ibd_print_warn(state,
7367 			    "ibd_send: no memory, lso posting failed");
7368 			dofree = B_TRUE;
7369 			rc = B_TRUE;
7370 			goto ibd_send_fail;
7371 		}
7372 
7373 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7374 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7375 	}
7376 
7377 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7378 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7379 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7380 	else
7381 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7382 
7383 	/*
7384 	 * Prepare the sgl for posting; the routine can only fail if there's
7385 	 * no lso buf available for posting. If this is the case, we schedule
7386 	 * a poll for lso bufs to become available and then try again later.
7387 	 */
7388 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7389 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7390 			dofree = B_TRUE;
7391 			rc = B_TRUE;
7392 		} else {
7393 			dofree = B_FALSE;
7394 			rc = B_FALSE;
7395 		}
7396 		goto ibd_send_fail;
7397 	}
7398 	node->swqe_im_mblk = mp;
7399 
7400 	/*
7401 	 * Queue the wqe to hardware; since we can now simply queue a
7402 	 * post instead of doing it serially, we cannot assume anything
7403 	 * about the 'node' after ibd_post_send() returns.
7404 	 */
7405 	node->swqe_next = NULL;
7406 
7407 	mutex_enter(&state->id_txpost_lock);
7408 	if (state->id_tx_busy) {
7409 		if (state->id_tx_head) {
7410 			state->id_tx_tail->swqe_next =
7411 			    SWQE_TO_WQE(node);
7412 		} else {
7413 			state->id_tx_head = node;
7414 		}
7415 		state->id_tx_tail = node;
7416 		mutex_exit(&state->id_txpost_lock);
7417 	} else {
7418 		state->id_tx_busy = 1;
7419 		mutex_exit(&state->id_txpost_lock);
7420 		ibd_post_send(state, node);
7421 	}
7422 
7423 	return (B_TRUE);
7424 
7425 ibd_send_fail:
7426 	if (node && mp)
7427 		ibd_free_lsohdr(node, mp);
7428 
7429 	if (dofree)
7430 		freemsg(mp);
7431 
7432 	if (node != NULL) {
7433 		if (rc_chan) {
7434 			ibd_rc_tx_cleanup(node);
7435 		} else {
7436 			ibd_tx_cleanup(state, node);
7437 		}
7438 	}
7439 
7440 	return (rc);
7441 }
7442 
7443 /*
7444  * GLDv3 entry point for transmitting datagram.
7445  */
7446 static mblk_t *
7447 ibd_m_tx(void *arg, mblk_t *mp)
7448 {
7449 	ibd_state_t *state = (ibd_state_t *)arg;
7450 	mblk_t *next;
7451 
7452 	if (state->id_type == IBD_PORT_DRIVER) {
7453 		freemsgchain(mp);
7454 		return (NULL);
7455 	}
7456 
7457 	if ((state->id_link_state != LINK_STATE_UP) ||
7458 	    !(state->id_mac_state & IBD_DRV_STARTED)) {
7459 		freemsgchain(mp);
7460 		mp = NULL;
7461 	}
7462 
7463 	while (mp != NULL) {
7464 		next = mp->b_next;
7465 		mp->b_next = NULL;
7466 		if (ibd_send(state, mp) == B_FALSE) {
7467 			/* Send fail */
7468 			mp->b_next = next;
7469 			break;
7470 		}
7471 		mp = next;
7472 	}
7473 
7474 	return (mp);
7475 }
7476 
7477 /*
7478  * This handles Tx and Rx completions. With separate CQs, this handles
7479  * only Rx completions.
7480  */
7481 static uint_t
7482 ibd_intr(caddr_t arg)
7483 {
7484 	ibd_state_t *state = (ibd_state_t *)arg;
7485 
7486 	ibd_poll_rcq(state, state->id_rcq_hdl);
7487 
7488 	return (DDI_INTR_CLAIMED);
7489 }
7490 
7491 /*
7492  * Poll and fully drain the send cq
7493  */
7494 static void
7495 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7496 {
7497 	ibt_wc_t *wcs = state->id_txwcs;
7498 	uint_t numwcs = state->id_txwcs_size;
7499 	ibd_wqe_t *wqe;
7500 	ibd_swqe_t *head, *tail;
7501 	ibt_wc_t *wc;
7502 	uint_t num_polled;
7503 	int i;
7504 
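	/*
	 * Poll the send CQ in batches of up to id_txwcs_size completions,
	 * chain the corresponding swqes together, clean them up in one
	 * shot, and then resume any blocked transmissions.
	 */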
7505 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7506 		head = tail = NULL;
7507 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7508 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7509 			if (wc->wc_status != IBT_WC_SUCCESS) {
7510 				/*
7511 				 * Channel being torn down.
7512 				 */
7513 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7514 					DPRINT(5, "ibd_drain_scq: flush error");
7515 					DPRINT(10, "ibd_drain_scq: Bad "
7516 					    "status %d", wc->wc_status);
7517 				} else {
7518 					DPRINT(10, "ibd_drain_scq: "
7519 					    "unexpected wc_status %d",
7520 					    wc->wc_status);
7521 				}
7522 				/*
7523 				 * Fallthrough to invoke the Tx handler to
7524 				 * release held resources, e.g., AH refcount.
7525 				 */
7526 			}
7527 			/*
7528 			 * Add this swqe to the list to be cleaned up.
7529 			 */
7530 			if (head)
7531 				tail->swqe_next = wqe;
7532 			else
7533 				head = WQE_TO_SWQE(wqe);
7534 			tail = WQE_TO_SWQE(wqe);
7535 		}
7536 		tail->swqe_next = NULL;
7537 		ibd_tx_cleanup_list(state, head, tail);
7538 
7539 		/*
7540 		 * Resume any blocked transmissions if possible
7541 		 */
7542 		ibd_resume_transmission(state);
7543 	}
7544 }
7545 
7546 /*
7547  * Poll and fully drain the receive cq
7548  */
7549 static void
7550 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7551 {
7552 	ibt_wc_t *wcs = state->id_rxwcs;
7553 	uint_t numwcs = state->id_rxwcs_size;
7554 	ibd_rwqe_t *rwqe;
7555 	ibt_wc_t *wc;
7556 	uint_t num_polled;
7557 	int i;
7558 	mblk_t *head, *tail, *mp;
7559 
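	/*
	 * Poll the receive CQ in batches of up to id_rxwcs_size
	 * completions, chain up the received mblks and pass the chain to
	 * mac_rx(), then replenish rwqes if the receive list has dropped
	 * below a quarter of id_ud_num_rwqe.
	 */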
7560 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7561 		head = tail = NULL;
7562 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7563 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7564 			if (wc->wc_status != IBT_WC_SUCCESS) {
7565 				/*
7566 				 * Channel being torn down.
7567 				 */
7568 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7569 					DPRINT(5, "ibd_drain_rcq: "
7570 					    "expected flushed rwqe");
7571 				} else {
7572 					DPRINT(5, "ibd_drain_rcq: "
7573 					    "unexpected wc_status %d",
7574 					    wc->wc_status);
7575 				}
7576 				atomic_inc_32(
7577 				    &state->id_rx_list.dl_bufs_outstanding);
7578 				freemsg(rwqe->rwqe_im_mblk);
7579 				continue;
7580 			}
7581 			mp = ibd_process_rx(state, rwqe, wc);
7582 			if (mp == NULL)
7583 				continue;
7584 
7585 			/*
7586 			 * Add this mp to the list to send to the nw layer.
7587 			 */
7588 			if (head)
7589 				tail->b_next = mp;
7590 			else
7591 				head = mp;
7592 			tail = mp;
7593 		}
7594 		if (head)
7595 			mac_rx(state->id_mh, state->id_rh, head);
7596 
7597 		/*
7598 		 * Account for #rwqes polled.
7599 		 * Post more here, if less than one fourth full.
7600 		 */
7601 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7602 		    (state->id_ud_num_rwqe / 4))
7603 			ibd_post_recv_intr(state);
7604 	}
7605 }
7606 
7607 /*
7608  * Common code for interrupt handling as well as for polling
7609  * for all completed wqe's while detaching.
7610  */
7611 static void
7612 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7613 {
7614 	int flag, redo_flag;
7615 	int redo = 1;
7616 
7617 	flag = IBD_CQ_POLLING;
7618 	redo_flag = IBD_REDO_CQ_POLLING;
7619 
7620 	mutex_enter(&state->id_scq_poll_lock);
7621 	if (state->id_scq_poll_busy & flag) {
7622 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7623 		state->id_scq_poll_busy |= redo_flag;
7624 		mutex_exit(&state->id_scq_poll_lock);
7625 		return;
7626 	}
7627 	state->id_scq_poll_busy |= flag;
7628 	mutex_exit(&state->id_scq_poll_lock);
7629 
7630 	/*
7631 	 * In some cases (eg detaching), this code can be invoked on
7632 	 * any cpu after disabling cq notification (thus no concurrency
7633 	 * exists). Apart from that, the following applies normally:
7634 	 * Transmit completion handling could be from any cpu if
7635 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7636 	 * is interrupt driven.
7637 	 */
7638 
7639 	/*
7640 	 * Poll and drain the CQ
7641 	 */
7642 	ibd_drain_scq(state, cq_hdl);
7643 
7644 	/*
7645 	 * Enable CQ notifications and redrain the cq to catch any
7646 	 * completions we might have missed after the ibd_drain_scq()
7647 	 * above and before the ibt_enable_cq_notify() that follows.
7648 	 * Finally, service any new requests to poll the cq that
7649 	 * could've come in after the ibt_enable_cq_notify().
7650 	 */
7651 	do {
7652 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7653 		    IBT_SUCCESS) {
7654 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7655 		}
7656 
7657 		ibd_drain_scq(state, cq_hdl);
7658 
7659 		mutex_enter(&state->id_scq_poll_lock);
7660 		if (state->id_scq_poll_busy & redo_flag)
7661 			state->id_scq_poll_busy &= ~redo_flag;
7662 		else {
7663 			state->id_scq_poll_busy &= ~flag;
7664 			redo = 0;
7665 		}
7666 		mutex_exit(&state->id_scq_poll_lock);
7667 
7668 	} while (redo);
7669 }
7670 
7671 /*
7672  * Common code for interrupt handling as well as for polling
7673  * for all completed wqe's while detaching.
7674  */
7675 static void
7676 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7677 {
7678 	int flag, redo_flag;
7679 	int redo = 1;
7680 
7681 	flag = IBD_CQ_POLLING;
7682 	redo_flag = IBD_REDO_CQ_POLLING;
7683 
7684 	mutex_enter(&state->id_rcq_poll_lock);
7685 	if (state->id_rcq_poll_busy & flag) {
7686 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7687 		state->id_rcq_poll_busy |= redo_flag;
7688 		mutex_exit(&state->id_rcq_poll_lock);
7689 		return;
7690 	}
7691 	state->id_rcq_poll_busy |= flag;
7692 	mutex_exit(&state->id_rcq_poll_lock);
7693 
7694 	/*
7695 	 * Poll and drain the CQ
7696 	 */
7697 	ibd_drain_rcq(state, rcq);
7698 
7699 	/*
7700 	 * Enable CQ notifications and redrain the cq to catch any
7701 	 * completions we might have missed after the ibd_drain_rcq()
7702 	 * above and before the ibt_enable_cq_notify() that follows.
7703 	 * Finally, service any new requests to poll the cq that
7704 	 * could've come in after the ibt_enable_cq_notify().
7705 	 */
7706 	do {
7707 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7708 		    IBT_SUCCESS) {
7709 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7710 		}
7711 
7712 		ibd_drain_rcq(state, rcq);
7713 
7714 		mutex_enter(&state->id_rcq_poll_lock);
7715 		if (state->id_rcq_poll_busy & redo_flag)
7716 			state->id_rcq_poll_busy &= ~redo_flag;
7717 		else {
7718 			state->id_rcq_poll_busy &= ~flag;
7719 			redo = 0;
7720 		}
7721 		mutex_exit(&state->id_rcq_poll_lock);
7722 
7723 	} while (redo);
7724 }
7725 
7726 /*
7727  * Unmap the memory area associated with a given swqe.
7728  */
7729 void
7730 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7731 {
7732 	ibt_status_t stat;
7733 
7734 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7735 
7736 	if (swqe->w_mi_hdl) {
7737 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7738 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
7739 			DPRINT(10,
7740 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7741 		}
7742 		swqe->w_mi_hdl = NULL;
7743 	}
7744 	swqe->w_swr.wr_nds = 0;
7745 }
7746 
7747 void
7748 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7749 {
7750 	/*
7751 	 * The recycling logic can be eliminated from here
7752 	 * and put into the async thread if we create another
7753 	 * list to hold ACE's for unjoined mcg's.
7754 	 */
7755 	if (DEC_REF_DO_CYCLE(ace)) {
7756 		ibd_mce_t *mce;
7757 
7758 		/*
7759 		 * Check with the lock taken: we decremented
7760 		 * reference count without the lock, and some
7761 		 * transmitter might already have bumped the
7762 		 * reference count (possible in case of multicast
7763 		 * disable when we leave the AH on the active
7764 		 * list). If not still 0, get out, leaving the
7765 		 * recycle bit intact.
7766 		 *
7767 		 * Atomically transition the AH from active
7768 		 * to free list, and queue a work request to
7769 		 * leave the group and destroy the mce. No
7770 		 * transmitter can be looking at the AH or
7771 		 * the MCE in between, since we have the
7772 		 * ac_mutex lock. In the SendOnly reap case,
7773 		 * it is not necessary to hold the ac_mutex
7774 		 * and recheck the ref count (since the AH was
7775 		 * taken off the active list), we just do it
7776 		 * to have uniform processing with the Full
7777 		 * reap case.
7778 		 */
7779 		mutex_enter(&state->id_ac_mutex);
7780 		mce = ace->ac_mce;
7781 		if (GET_REF_CYCLE(ace) == 0) {
7782 			CLEAR_REFCYCLE(ace);
7783 			/*
7784 			 * Identify the case of fullmember reap as
7785 			 * opposed to mcg trap reap. Also, port up
7786 			 * might set ac_mce to NULL to indicate Tx
7787 			 * cleanup should do no more than put the
7788 			 * AH in the free list (see ibd_async_link).
7789 			 */
7790 			if (mce != NULL) {
7791 				ace->ac_mce = NULL;
7792 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7793 				/*
7794 				 * mc_req was initialized at mce
7795 				 * creation time.
7796 				 */
7797 				ibd_queue_work_slot(state,
7798 				    &mce->mc_req, IBD_ASYNC_REAP);
7799 			}
7800 			IBD_ACACHE_INSERT_FREE(state, ace);
7801 		}
7802 		mutex_exit(&state->id_ac_mutex);
7803 	}
7804 }
7805 
7806 /*
7807  * Common code that deals with clean ups after a successful or
7808  * erroneous transmission attempt.
7809  */
7810 static void
7811 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7812 {
7813 	ibd_ace_t *ace = swqe->w_ahandle;
7814 
7815 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7816 
7817 	/*
7818 	 * If this was a dynamic mapping in ibd_send(), we need to
7819 	 * unmap here. If this was an lso buffer we'd used for sending,
7820 	 * we need to release the lso buf to the pool, since the resource
7821 	 * is scarce. However, if this was simply a normal send using
7822 	 * the copybuf (present in each swqe), we don't need to release it.
7823 	 */
7824 	if (swqe->swqe_im_mblk != NULL) {
7825 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
7826 			ibd_unmap_mem(state, swqe);
7827 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7828 			ibd_release_lsobufs(state,
7829 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7830 		}
7831 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7832 		freemsg(swqe->swqe_im_mblk);
7833 		swqe->swqe_im_mblk = NULL;
7834 	}
7835 
7836 	/*
7837 	 * Drop the reference count on the AH; it can be reused
7838 	 * now for a different destination if there are no more
7839 	 * posted sends that will use it. This can be eliminated
7840 	 * if we can always associate each Tx buffer with an AH.
7841 	 * The ace can be null if we are cleaning up from the
7842 	 * ibd_send() error path.
7843 	 */
7844 	if (ace != NULL) {
7845 		ibd_dec_ref_ace(state, ace);
7846 	}
7847 
7848 	/*
7849 	 * Release the send wqe for reuse.
7850 	 */
7851 	swqe->swqe_next = NULL;
7852 	ibd_release_swqe(state, swqe, swqe, 1);
7853 }
7854 
7855 static void
7856 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7857 {
7858 	ibd_ace_t *ace;
7859 	ibd_swqe_t *swqe;
7860 	int n = 0;
7861 
7862 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7863 
7864 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7865 
7866 		/*
7867 		 * If this was a dynamic mapping in ibd_send(), we need to
7868 		 * unmap here. If this was an lso buffer we'd used for sending,
7869 		 * we need to release the lso buf to the pool, since the
7870 		 * resource is scarce. However, if this was simply a normal
7871 		 * send using the copybuf (present in each swqe), we don't need
7872 		 * to release it.
7873 		 */
7874 		if (swqe->swqe_im_mblk != NULL) {
7875 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
7876 				ibd_unmap_mem(state, swqe);
7877 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7878 				ibd_release_lsobufs(state,
7879 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7880 			}
7881 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7882 			freemsg(swqe->swqe_im_mblk);
7883 			swqe->swqe_im_mblk = NULL;
7884 		}
7885 
7886 		/*
7887 		 * Drop the reference count on the AH; it can be reused
7888 		 * now for a different destination if there are no more
7889 		 * posted sends that will use it. This can be eliminated
7890 		 * if we can always associate each Tx buffer with an AH.
7891 		 * The ace can be null if we are cleaning up from the
7892 		 * ibd_send() error path.
7893 		 */
7894 		ace = swqe->w_ahandle;
7895 		if (ace != NULL) {
7896 			ibd_dec_ref_ace(state, ace);
7897 		}
7898 		n++;
7899 	}
7900 
7901 	/*
7902 	 * Release the send wqes for reuse.
7903 	 */
7904 	ibd_release_swqe(state, head, tail, n);
7905 }
7906 
7907 /*
7908  * Processing to be done after receipt of a packet; hand off to GLD
7909  * in the format expected by GLD.  The received packet has this
7910  * format: 2b sap :: 00 :: data.
7911  */
7912 static mblk_t *
7913 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7914 {
7915 	ib_header_info_t *phdr;
7916 	mblk_t *mp;
7917 	ipoib_hdr_t *ipibp;
7918 	ipha_t *iphap;
7919 	ip6_t *ip6h;
7920 	int len;
7921 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7922 	uint32_t bufs;
7923 
7924 	/*
7925 	 * Track the number of rx buffers handed up that need to be returned.
7926 	 */
7927 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7928 
7929 	/* Never run out of rwqes, use allocb when running low */
7930 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
7931 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7932 		atomic_inc_32(&state->id_rx_allocb);
7933 		mp = allocb(pkt_len, BPRI_HI);
7934 		if (mp) {
7935 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7936 			ibd_post_recv(state, rwqe);
7937 		} else {	/* no memory */
7938 			atomic_inc_32(&state->id_rx_allocb_failed);
7939 			ibd_post_recv(state, rwqe);
7940 			return (NULL);
7941 		}
7942 	} else {
7943 		mp = rwqe->rwqe_im_mblk;
7944 	}
7945 
7946 
7947 	/*
7948 	 * Adjust write pointer depending on how much data came in.
7949 	 */
7950 	mp->b_wptr = mp->b_rptr + pkt_len;
7951 
7952 	/*
7953 	 * Make sure this is NULL or we're in trouble.
7954 	 */
7955 	if (mp->b_next != NULL) {
7956 		ibd_print_warn(state,
7957 		    "ibd_process_rx: got duplicate mp from rcq?");
7958 		mp->b_next = NULL;
7959 	}
7960 
7961 	/*
7962 	 * The IB link will deliver one of the IB link layer
7963 	 * headers, the Global Routing Header (GRH).
7964 	 * The ibd driver uses the information in the GRH to build the
7965 	 * Header_info structure and pass it with the datagram up
7966 	 * to GLDv3.
7967 	 * If the GRH is not valid, indicate this to GLDv3 by setting
7968 	 * the VerTcFlow field to 0.
7969 	 */
7970 	phdr = (ib_header_info_t *)mp->b_rptr;
7971 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7972 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7973 
7974 		/* if it is loop back packet, just drop it. */
7975 		if (state->id_enable_rc) {
7976 			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7977 			    &state->rc_macaddr_loopback,
7978 			    IPOIB_ADDRL) == 0) {
7979 				freemsg(mp);
7980 				return (NULL);
7981 			}
7982 		} else {
7983 			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7984 			    IPOIB_ADDRL) == 0) {
7985 				freemsg(mp);
7986 				return (NULL);
7987 			}
7988 		}
7989 
7990 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7991 		    sizeof (ipoib_mac_t));
7992 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
7993 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
7994 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
7995 		} else {
7996 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
7997 		}
7998 	} else {
7999 		/*
8000 		 * It cannot be an IBA multicast packet; it must have been
8001 		 * unicast to us. Just copy the interface address to dst.
8002 		 */
8003 		phdr->ib_grh.ipoib_vertcflow = 0;
8004 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
8005 		    sizeof (ipoib_mac_t));
8006 	}
8007 
8008 	/*
8009 	 * For ND6 packets, padding is at the front of the source/target
8010 	 * lladdr. However, the inet6 layer is not aware of it, so remove
8011 	 * the padding from such packets.
8012 	 */
8013 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
8014 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
8015 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8016 		len = ntohs(ip6h->ip6_plen);
8017 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8018 			/* LINTED: E_CONSTANT_CONDITION */
8019 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
8020 		}
8021 	}
8022 
8023 	/*
8024 	 * Update statistics
8025 	 */
8026 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
8027 	atomic_inc_64(&state->id_rcv_pkt);
8028 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
8029 		atomic_inc_64(&state->id_brd_rcv);
8030 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
8031 		atomic_inc_64(&state->id_multi_rcv);
8032 
8033 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8034 	/*
8035 	 * Set the receive checksum status in mp.
8036 	 * Hardware checksumming can be considered valid only if:
8037 	 * 1. The CQE.IP_OK bit is set
8038 	 * 2. CQE.CKSUM == 0xffff
8039 	 * 3. No IPv6 routing header is present in the packet
8040 	 * 4. There are no IP options in the IP header
8041 	 */
8042 
8043 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
8044 	    (wc->wc_cksum == 0xFFFF) &&
8045 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
8046 		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
8047 	}
8048 
8049 	return (mp);
8050 }
8051 
8052 /*
8053  * Callback code invoked from STREAMS when the receive data buffer is
8054  * free for recycling.
8055  */
8056 static void
8057 ibd_freemsg_cb(char *arg)
8058 {
8059 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8060 	ibd_state_t *state = rwqe->w_state;
8061 
8062 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8063 
8064 	/*
8065 	 * If the driver is stopped, just free the rwqe.
8066 	 */
8067 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8068 		DPRINT(6, "ibd_freemsg: wqe being freed");
8069 		rwqe->rwqe_im_mblk = NULL;
8070 		ibd_free_rwqe(state, rwqe);
8071 		return;
8072 	}
8073 
8074 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8075 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8076 	if (rwqe->rwqe_im_mblk == NULL) {
8077 		ibd_free_rwqe(state, rwqe);
8078 		DPRINT(6, "ibd_freemsg: desballoc failed");
8079 		return;
8080 	}
8081 
8082 	ibd_post_recv(state, rwqe);
8083 }
8084 
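/*
 * Handler invoked (presumably as a soft interrupt) to reclaim Tx
 * resources by polling and draining the send completion queue.
 */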
8085 static uint_t
8086 ibd_tx_recycle(caddr_t arg)
8087 {
8088 	ibd_state_t *state = (ibd_state_t *)arg;
8089 
8090 	/*
8091 	 * Poll for completed entries
8092 	 */
8093 	ibd_poll_scq(state, state->id_scq_hdl);
8094 
8095 	return (DDI_INTR_CLAIMED);
8096 }
8097 
8098 #ifdef IBD_LOGGING
8099 static void
8100 ibd_log_init(void)
8101 {
8102 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8103 	ibd_lbuf_ndx = 0;
8104 
8105 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8106 }
8107 
8108 static void
8109 ibd_log_fini(void)
8110 {
8111 	if (ibd_lbuf)
8112 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
8113 	ibd_lbuf_ndx = 0;
8114 	ibd_lbuf = NULL;
8115 
8116 	mutex_destroy(&ibd_lbuf_lock);
8117 }
8118 
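/*
 * Append a formatted message to the global debug log buffer (ibd_lbuf),
 * which is treated as a circular buffer: the write index wraps back to
 * the start once it comes within two lines of the end.
 */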
8119 static void
8120 ibd_log(const char *fmt, ...)
8121 {
8122 	va_list	ap;
8123 	uint32_t off;
8124 	uint32_t msglen;
8125 	char tmpbuf[IBD_DMAX_LINE];
8126 
8127 	if (ibd_lbuf == NULL)
8128 		return;
8129 
8130 	va_start(ap, fmt);
8131 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8132 	va_end(ap);
8133 
8134 	if (msglen >= IBD_DMAX_LINE)
8135 		msglen = IBD_DMAX_LINE - 1;
8136 
8137 	mutex_enter(&ibd_lbuf_lock);
8138 
8139 	off = ibd_lbuf_ndx;		/* current msg should go here */
8140 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8141 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8142 
8143 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
8144 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
8145 
8146 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8147 		ibd_lbuf_ndx = 0;
8148 
8149 	mutex_exit(&ibd_lbuf_lock);
8150 
8151 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
8152 }
8153 #endif
8154 
8155 /* ARGSUSED */
8156 static int
8157 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8158     int *rvalp)
8159 {
8160 	ibd_create_ioctl_t	*cmd = karg;
8161 	ibd_state_t		*state, *port_state, *p;
8162 	int			i, err, rval = 0;
8163 	mac_register_t		*macp;
8164 	ibt_hca_portinfo_t 	*pinfop = NULL;
8165 	ibt_status_t 		ibt_status;
8166 	uint_t 			psize, pinfosz;
8167 	boolean_t		force_create = B_FALSE;
8168 
8169 	cmd->ibdioc.ioc_status = 0;
8170 
8171 	if (cmd->ibdioc.ioc_port_inst < 0) {
8172 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8173 		return (EINVAL);
8174 	}
8175 	port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8176 	if (port_state == NULL) {
8177 		DPRINT(10, "ibd_create_partition: failed to get state %d",
8178 		    cmd->ibdioc.ioc_port_inst);
8179 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8180 		return (EINVAL);
8181 	}
8182 
8183 	/* Limited PKeys not supported */
8184 	if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8185 		rval = EINVAL;
8186 		goto part_create_return;
8187 	}
8188 
8189 	if (cmd->ioc_force_create == 0) {
8190 		/*
8191 		 * Check if the port pkey table contains the pkey for which
8192 		 * this partition is being created.
8193 		 */
8194 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8195 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8196 
8197 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8198 			rval = EINVAL;
8199 			goto part_create_return;
8200 		}
8201 
8202 		if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8203 			rval = ENETDOWN;
8204 			cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8205 			goto part_create_return;
8206 		}
8207 
8208 		for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8209 			if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8210 				break;
8211 			}
8212 		}
8213 		if (i == pinfop->p_pkey_tbl_sz) {
8214 			rval = EINVAL;
8215 			cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8216 			goto part_create_return;
8217 		}
8218 	} else {
8219 		force_create = B_TRUE;
8220 	}
8221 
8222 	mutex_enter(&ibd_objlist_lock);
8223 	for (p = ibd_objlist_head; p; p = p->id_next) {
8224 		if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8225 		    (p->id_pkey == cmd->ioc_pkey)) {
8226 			mutex_exit(&ibd_objlist_lock);
8227 			rval = EEXIST;
8228 			cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8229 			goto part_create_return;
8230 		}
8231 	}
8232 	mutex_exit(&ibd_objlist_lock);
8233 
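	/*
	 * Allocate and initialize the softstate for the new partition
	 * object, inheriting the port identity (HCA/port GUIDs, port
	 * number) from the parent port instance.
	 */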
8234 	state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8235 
8236 	state->id_type		= IBD_PARTITION_OBJ;
8237 
8238 	state->id_plinkid	= cmd->ioc_partid;
8239 	state->id_dlinkid	= cmd->ibdioc.ioc_linkid;
8240 	state->id_port_inst	= cmd->ibdioc.ioc_port_inst;
8241 
8242 	state->id_dip		= port_state->id_dip;
8243 	state->id_port		= port_state->id_port;
8244 	state->id_pkey		= cmd->ioc_pkey;
8245 	state->id_hca_guid	= port_state->id_hca_guid;
8246 	state->id_port_guid	= port_state->id_port_guid;
8247 	state->id_force_create	= force_create;
8248 
8249 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8250 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8251 
8252 	if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8253 		rval = EIO;
8254 		cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8255 		goto fail;
8256 	}
8257 
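	/*
	 * Register the partition object with the mac layer and create the
	 * corresponding datalink.
	 */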
8258 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8259 		rval = EAGAIN;
8260 		goto fail;
8261 	}
8262 
8263 	macp->m_type_ident	= MAC_PLUGIN_IDENT_IB;
8264 	macp->m_dip		= port_state->id_dip;
8265 	macp->m_instance	= (uint_t)-1;
8266 	macp->m_driver		= state;
8267 	macp->m_src_addr	= (uint8_t *)&state->id_macaddr;
8268 	macp->m_callbacks	= &ibd_m_callbacks;
8269 	macp->m_min_sdu		= 0;
8270 	if (state->id_enable_rc) {
8271 		macp->m_max_sdu		= IBD_DEF_RC_MAX_SDU;
8272 	} else {
8273 		macp->m_max_sdu		= IBD_DEF_MAX_SDU;
8274 	}
8275 	macp->m_priv_props = ibd_priv_props;
8276 
8277 	err = mac_register(macp, &state->id_mh);
8278 	mac_free(macp);
8279 
8280 	if (err != 0) {
8281 		DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8282 		    err);
8283 		rval = err;
8284 		goto fail;
8285 	}
8286 
8287 	err = dls_devnet_create(state->id_mh,
8288 	    cmd->ioc_partid, crgetzoneid(credp));
8289 	if (err != 0) {
8290 		DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8291 		    "%d", err);
8292 		rval = err;
8293 		(void) mac_unregister(state->id_mh);
8294 		goto fail;
8295 	}
8296 
8297 	/*
8298 	 * Add the new partition state structure to the list
8299 	 */
8300 	mutex_enter(&ibd_objlist_lock);
8301 	if (ibd_objlist_head)
8302 		state->id_next = ibd_objlist_head;
8303 
8304 	ibd_objlist_head = state;
8305 	mutex_exit(&ibd_objlist_lock);
8306 
8307 part_create_return:
8308 	if (pinfop) {
8309 		ibt_free_portinfo(pinfop, pinfosz);
8310 	}
8311 	return (rval);
8312 
8313 fail:
8314 	if (pinfop) {
8315 		ibt_free_portinfo(pinfop, pinfosz);
8316 	}
8317 	ibd_part_unattach(state);
8318 	kmem_free(state, sizeof (ibd_state_t));
8319 	return (rval);
8320 }
8321 
8322 /* ARGSUSED */
8323 static int
8324 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8325     int *rvalp)
8326 {
8327 	int err;
8328 	datalink_id_t tmpid;
8329 	ibd_state_t *node, *prev;
8330 	ibd_delete_ioctl_t *cmd = karg;
8331 
8332 	prev = NULL;
8333 
8334 	mutex_enter(&ibd_objlist_lock);
8335 	node = ibd_objlist_head;
8336 
8337 	/* Find the ibd state structure corresponding to the partition */
8338 	while (node != NULL) {
8339 		if (node->id_plinkid == cmd->ioc_partid)
8340 			break;
8341 		prev = node;
8342 		node = node->id_next;
8343 	}
8344 
8345 	if (node == NULL) {
8346 		mutex_exit(&ibd_objlist_lock);
8347 		return (ENOENT);
8348 	}
8349 
8350 	if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8351 		DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8352 		    "%d", err);
8353 		mutex_exit(&ibd_objlist_lock);
8354 		return (err);
8355 	}
8356 
8357 	/*
8358 	 * Call ibd_part_unattach() only after making sure that the instance has
8359 	 * not been started yet and is also not in late hca init mode.
8360 	 */
8361 	ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8362 
8363 	err = 0;
8364 	if ((node->id_mac_state & IBD_DRV_STARTED) ||
8365 	    (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8366 	    (ibd_part_busy(node) != DDI_SUCCESS) ||
8367 	    ((err = mac_disable(node->id_mh)) != 0)) {
8368 		(void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8369 		    crgetzoneid(credp));
8370 		ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8371 		mutex_exit(&ibd_objlist_lock);
8372 		return (err != 0 ? err : EBUSY);
8373 	}
8374 
8375 	node->id_mac_state |= IBD_DRV_IN_DELETION;
8376 
8377 	ibd_part_unattach(node);
8378 
8379 	ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8380 
8381 	/* Remove the partition state structure from the linked list */
8382 	if (prev == NULL)
8383 		ibd_objlist_head = node->id_next;
8384 	else
8385 		prev->id_next = node->id_next;
8386 	mutex_exit(&ibd_objlist_lock);
8387 
8388 	if ((err = mac_unregister(node->id_mh)) != 0) {
8389 		DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8390 		    err);
8391 	}
8392 
8393 	cv_destroy(&node->id_macst_cv);
8394 	mutex_destroy(&node->id_macst_lock);
8395 
8396 	kmem_free(node, sizeof (ibd_state_t));
8397 
8398 	return (0);
8399 }
8400 
8401 /* ARGSUSED */
8402 static int
8403 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8404     int *rvalp)
8405 {
8406 	ibd_ioctl_t		cmd;
8407 	ibpart_ioctl_t		partioc;
8408 	ibport_ioctl_t		portioc;
8409 #ifdef _MULTI_DATAMODEL
8410 	ibport_ioctl32_t	portioc32;
8411 #endif
8412 	ibd_state_t		*state, *port_state;
8413 	int			size;
8414 	ibt_hca_portinfo_t 	*pinfop = NULL;
8415 	ibt_status_t 		ibt_status;
8416 	uint_t 			psize, pinfosz;
8417 	int			rval = 0;
8418 
8419 	size = sizeof (ibd_ioctl_t);
8420 	if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8421 		return (EFAULT);
8422 	}
8423 	cmd.ioc_status = 0;
8424 	switch (cmd.ioc_info_cmd) {
8425 	case IBD_INFO_CMD_IBPART:
8426 		size = sizeof (ibpart_ioctl_t);
8427 		if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8428 			return (EFAULT);
8429 		}
8430 
8431 		mutex_enter(&ibd_objlist_lock);
8432 		/* Find the ibd state structure corresponding to the partition */
8433 		for (state = ibd_objlist_head; state; state = state->id_next) {
8434 			if (state->id_plinkid == cmd.ioc_linkid) {
8435 				break;
8436 			}
8437 		}
8438 
8439 		if (state == NULL) {
8440 			mutex_exit(&ibd_objlist_lock);
8441 			return (ENOENT);
8442 		}
8443 
8444 		partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8445 		partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8446 		partioc.ibdioc.ioc_portnum = state->id_port;
8447 		partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8448 		partioc.ibdioc.ioc_portguid = state->id_port_guid;
8449 		partioc.ibdioc.ioc_status = 0;
8450 		partioc.ioc_partid = state->id_plinkid;
8451 		partioc.ioc_pkey = state->id_pkey;
8452 		partioc.ioc_force_create = state->id_force_create;
8453 		if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8454 			mutex_exit(&ibd_objlist_lock);
8455 			return (EFAULT);
8456 		}
8457 		mutex_exit(&ibd_objlist_lock);
8458 
8459 		break;
8460 
8461 	case IBD_INFO_CMD_IBPORT:
8462 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8463 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8464 			DPRINT(10, "ibd_create_partition: failed to get"
8465 			    " state %d", cmd.ioc_port_inst);
8466 			size = sizeof (ibd_ioctl_t);
8467 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8468 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8469 			    mode)) {
8470 				return (EFAULT);
8471 			}
8472 			return (EINVAL);
8473 		}
8474 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8475 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8476 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8477 			return (EINVAL);
8478 		}
8479 #ifdef _MULTI_DATAMODEL
8480 		switch (ddi_model_convert_from(mode & FMODELS)) {
8481 		case DDI_MODEL_ILP32: {
8482 			size = sizeof (ibport_ioctl32_t);
8483 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8484 				rval = EFAULT;
8485 				goto fail;
8486 			}
8487 			portioc32.ibdioc.ioc_status = 0;
8488 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8489 			portioc32.ibdioc.ioc_hcaguid =
8490 			    port_state->id_hca_guid;
8491 			portioc32.ibdioc.ioc_portguid =
8492 			    port_state->id_port_guid;
8493 			if (portioc32.ioc_pkey_tbl_sz !=
8494 			    pinfop->p_pkey_tbl_sz) {
8495 				rval = EINVAL;
8496 				size = sizeof (ibd_ioctl_t);
8497 				portioc32.ibdioc.ioc_status =
8498 				    IBD_INVALID_PKEY_TBL_SIZE;
8499 				if (ddi_copyout((void *)&portioc32.ibdioc,
8500 				    (void *)arg, size, mode)) {
8501 					rval = EFAULT;
8502 					goto fail;
8503 				}
8504 				goto fail;
8505 			}
8506 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8507 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8508 			    (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8509 			    mode)) {
8510 				rval = EFAULT;
8511 				goto fail;
8512 			}
8513 			size = sizeof (ibport_ioctl32_t);
8514 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8515 			    mode)) {
8516 				rval = EFAULT;
8517 				goto fail;
8518 			}
8519 			break;
8520 		}
8521 		case DDI_MODEL_NONE:
8522 			size = sizeof (ibport_ioctl_t);
8523 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8524 				rval = EFAULT;
8525 				goto fail;
8526 			}
8527 			portioc.ibdioc.ioc_status = 0;
8528 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8529 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8530 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8531 			if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8532 				rval = EINVAL;
8533 				size = sizeof (ibd_ioctl_t);
8534 				portioc.ibdioc.ioc_status =
8535 				    IBD_INVALID_PKEY_TBL_SIZE;
8536 				if (ddi_copyout((void *)&portioc.ibdioc,
8537 				    (void *)arg, size, mode)) {
8538 					rval = EFAULT;
8539 					goto fail;
8540 				}
8541 				goto fail;
8542 			}
8543 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8544 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8545 			    (void *)(portioc.ioc_pkeys), size, mode)) {
8546 				rval = EFAULT;
8547 				goto fail;
8548 			}
8549 			size = sizeof (ibport_ioctl_t);
8550 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8551 			    mode)) {
8552 				rval = EFAULT;
8553 				goto fail;
8554 			}
8555 			break;
8556 		}
8557 #else /* ! _MULTI_DATAMODEL */
8558 		size = sizeof (ibport_ioctl_t);
8559 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8560 			rval = EFAULT;
8561 			goto fail;
8562 		}
8563 		portioc.ibdioc.ioc_status = 0;
8564 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8565 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8566 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8567 		if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8568 			rval = EINVAL;
8569 			size = sizeof (ibd_ioctl_t);
8570 			portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8571 			if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8572 			    size, mode)) {
8573 				rval = EFAULT;
8574 				goto fail;
8575 			}
8576 			goto fail;
8577 		}
8578 		size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8579 		if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8580 		    (void *)(portioc.ioc_pkeys), size, mode)) {
8581 			rval = EFAULT;
8582 			goto fail;
8583 		}
8584 		size = sizeof (ibport_ioctl_t);
8585 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8586 		    mode)) {
8587 			rval = EFAULT;
8588 			goto fail;
8589 		}
8590 #endif /* _MULTI_DATAMODEL */
8591 
8592 		break;
8593 
8594 	case IBD_INFO_CMD_PKEYTBLSZ:
8595 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8596 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8597 			DPRINT(10, "ibd_create_partition: failed to get"
8598 			    " state %d", cmd.ioc_port_inst);
8599 			size = sizeof (ibd_ioctl_t);
8600 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8601 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8602 			    mode)) {
8603 				return (EFAULT);
8604 			}
8605 			return (EINVAL);
8606 		}
8607 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8608 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8609 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8610 			return (EINVAL);
8611 		}
8612 #ifdef _MULTI_DATAMODEL
8613 		switch (ddi_model_convert_from(mode & FMODELS)) {
8614 		case DDI_MODEL_ILP32: {
8615 			size = sizeof (ibport_ioctl32_t);
8616 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8617 				rval = EFAULT;
8618 				goto fail;
8619 			}
8620 			portioc32.ibdioc.ioc_status = 0;
8621 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8622 			portioc32.ibdioc.ioc_hcaguid =
8623 			    port_state->id_hca_guid;
8624 			portioc32.ibdioc.ioc_portguid =
8625 			    port_state->id_port_guid;
8626 			portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8627 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8628 			    mode)) {
8629 				rval = EFAULT;
8630 				goto fail;
8631 			}
8632 			break;
8633 		}
8634 		case DDI_MODEL_NONE:
8635 			size = sizeof (ibport_ioctl_t);
8636 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8637 				rval = EFAULT;
8638 				goto fail;
8639 			}
8640 			portioc.ibdioc.ioc_status = 0;
8641 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8642 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8643 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8644 			portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8645 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8646 			    mode)) {
8647 				rval = EFAULT;
8648 				goto fail;
8649 			}
8650 			break;
8651 		}
8652 #else /* ! _MULTI_DATAMODEL */
8653 		size = sizeof (ibport_ioctl_t);
8654 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8655 			rval = EFAULT;
8656 			goto fail;
8657 		}
8658 		portioc.ibdioc.ioc_status = 0;
8659 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8660 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8661 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8662 		portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8663 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8664 		    mode)) {
8665 			rval = EFAULT;
8666 			goto fail;
8667 		}
8668 #endif /* _MULTI_DATAMODEL */
8669 		break;
8670 
8671 	default:
8672 		return (EINVAL);
8673 
8674 	} /* switch (cmd.ioc_info_cmd) */
8675 fail:
8676 	if (pinfop) {
8677 		ibt_free_portinfo(pinfop, pinfosz);
8678 	}
8679 	return (rval);
8680 }
8681 
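/*
 * IBTF asynchronous event handler for the port driver.  On a port
 * up/down event the port state is re-queried and, if the resulting
 * link state differs from the cached one, the MAC layer is notified
 * through mac_link_update().  All other async codes are ignored.
 */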
8682 /* ARGSUSED */
8683 static void
8684 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8685     ibt_async_code_t code, ibt_async_event_t *event)
8686 {
8687 	ibd_state_t *state = (ibd_state_t *)arg;
8688 	link_state_t	lstate;
8689 
8690 	switch (code) {
8691 	case IBT_EVENT_PORT_UP:
8692 	case IBT_ERROR_PORT_DOWN:
8693 		if (ibd_get_port_state(state, &lstate) != 0)
8694 			break;
8695 
8696 		if (state->id_link_state != lstate) {
8697 			state->id_link_state = lstate;
8698 			mac_link_update(state->id_mh, lstate);
8699 		}
8700 		break;
8701 	default:
8702 		break;
8703 	}
8704 }
8705 
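/*
 * Query the HCA port backing this instance, cache its first SGID and
 * current link speed, and translate the IBTF port state into a MAC
 * link state: IBT_PORT_ACTIVE maps to LINK_STATE_UP, anything else to
 * LINK_STATE_DOWN.  Returns 0 on success, -1 if the port query fails.
 */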
8706 static int
8707 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8708 {
8709 	ibt_hca_portinfo_t *port_infop;
8710 	uint_t psize, port_infosz;
8711 	ibt_status_t	ret;
8712 
8713 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8714 	    &port_infop, &psize, &port_infosz);
8715 	if ((ret != IBT_SUCCESS) || (psize != 1))
8716 		return (-1);
8717 
8718 	state->id_sgid = *port_infop->p_sgid_tbl;
8719 	state->id_link_speed = ibd_get_portspeed(state);
8720 
8721 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8722 		*lstate = LINK_STATE_UP;
8723 	else
8724 		*lstate = LINK_STATE_DOWN;
8725 
8726 	ibt_free_portinfo(port_infop, port_infosz);
8727 	return (0);
8728 }
8729 
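/*
 * DDI attach processing for a port driver instance.  The port number,
 * HCA GUID and port GUID are read from the devinfo properties of the
 * node, after which the driver attaches to IBTF, opens the HCA, reads
 * the initial link state and registers with the MAC layer.  Each
 * completed step is recorded as an IBD_DRV_* bit in id_mac_state so
 * that a failure can be unwound through ibd_port_unattach().
 */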
8730 static int
8731 ibd_port_attach(dev_info_t *dip)
8732 {
8733 	ibd_state_t		*state;
8734 	link_state_t		lstate;
8735 	int			instance;
8736 	ibt_status_t		ret;
8737 
8738 	/*
8739 	 * Allocate softstate structure
8740 	 */
8741 	instance = ddi_get_instance(dip);
8742 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8743 		DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8744 		return (DDI_FAILURE);
8745 	}
8746 
8747 	state = ddi_get_soft_state(ibd_list, instance);
8748 
8749 	state->id_dip = dip;
8750 	state->id_type = IBD_PORT_DRIVER;
8751 
8752 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8753 	    "port-number", 0)) == 0) {
8754 		DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8755 		    state->id_port);
8756 		goto done;
8757 	}
8758 	if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8759 	    "hca-guid", 0)) == 0) {
8760 		DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8761 		    state->id_hca_guid);
8762 		goto done;
8763 	}
8764 	if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8765 	    "port-guid", 0)) == 0) {
8766 		DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8767 		    state->id_port_guid);
8768 		goto done;
8769 	}
8770 
8771 	/*
8772 	 * Attach to IBTL
8773 	 */
8774 	if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8775 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
8776 		DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8777 		    ret);
8778 		goto done;
8779 	}
8780 
8781 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8782 
8783 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8784 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
8785 		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8786 		    ret);
8787 		goto done;
8788 	}
8789 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
8790 
8791 	/* Update link status */
8792 
8793 	if (ibd_get_port_state(state, &lstate) != 0) {
8794 		DPRINT(10, "ibd_port_attach: ibd_get_port_state() "
8795 		    "failed");
8796 		goto done;
8797 	}
8798 	state->id_link_state = lstate;
8799 	/*
8800 	 * Register ibd interfaces with the Nemo framework
8801 	 */
8802 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8803 		DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8804 		goto done;
8805 	}
8806 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8807 
8808 	mac_link_update(state->id_mh, lstate);
8809 
8810 	return (DDI_SUCCESS);
8811 done:
8812 	(void) ibd_port_unattach(state, dip);
8813 	return (DDI_FAILURE);
8814 }
8815 
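/*
 * Undo ibd_port_attach().  Each teardown step is gated on the
 * corresponding IBD_DRV_* progress bit recorded during attach, so this
 * routine can safely be called from any intermediate point of a failed
 * attach as well as from a normal detach.
 */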
8816 static int
8817 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8818 {
8819 	int instance;
8820 	uint32_t progress = state->id_mac_state;
8821 	ibt_status_t ret;
8822 
8823 	if (progress & IBD_DRV_MAC_REGISTERED) {
8824 		(void) mac_unregister(state->id_mh);
8825 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8826 	}
8827 
8828 	if (progress & IBD_DRV_HCA_OPENED) {
8829 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8830 		    IBT_SUCCESS) {
8831 			ibd_print_warn(state, "failed to close "
8832 			    "HCA device, ret=%d", ret);
8833 		}
8834 		state->id_hca_hdl = NULL;
8835 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8836 	}
8837 
8838 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8839 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8840 			ibd_print_warn(state,
8841 			    "ibt_detach() failed, ret=%d", ret);
8842 		}
8843 		state->id_ibt_hdl = NULL;
8844 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8845 	}
8846 	instance = ddi_get_instance(dip);
8847 	ddi_soft_state_free(ibd_list, instance);
8848 
8849 	return (DDI_SUCCESS);
8850 }
8851 
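/*
 * Return the partition attributes for the partition object whose link
 * id (id_plinkid) matches the given linkid.  The global partition list
 * is walked under ibd_objlist_lock; IBT_NO_SUCH_OBJECT is returned if
 * no matching partition exists.
 */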
8852 ibt_status_t
8853 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8854 {
8855 	ibd_state_t	*state;
8856 
8857 	mutex_enter(&ibd_objlist_lock);
8858 
8859 	/* Find the ibd state structure corresponding to the partition */
8860 	for (state = ibd_objlist_head; state; state = state->id_next) {
8861 		if (state->id_plinkid == linkid) {
8862 			break;
8863 		}
8864 	}
8865 
8866 	if (state == NULL) {
8867 		mutex_exit(&ibd_objlist_lock);
8868 		return (IBT_NO_SUCH_OBJECT);
8869 	}
8870 
8871 	attr->pa_dlinkid = state->id_dlinkid;
8872 	attr->pa_plinkid = state->id_plinkid;
8873 	attr->pa_port = state->id_port;
8874 	attr->pa_hca_guid = state->id_hca_guid;
8875 	attr->pa_port_guid = state->id_port_guid;
8876 	attr->pa_pkey = state->id_pkey;
8877 
8878 	mutex_exit(&ibd_objlist_lock);
8879 
8880 	return (IBT_SUCCESS);
8881 }
8882 
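/*
 * Return the attributes of every partition object on the global list.
 * The attribute array is allocated here with KM_SLEEP and ownership
 * passes to the caller.  For illustration only (the exact release
 * convention is the caller's), a consumer might dispose of the array
 * with:
 *
 *	kmem_free(attrs, nparts * sizeof (ibt_part_attr_t));
 *
 * where attrs and nparts are the values returned through the attr_list
 * and nparts arguments respectively.
 */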
8883 ibt_status_t
8884 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8885 {
8886 	ibd_state_t	*state;
8887 	int		n = 0;
8888 	ibt_part_attr_t	*attr;
8889 
8890 	mutex_enter(&ibd_objlist_lock);
8891 
8892 	for (state = ibd_objlist_head; state; state = state->id_next)
8893 		n++;
8894 
8895 	*nparts = n;
8896 	if (n == 0) {
8897 		*attr_list = NULL;
8898 		mutex_exit(&ibd_objlist_lock);
8899 		return (IBT_SUCCESS);
8900 	}
8901 
8902 	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8903 	attr = *attr_list;
8904 	for (state = ibd_objlist_head; state; state = state->id_next) {
8905 #ifdef DEBUG
8906 		ASSERT(n > 0);
8907 		n--;
8908 #endif
8909 		attr->pa_dlinkid = state->id_dlinkid;
8910 		attr->pa_plinkid = state->id_plinkid;
8911 		attr->pa_port = state->id_port;
8912 		attr->pa_hca_guid = state->id_hca_guid;
8913 		attr->pa_port_guid = state->id_port_guid;
8914 		attr->pa_pkey = state->id_pkey;
8915 		attr++;
8916 	}
8917 
8918 	mutex_exit(&ibd_objlist_lock);
8919 	return (IBT_SUCCESS);
8920 }
8921