xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 6a634c9d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * An implementation of the IPoIB standard based on PSARC 2001/289.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/sysmacros.h>	/* for offsetof */
44 #include <sys/disp.h>		/* for async thread pri */
45 #include <sys/atomic.h>		/* for atomic_add*() */
46 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
47 #include <netinet/in.h>		/* for netinet/ip.h below */
48 #include <netinet/ip.h>		/* for struct ip */
49 #include <netinet/udp.h>	/* for struct udphdr */
50 #include <inet/common.h>	/* for inet/ip.h below */
51 #include <inet/ip.h>		/* for ipha_t */
52 #include <inet/ip6.h>		/* for ip6_t */
53 #include <inet/tcp.h>		/* for tcph_t */
54 #include <netinet/icmp6.h>	/* for icmp6_t */
55 #include <sys/callb.h>
56 #include <sys/modhash.h>
57 
58 #include <sys/ib/clients/ibd/ibd.h>
59 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
60 #include <sys/note.h>
61 #include <sys/multidata.h>
62 
63 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
64 
65 #include <sys/priv_names.h>
66 #include <sys/dls.h>
67 #include <sys/dld_ioc.h>
68 #include <sys/policy.h>
69 #include <sys/ibpart.h>
70 #include <sys/file.h>
71 
72 /*
73  * The write-up below includes details on the following:
74  * 1. The dladm administrative model.
75  * 2. Late HCA initialization feature.
76  * 3. Brussels support and its implications to the current architecture.
77  *
78  * 1. The dladm administrative model.
79  * ------------------------------------------
80  * With the dladm model, ibnex will create one ibd instance per port. These
81  * instances will be created independent of the port state.
82  *
83  * The ibd driver is two faceted: One side of it working as the port driver and
84  * the other as the partition object driver.
85  *
86  * The port instance is a child of the HCA, and will have an entry in the devfs.
87  * A DDI attach only happens for the port driver, and its attach is
88  * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89  * handled in ibd_port_unattach().
90  *
91  * The partition object is only a registrant to the mac layer via mac_register()
92  * and does not have an entry in the device tree. There is no DDI softstate
93  * managed by the DDI framework for the partition objects. However, the state is
94  * managed inside the ibd driver, and every partition object hangs off the
95  * "ibd_objlist_head".
96  *
97  * The partition object first comes into existence when a user runs the
98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
99  * point of the partition object. The partition object goes away with the
100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101  * point of the partition object.
102  *
103  * The create-part and delete-part subcommands result in dld ioctls that end up
104  * calling ibd_create_partition() and ibd_delete_partition() respectively.
105  * These ioctls are registered with the dld layer in _init() via a call to
106  * dld_ioc_register().
107  *
108  * The port instance by itself cannot be plumbed. It is only the partition
109  * objects that can be plumbed and they alone participate in I/O and not the
110  * port driver.
111  *
112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
113  * display useful information. The info entry point for ibd is
114  * ibd_get_partition_info().
115  *
116  * 2. Late HCA initialization feature.
117  * ------------------------------------
118  * As mentioned in section 1, the user creates the partition objects via
119  * dladm(1M). It is possible that:
120  * a) The physical port itself is down and the SM cannot be reached.
121  * b) The PKEY specified by the user has not been created in the SM yet.
122  * c) An IPoIB broadcast group for the specified PKEY is not present.
123  *
124  * In all of the above cases, complete initialization of the partition object is
125  * not possible. However, the new model allows the creation of partition
126  * objects even in such cases but will defer the initialization for later.
127  * When such a partition object is plumbed, the link state will be displayed as
128  * "down".
129  * The driver, at this point, is listening to events that herald the
130  * availability of resources -
131  * i)   LINK_UP when the link becomes available
132  * ii)  PORT_CHANGE when the PKEY has been created
133  * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134  * created
135  * via ibd_async_handler() for events i) and ii), and via
136  * ibd_snet_notices_handler() for iii.
137  * The driver handles these events (as and when they arrive) and completes the
138  * initialization of the partition object and transitions it to a usable state.
139  *
140  * 3. Brussels support and its implications to the current architecture.
141  * ---------------------------------------------------------------------
142  * The brussels support introduces two new interfaces to the ibd driver -
143  * ibd_m_getprop() and ibd_m_setprop().
144  * These interfaces allow setting and retrieval of certain properties.
145  * Some of them are public properties while most others are private properties
146  * meant to be used by developers. Tuning the latter kind can cause
147  * performance issues and should not be used without understanding the
148  * implications. All properties are specific to an instance of either the
149  * partition object or the port driver.
150  *
151  * The public properties are : mtu and linkmode.
152  * mtu is a read-only property.
153  * linkmode can take two values - UD and CM.
154  *
155  * Changing the linkmode requires some bookkeeping in the driver. The
156  * capabilities need to be re-reported to the mac layer. This is done by
157  * calling mac_capab_update().  The maxsdu is updated by calling
158  * mac_maxsdu_update2().
159  * The private properties retain their values across the change of linkmode.
160  * NOTE:
161  * - The port driver does not support any property apart from mtu.
162  * - All other properties are only meant for the partition object.
163  * - The properties cannot be set when an instance is plumbed. The
164  * instance has to be unplumbed to effect any setting.
165  */
166 
167 /*
168  * Driver wide tunables
169  *
170  * ibd_tx_softintr
171  * ibd_rx_softintr
172  *     The softintr mechanism allows ibd to avoid event queue overflows if
173  *     the receive/completion handlers turn out to be expensive. These are
174  *     enabled by default.
175  *
176  * ibd_log_sz
177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
179  *
180  */
181 uint_t ibd_rx_softintr = 1;
182 uint_t ibd_tx_softintr = 1;
183 
184 #ifdef IBD_LOGGING
185 uint_t ibd_log_sz = 0x20000;
186 #endif
187 
188 #ifdef IBD_LOGGING
189 #define	IBD_LOG_SZ			ibd_log_sz
190 #endif
191 
192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 #define	IBD_RX_POST_CNT			8
194 
195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
196 #define	IBD_LOG_RX_POST			4
197 
198 /* Minimum number of receive work requests driver needs to always have */
199 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
200 
201 /*
202  * LSO parameters
203  */
204 #define	IBD_LSO_MAXLEN			65536
205 #define	IBD_LSO_BUFSZ			8192
206 
207 /*
208  * Async operation states
209  */
210 #define	IBD_OP_NOTSTARTED		0
211 #define	IBD_OP_ONGOING			1
212 #define	IBD_OP_COMPLETED		2
213 #define	IBD_OP_ERRORED			3
214 #define	IBD_OP_ROUTERED			4
215 
216 /*
217  * Start/stop in-progress flags; note that restart must always remain
218  * the OR of start and stop flag values.
219  */
220 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
221 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
222 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
223 #define	IBD_DRV_DELETE_IN_PROGRESS	IBD_DRV_RESTART_IN_PROGRESS
224 
225 /*
226  * Miscellaneous constants
227  */
228 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
229 #define	IBD_DEF_MAX_SDU			2044
230 #define	IBD_DEF_MAX_MTU			(IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
231 #define	IBD_DEF_RC_MAX_SDU		65520
232 #define	IBD_DEF_RC_MAX_MTU		(IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
233 #define	IBD_DEFAULT_QKEY		0xB1B
234 #ifdef IBD_LOGGING
235 #define	IBD_DMAX_LINE			100
236 #endif
237 
238 /*
239  * Enumerations for link states
240  */
241 typedef enum {
242 	IBD_LINK_DOWN,
243 	IBD_LINK_UP,
244 	IBD_LINK_UP_ABSENT
245 } ibd_link_op_t;
246 
247 /*
248  * Driver State Pointer
249  */
250 void *ibd_list;
251 
252 /*
253  * Driver Global Data
254  */
255 ibd_global_state_t ibd_gstate;
256 
257 /*
258  * Partition object list
259  */
260 ibd_state_t	*ibd_objlist_head = NULL;
261 kmutex_t	ibd_objlist_lock;
262 
263 int ibd_rc_conn_timeout = 60 * 10;	/* 10 minutes */
264 
265 /*
266  * Logging
267  */
268 #ifdef IBD_LOGGING
269 kmutex_t ibd_lbuf_lock;
270 uint8_t *ibd_lbuf;
271 uint32_t ibd_lbuf_ndx;
272 #endif
273 
274 /*
275  * Required system entry points
276  */
277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
279 
280 /*
281  * Required driver entry points for GLDv3
282  */
283 static int ibd_m_stat(void *, uint_t, uint64_t *);
284 static int ibd_m_start(void *);
285 static void ibd_m_stop(void *);
286 static int ibd_m_promisc(void *, boolean_t);
287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
288 static int ibd_m_unicst(void *, const uint8_t *);
289 static mblk_t *ibd_m_tx(void *, mblk_t *);
290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
291 
292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
293     const void *);
294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
296     mac_prop_info_handle_t);
297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
298     const void *);
299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
300 
301 /*
302  * Private driver entry points for GLDv3
303  */
304 
305 /*
306  * Initialization
307  */
308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
309 static int ibd_init_txlist(ibd_state_t *);
310 static int ibd_init_rxlist(ibd_state_t *);
311 static int ibd_acache_init(ibd_state_t *);
312 #ifdef IBD_LOGGING
313 static void ibd_log_init(void);
314 #endif
315 
316 /*
317  * Termination/cleanup
318  */
319 static void ibd_state_fini(ibd_state_t *);
320 static void ibd_fini_txlist(ibd_state_t *);
321 static void ibd_fini_rxlist(ibd_state_t *);
322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
324 static void ibd_acache_fini(ibd_state_t *);
325 #ifdef IBD_LOGGING
326 static void ibd_log_fini(void);
327 #endif
328 
329 /*
330  * Allocation/acquire/map routines
331  */
332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
337     uint32_t *);
338 
339 /*
340  * Free/release/unmap routines
341  */
342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
343 static void ibd_free_tx_copybufs(ibd_state_t *);
344 static void ibd_free_rx_copybufs(ibd_state_t *);
345 static void ibd_free_rx_rsrcs(ibd_state_t *);
346 static void ibd_free_tx_lsobufs(ibd_state_t *);
347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
350 
351 /*
352  * Handlers/callback routines
353  */
354 static uint_t ibd_intr(caddr_t);
355 static uint_t ibd_tx_recycle(caddr_t);
356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
362 static void ibd_freemsg_cb(char *);
363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
364     ibt_async_event_t *);
365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
366     ibt_async_event_t *);
367 static void ibd_snet_notices_handler(void *, ib_gid_t,
368     ibt_subnet_event_code_t, ibt_subnet_event_t *);
369 
370 /*
371  * Send/receive routines
372  */
373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
377 
378 /*
379  * Threads
380  */
381 static void ibd_async_work(ibd_state_t *);
382 
383 /*
384  * Async tasks
385  */
386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
388 static void ibd_async_setprom(ibd_state_t *);
389 static void ibd_async_unsetprom(ibd_state_t *);
390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
392 static void ibd_async_txsched(ibd_state_t *);
393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
394 
395 /*
396  * Async task helpers
397  */
398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
402     ipoib_mac_t *, ipoib_mac_t *);
403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
409 static uint64_t ibd_get_portspeed(ibd_state_t *);
410 static boolean_t ibd_async_safe(ibd_state_t *);
411 static void ibd_async_done(ibd_state_t *);
412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
416 
417 /*
418  * Helpers for attach/start routines
419  */
420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
421 static int ibd_record_capab(ibd_state_t *);
422 static int ibd_get_port_details(ibd_state_t *);
423 static int ibd_alloc_cqs(ibd_state_t *);
424 static int ibd_setup_ud_channel(ibd_state_t *);
425 static int ibd_start(ibd_state_t *);
426 static int ibd_undo_start(ibd_state_t *, link_state_t);
427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
430 static void ibd_part_unattach(ibd_state_t *state);
431 static int ibd_port_attach(dev_info_t *);
432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
434 static int ibd_part_busy(ibd_state_t *);
435 
436 /*
437  * Miscellaneous helpers
438  */
439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 static void ibd_resume_transmission(ibd_state_t *);
441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
443 static void *list_get_head(list_t *);
444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
446 
447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
449 
450 #ifdef IBD_LOGGING
451 static void ibd_log(const char *, ...);
452 #endif
453 
454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
455     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
456 
457 /* Module Driver Info */
458 static struct modldrv ibd_modldrv = {
459 	&mod_driverops,			/* This one is a driver */
460 	"InfiniBand GLDv3 Driver",	/* short description */
461 	&ibd_dev_ops			/* driver specific ops */
462 };
463 
464 /* Module Linkage */
465 static struct modlinkage ibd_modlinkage = {
466 	MODREV_1, (void *)&ibd_modldrv, NULL
467 };
468 
469 /*
470  * Module (static) info passed to IBTL during ibt_attach
471  */
472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
473 	IBTI_V_CURR,
474 	IBT_NETWORK,
475 	ibd_async_handler,
476 	NULL,
477 	"IBPART"
478 };
479 
480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
481 	IBTI_V_CURR,
482 	IBT_NETWORK,
483 	ibdpd_async_handler,
484 	NULL,
485 	"IPIB"
486 };
487 
488 /*
489  * GLDv3 entry points
490  */
491 #define	IBD_M_CALLBACK_FLAGS	\
492 	(MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
493 
494 static mac_callbacks_t ibd_m_callbacks = {
495 	IBD_M_CALLBACK_FLAGS,
496 	ibd_m_stat,
497 	ibd_m_start,
498 	ibd_m_stop,
499 	ibd_m_promisc,
500 	ibd_m_multicst,
501 	ibd_m_unicst,
502 	ibd_m_tx,
503 	NULL,
504 	NULL,
505 	ibd_m_getcapab,
506 	NULL,
507 	NULL,
508 	ibd_m_setprop,
509 	ibd_m_getprop,
510 	ibd_m_propinfo
511 };
512 
513 /* Private properties */
514 char *ibd_priv_props[] = {
515 	"_ibd_broadcast_group",
516 	"_ibd_coalesce_completions",
517 	"_ibd_create_broadcast_group",
518 	"_ibd_hash_size",
519 	"_ibd_lso_enable",
520 	"_ibd_num_ah",
521 	"_ibd_num_lso_bufs",
522 	"_ibd_rc_enable_srq",
523 	"_ibd_rc_num_rwqe",
524 	"_ibd_rc_num_srq",
525 	"_ibd_rc_num_swqe",
526 	"_ibd_rc_rx_comp_count",
527 	"_ibd_rc_rx_comp_usec",
528 	"_ibd_rc_rx_copy_thresh",
529 	"_ibd_rc_rx_rwqe_thresh",
530 	"_ibd_rc_tx_comp_count",
531 	"_ibd_rc_tx_comp_usec",
532 	"_ibd_rc_tx_copy_thresh",
533 	"_ibd_ud_num_rwqe",
534 	"_ibd_ud_num_swqe",
535 	"_ibd_ud_rx_comp_count",
536 	"_ibd_ud_rx_comp_usec",
537 	"_ibd_ud_tx_comp_count",
538 	"_ibd_ud_tx_comp_usec",
539 	"_ibd_ud_tx_copy_thresh",
540 	NULL
541 };
542 
543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
546 
547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
548 	{IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
549 	    ibd_create_partition, secpolicy_dl_config},
550 	{IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
551 	    ibd_delete_partition, secpolicy_dl_config},
552 	{IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
553 	    ibd_get_partition_info, NULL}
554 };
555 
/*
 * Fill/clear <scope> and <p_key> in multicast/broadcast address
 *
 * IBD_FILL_SCOPE_PKEY() ORs, in network byte order, (scope << 16) into the
 * 32-bit word at byte offset 4 of maddr and (pkey << 16) into the 32-bit
 * word at byte offset 8 of maddr.  Since the values are OR-ed in, bits that
 * are already set in those fields are retained.
 *
 * maddr is evaluated exactly once, so an argument with side effects is safe.
 * The expansion is a brace-enclosed block, as it has always been, so existing
 * call sites are unaffected.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
{							\
	char *ibd_fsp_addr_ = (char *)(maddr);		\
							\
	*(uint32_t *)(ibd_fsp_addr_ + 4) |=		\
	    htonl((uint32_t)(scope) << 16);		\
	*(uint32_t *)(ibd_fsp_addr_ + 8) |=		\
	    htonl((uint32_t)(pkey) << 16);		\
}
566 
/*
 * Clear the <scope> and <p_key> fields of a multicast/broadcast address:
 * the 32-bit word at byte offset 4 of maddr is AND-ed with the network byte
 * order form of ~(0xF << 16), and the word at byte offset 8 with the network
 * byte order form of ~(0xFFFF << 16).  All other bits are left untouched.
 *
 * maddr is evaluated exactly once, so an argument with side effects is safe.
 * The expansion remains a brace-enclosed block.
 */
#define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
{							\
	char *ibd_csp_addr_ = (char *)(maddr);		\
							\
	*(uint32_t *)(ibd_csp_addr_ + 4) &=		\
	    htonl(~((uint32_t)0xF << 16));		\
	*(uint32_t *)(ibd_csp_addr_ + 8) &=		\
	    htonl(~((uint32_t)0xFFFF << 16));		\
}
574 
/*
 * Rudimentary debugging support
 */
#ifdef DEBUG
/* Threshold used by debug_print(); messages below this level are dropped. */
int ibd_debuglevel = 100;

/*
 * Emit a debug message through vcmn_err() at CE_CONT.
 *
 *	l	level of this message; nothing is printed when l is less
 *		than ibd_debuglevel
 *	fmt	printf-style format string, followed by its arguments
 */
void
debug_print(int l, char *fmt, ...)
{
	va_list ap;

	if (l < ibd_debuglevel)
		return;
	va_start(ap, fmt);
	vcmn_err(CE_CONT, fmt, ap);
	va_end(ap);
}
#endif
592 
593 /*
594  * Common routine to print warning messages; adds in hca guid, port number
595  * and pkey to be able to identify the IBA interface.
596  */
597 void
ibd_print_warn(ibd_state_t * state,char * fmt,...)598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
599 {
600 	ib_guid_t hca_guid;
601 	char ibd_print_buf[MAXNAMELEN + 256];
602 	int len;
603 	va_list ap;
604 	char part_name[MAXNAMELEN];
605 	datalink_id_t linkid = state->id_plinkid;
606 
607 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
608 	    0, "hca-guid", 0);
609 	(void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
610 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
611 	    "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
612 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
613 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
614 	    part_name);
615 	va_start(ap, fmt);
616 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
617 	    fmt, ap);
618 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
619 	va_end(ap);
620 }
621 
622 /*
623  * Warlock directives
624  */
625 
626 /*
627  * id_lso_lock
628  *
629  * state->id_lso->bkt_nfree may be accessed without a lock to
630  * determine the threshold at which we have to ask the nw layer
631  * to resume transmission (see ibd_resume_transmission()).
632  */
633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
634     ibd_state_t::id_lso))
_NOTE(DATA_READABLE_WITHOUT_LOCK (ibd_state_t::id_lso))635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
638 
639 /*
640  * id_scq_poll_lock
641  */
642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
643     ibd_state_t::id_scq_poll_busy))
644 
645 /*
646  * id_txpost_lock
647  */
648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
649     ibd_state_t::id_tx_head))
650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
651     ibd_state_t::id_tx_busy))
652 
653 /*
654  * id_acache_req_lock
655  */
656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
657     ibd_state_t::id_acache_req_cv))
658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
659     ibd_state_t::id_req_list))
660 _NOTE(SCHEME_PROTECTS_DATA("atomic",
661     ibd_acache_s::ac_ref))
662 
663 /*
664  * id_ac_mutex
665  *
666  * This mutex is actually supposed to protect id_ah_op as well,
667  * but this path of the code isn't clean (see update of id_ah_op
668  * in ibd_async_acache(), immediately after the call to
669  * ibd_async_mcache()). For now, we'll skip this check by
670  * declaring that id_ah_op is protected by some internal scheme
671  * that warlock isn't aware of.
672  */
673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
674     ibd_state_t::id_ah_active))
675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
676     ibd_state_t::id_ah_free))
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
678     ibd_state_t::id_ah_addr))
679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
680     ibd_state_t::id_ah_op))
681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
682     ibd_state_t::id_ah_error))
683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
684     ibd_state_t::id_ac_hot_ace))
685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
686 
687 /*
688  * id_mc_mutex
689  */
690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
691     ibd_state_t::id_mc_full))
692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
693     ibd_state_t::id_mc_non))
694 
695 /*
696  * id_trap_lock
697  */
698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
699     ibd_state_t::id_trap_cv))
700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
701     ibd_state_t::id_trap_stop))
702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
703     ibd_state_t::id_trap_inprog))
704 
705 /*
706  * id_prom_op
707  */
708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
709     ibd_state_t::id_prom_op))
710 
711 /*
712  * id_sched_lock
713  */
714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
715     ibd_state_t::id_sched_needed))
716 
717 /*
718  * id_link_mutex
719  */
720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
721     ibd_state_t::id_link_state))
722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
724     ibd_state_t::id_link_speed))
725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
726 
727 /*
728  * id_tx_list.dl_mutex
729  */
730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
731     ibd_state_t::id_tx_list.dl_head))
732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
733     ibd_state_t::id_tx_list.dl_pending_sends))
734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
735     ibd_state_t::id_tx_list.dl_cnt))
736 
737 /*
738  * id_rx_list.dl_mutex
739  */
740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
741     ibd_state_t::id_rx_list.dl_bufs_outstanding))
742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
743     ibd_state_t::id_rx_list.dl_cnt))
744 
745 /*
746  * rc_timeout_lock
747  */
748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
749     ibd_state_t::rc_timeout_start))
750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
751     ibd_state_t::rc_timeout))
752 
753 
754 /*
755  * Items protected by atomic updates
756  */
757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
758     ibd_state_s::id_brd_rcv
759     ibd_state_s::id_brd_xmt
760     ibd_state_s::id_multi_rcv
761     ibd_state_s::id_multi_xmt
762     ibd_state_s::id_num_intrs
763     ibd_state_s::id_rcv_bytes
764     ibd_state_s::id_rcv_pkt
765     ibd_state_s::id_rx_post_queue_index
766     ibd_state_s::id_tx_short
767     ibd_state_s::id_xmt_bytes
768     ibd_state_s::id_xmt_pkt
769     ibd_state_s::rc_rcv_trans_byte
770     ibd_state_s::rc_rcv_trans_pkt
771     ibd_state_s::rc_rcv_copy_byte
772     ibd_state_s::rc_rcv_copy_pkt
773     ibd_state_s::rc_xmt_bytes
774     ibd_state_s::rc_xmt_small_pkt
775     ibd_state_s::rc_xmt_fragmented_pkt
776     ibd_state_s::rc_xmt_map_fail_pkt
777     ibd_state_s::rc_xmt_map_succ_pkt
778     ibd_rc_chan_s::rcq_invoking))
779 
780 /*
781  * Non-mutex protection schemes for data elements. Almost all of
782  * these are non-shared items.
783  */
784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
785     callb_cpr
786     ib_gid_s
787     ib_header_info
788     ibd_acache_rq
789     ibd_acache_s::ac_mce
790     ibd_acache_s::ac_chan
791     ibd_mcache::mc_fullreap
792     ibd_mcache::mc_jstate
793     ibd_mcache::mc_req
794     ibd_rwqe_s
795     ibd_swqe_s
796     ibd_wqe_s
797     ibt_wr_ds_s::ds_va
798     ibt_wr_lso_s
799     ipoib_mac::ipoib_qpn
800     mac_capab_lso_s
801     msgb::b_next
802     msgb::b_cont
803     msgb::b_rptr
804     msgb::b_wptr
805     ibd_state_s::id_bgroup_created
806     ibd_state_s::id_mac_state
807     ibd_state_s::id_mtu
808     ibd_state_s::id_ud_num_rwqe
809     ibd_state_s::id_ud_num_swqe
810     ibd_state_s::id_qpnum
811     ibd_state_s::id_rcq_hdl
812     ibd_state_s::id_rx_buf_sz
813     ibd_state_s::id_rx_bufs
814     ibd_state_s::id_rx_mr_hdl
815     ibd_state_s::id_rx_wqes
816     ibd_state_s::id_rxwcs
817     ibd_state_s::id_rxwcs_size
818     ibd_state_s::id_rx_nqueues
819     ibd_state_s::id_rx_queues
820     ibd_state_s::id_scope
821     ibd_state_s::id_scq_hdl
822     ibd_state_s::id_tx_buf_sz
823     ibd_state_s::id_tx_bufs
824     ibd_state_s::id_tx_mr_hdl
825     ibd_state_s::id_tx_rel_list.dl_cnt
826     ibd_state_s::id_tx_wqes
827     ibd_state_s::id_txwcs
828     ibd_state_s::id_txwcs_size
829     ibd_state_s::rc_listen_hdl
830     ibd_state_s::rc_listen_hdl_OFED_interop
831     ibd_state_s::rc_srq_size
832     ibd_state_s::rc_srq_rwqes
833     ibd_state_s::rc_srq_rx_bufs
834     ibd_state_s::rc_srq_rx_mr_hdl
835     ibd_state_s::rc_tx_largebuf_desc_base
836     ibd_state_s::rc_tx_mr_bufs
837     ibd_state_s::rc_tx_mr_hdl
838     ipha_s
839     icmph_s
840     ibt_path_info_s::pi_sid
841     ibd_rc_chan_s::ace
842     ibd_rc_chan_s::chan_hdl
843     ibd_rc_chan_s::state
844     ibd_rc_chan_s::chan_state
845     ibd_rc_chan_s::is_tx_chan
846     ibd_rc_chan_s::rcq_hdl
847     ibd_rc_chan_s::rcq_size
848     ibd_rc_chan_s::scq_hdl
849     ibd_rc_chan_s::scq_size
850     ibd_rc_chan_s::rx_bufs
851     ibd_rc_chan_s::rx_mr_hdl
852     ibd_rc_chan_s::rx_rwqes
853     ibd_rc_chan_s::tx_wqes
854     ibd_rc_chan_s::tx_mr_bufs
855     ibd_rc_chan_s::tx_mr_hdl
856     ibd_rc_chan_s::tx_rel_list.dl_cnt
857     ibd_rc_chan_s::is_used
858     ibd_rc_tx_largebuf_s::lb_buf
859     ibd_rc_msg_hello_s
860     ibt_cm_return_args_s))
861 
862 /*
863  * ibd_rc_chan_s::next is protected by two mutexes:
864  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
865  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
866  */
867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
868     ibd_rc_chan_s::next))
869 
870 /*
871  * ibd_state_s.rc_tx_large_bufs_lock
872  */
873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
874     ibd_state_s::rc_tx_largebuf_free_head))
875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
876     ibd_state_s::rc_tx_largebuf_nfree))
877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
878     ibd_rc_tx_largebuf_s::lb_next))
879 
880 /*
881  * ibd_acache_s.tx_too_big_mutex
882  */
883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
884     ibd_acache_s::tx_too_big_ongoing))
885 
886 /*
887  * tx_wqe_list.dl_mutex
888  */
889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
890     ibd_rc_chan_s::tx_wqe_list.dl_head))
891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
892     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
894     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
895 
896 /*
897  * ibd_state_s.rc_ace_recycle_lock
898  */
899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
900     ibd_state_s::rc_ace_recycle))
901 
902 /*
903  * rc_srq_rwqe_list.dl_mutex
904  */
905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
906     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
908     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
909 
910 /*
911  * Non-mutex protection schemes for data elements. These are counters
912  * used only for problem diagnosis, so they do not need to be protected.
913  */
914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
915     ibd_state_s::rc_rcv_alloc_fail
916     ibd_state_s::rc_rcq_err
917     ibd_state_s::rc_ace_not_found
918     ibd_state_s::rc_xmt_drop_too_long_pkt
919     ibd_state_s::rc_xmt_icmp_too_long_pkt
920     ibd_state_s::rc_xmt_reenter_too_long_pkt
921     ibd_state_s::rc_swqe_short
922     ibd_state_s::rc_swqe_mac_update
923     ibd_state_s::rc_xmt_buf_short
924     ibd_state_s::rc_xmt_buf_mac_update
925     ibd_state_s::rc_scq_no_swqe
926     ibd_state_s::rc_scq_no_largebuf
927     ibd_state_s::rc_conn_succ
928     ibd_state_s::rc_conn_fail
929     ibd_state_s::rc_null_conn
930     ibd_state_s::rc_no_estab_conn
931     ibd_state_s::rc_act_close
932     ibd_state_s::rc_pas_close
933     ibd_state_s::rc_delay_ace_recycle
934     ibd_state_s::rc_act_close_simultaneous
935     ibd_state_s::rc_act_close_not_clean
936     ibd_state_s::rc_pas_close_rcq_invoking
937     ibd_state_s::rc_reset_cnt
938     ibd_state_s::rc_timeout_act
939     ibd_state_s::rc_timeout_pas
940     ibd_state_s::rc_stop_connect))
941 
942 #ifdef DEBUG
943 /*
944  * Non-mutex protection schemes for data elements. These are counters
945  * used only for problem diagnosis, so they do not need to be protected.
946  */
947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
948     ibd_state_s::rc_rwqe_short
949     ibd_rc_stat_s::rc_rcv_trans_byte
950     ibd_rc_stat_s::rc_rcv_trans_pkt
951     ibd_rc_stat_s::rc_rcv_copy_byte
952     ibd_rc_stat_s::rc_rcv_copy_pkt
953     ibd_rc_stat_s::rc_rcv_alloc_fail
954     ibd_rc_stat_s::rc_rcq_err
955     ibd_rc_stat_s::rc_rwqe_short
956     ibd_rc_stat_s::rc_xmt_bytes
957     ibd_rc_stat_s::rc_xmt_small_pkt
958     ibd_rc_stat_s::rc_xmt_fragmented_pkt
959     ibd_rc_stat_s::rc_xmt_map_fail_pkt
960     ibd_rc_stat_s::rc_xmt_map_succ_pkt
961     ibd_rc_stat_s::rc_ace_not_found
962     ibd_rc_stat_s::rc_scq_no_swqe
963     ibd_rc_stat_s::rc_scq_no_largebuf
964     ibd_rc_stat_s::rc_swqe_short
965     ibd_rc_stat_s::rc_swqe_mac_update
966     ibd_rc_stat_s::rc_xmt_buf_short
967     ibd_rc_stat_s::rc_xmt_buf_mac_update
968     ibd_rc_stat_s::rc_conn_succ
969     ibd_rc_stat_s::rc_conn_fail
970     ibd_rc_stat_s::rc_null_conn
971     ibd_rc_stat_s::rc_no_estab_conn
972     ibd_rc_stat_s::rc_act_close
973     ibd_rc_stat_s::rc_pas_close
974     ibd_rc_stat_s::rc_delay_ace_recycle
975     ibd_rc_stat_s::rc_act_close_simultaneous
976     ibd_rc_stat_s::rc_reset_cnt
977     ibd_rc_stat_s::rc_timeout_act
978     ibd_rc_stat_s::rc_timeout_pas))
979 #endif
980 
981 int
982 _init()
983 {
984 	int status;
985 
986 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
987 	    PAGESIZE), 0);
988 	if (status != 0) {
989 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
990 		return (status);
991 	}
992 
993 	mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
994 
995 	mac_init_ops(&ibd_dev_ops, "ibp");
996 	status = mod_install(&ibd_modlinkage);
997 	if (status != 0) {
998 		DPRINT(10, "_init:failed in mod_install()");
999 		ddi_soft_state_fini(&ibd_list);
1000 		mac_fini_ops(&ibd_dev_ops);
1001 		return (status);
1002 	}
1003 
1004 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1005 	mutex_enter(&ibd_gstate.ig_mutex);
1006 	ibd_gstate.ig_ibt_hdl = NULL;
1007 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1008 	ibd_gstate.ig_service_list = NULL;
1009 	mutex_exit(&ibd_gstate.ig_mutex);
1010 
1011 	if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1012 	    DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1013 		return (EIO);
1014 	}
1015 
1016 	ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1017 
1018 #ifdef IBD_LOGGING
1019 	ibd_log_init();
1020 #endif
1021 	return (0);
1022 }
1023 
1024 int
_info(struct modinfo * modinfop)1025 _info(struct modinfo *modinfop)
1026 {
1027 	return (mod_info(&ibd_modlinkage, modinfop));
1028 }
1029 
1030 int
_fini()1031 _fini()
1032 {
1033 	int status;
1034 
1035 	status = mod_remove(&ibd_modlinkage);
1036 	if (status != 0)
1037 		return (status);
1038 
1039 	ibt_unregister_part_attr_cb();
1040 
1041 	mac_fini_ops(&ibd_dev_ops);
1042 	mutex_destroy(&ibd_objlist_lock);
1043 	ddi_soft_state_fini(&ibd_list);
1044 	mutex_destroy(&ibd_gstate.ig_mutex);
1045 #ifdef IBD_LOGGING
1046 	ibd_log_fini();
1047 #endif
1048 	return (0);
1049 }
1050 
1051 /*
1052  * Convert the GID part of the mac address from network byte order
1053  * to host order.
1054  */
1055 static void
ibd_n2h_gid(ipoib_mac_t * mac,ib_gid_t * dgid)1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1057 {
1058 	ib_sn_prefix_t nbopref;
1059 	ib_guid_t nboguid;
1060 
1061 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1062 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1063 	dgid->gid_prefix = b2h64(nbopref);
1064 	dgid->gid_guid = b2h64(nboguid);
1065 }
1066 
1067 /*
1068  * Create the IPoIB address in network byte order from host order inputs.
1069  */
1070 static void
ibd_h2n_mac(ipoib_mac_t * mac,ib_qpn_t qpn,ib_sn_prefix_t prefix,ib_guid_t guid)1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1072     ib_guid_t guid)
1073 {
1074 	ib_sn_prefix_t nbopref;
1075 	ib_guid_t nboguid;
1076 
1077 	mac->ipoib_qpn = htonl(qpn);
1078 	nbopref = h2b64(prefix);
1079 	nboguid = h2b64(guid);
1080 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1081 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1082 }
1083 
1084 /*
1085  * Send to the appropriate all-routers group when the IBA multicast group
1086  * does not exist, based on whether the target group is v4 or v6.
1087  */
1088 static boolean_t
ibd_get_allroutergroup(ibd_state_t * state,ipoib_mac_t * mcmac,ipoib_mac_t * rmac)1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1090     ipoib_mac_t *rmac)
1091 {
1092 	boolean_t retval = B_TRUE;
1093 	uint32_t adjscope = state->id_scope << 16;
1094 	uint32_t topword;
1095 
1096 	/*
1097 	 * Copy the first 4 bytes in without assuming any alignment of
1098 	 * input mac address; this will have IPoIB signature, flags and
1099 	 * scope bits.
1100 	 */
1101 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1102 	topword = ntohl(topword);
1103 
1104 	/*
1105 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1106 	 */
1107 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1108 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1109 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1110 		    ((uint32_t)(state->id_pkey << 16))),
1111 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1112 	else
1113 		/*
1114 		 * Does not have proper bits in the mgid address.
1115 		 */
1116 		retval = B_FALSE;
1117 
1118 	return (retval);
1119 }
1120 
1121 /*
1122  * Membership states for different mcg's are tracked by two lists:
1123  * the "non" list is used for promiscuous mode, when all mcg traffic
1124  * needs to be inspected. This type of membership is never used for
1125  * transmission, so there can not be an AH in the active list
1126  * corresponding to a member in this list. This list does not need
1127  * any protection, since all operations are performed by the async
1128  * thread.
1129  *
1130  * "Full" and "SendOnly" membership is tracked using a single list,
1131  * the "full" list. This is because this single list can then be
1132  * searched during transmit to a multicast group (if an AH for the
1133  * mcg is not found in the active list), since at least one type
1134  * of membership must be present before initiating the transmit.
1135  * This list is also emptied during driver detach, since sendonly
1136  * membership acquired during transmit is dropped at detach time
1137  * along with ipv4 broadcast full membership. Insert/deletes to
1138  * this list are done only by the async thread, but it is also
1139  * searched in program context (see multicast disable case), thus
1140  * the id_mc_mutex protects the list. The driver detach path also
1141  * deconstructs the "full" list, but it ensures that the async
1142  * thread will not be accessing the list (by blocking out mcg
1143  * trap handling and making sure no more Tx reaping will happen).
1144  *
1145  * Currently, an IBA attach is done in the SendOnly case too,
1146  * although this is not required.
1147  */
/*
 * Accessors for the "full" and "non" membership lists. Every macro
 * argument is parenthesized so that an arbitrary expression can be passed
 * without changing how the expansion parses.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&(state)->id_mc_non, (mce))
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&(state)->id_mc_non, (mce))
1160 
1161 static void *
list_get_head(list_t * list)1162 list_get_head(list_t *list)
1163 {
1164 	list_node_t *lhead = list_head(list);
1165 
1166 	if (lhead != NULL)
1167 		list_remove(list, lhead);
1168 	return (lhead);
1169 }
1170 
1171 /*
1172  * This is always guaranteed to be able to queue the work.
1173  */
1174 void
ibd_queue_work_slot(ibd_state_t * state,ibd_req_t * ptr,int op)1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1176 {
1177 	/* Initialize request */
1178 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1179 	ptr->rq_op = op;
1180 
1181 	/*
1182 	 * Queue provided slot onto request pool.
1183 	 */
1184 	mutex_enter(&state->id_acache_req_lock);
1185 	list_insert_tail(&state->id_req_list, ptr);
1186 
1187 	/* Go, fetch, async thread */
1188 	cv_signal(&state->id_acache_req_cv);
1189 	mutex_exit(&state->id_acache_req_lock);
1190 }
1191 
1192 /*
1193  * Main body of the per interface async thread.
1194  */
1195 static void
ibd_async_work(ibd_state_t * state)1196 ibd_async_work(ibd_state_t *state)
1197 {
1198 	ibd_req_t *ptr;
1199 	callb_cpr_t cprinfo;
1200 
1201 	mutex_enter(&state->id_acache_req_lock);
1202 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1203 	    callb_generic_cpr, "ibd_async_work");
1204 
1205 	for (;;) {
1206 		ptr = list_get_head(&state->id_req_list);
1207 		if (ptr != NULL) {
1208 			mutex_exit(&state->id_acache_req_lock);
1209 
1210 			/*
1211 			 * If we are in late hca initialization mode, do not
1212 			 * process any other async request other than TRAP. TRAP
1213 			 * is used for indicating creation of a broadcast group;
1214 			 * in which case, we need to join/create the group.
1215 			 */
1216 			if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1217 			    (ptr->rq_op != IBD_ASYNC_TRAP)) {
1218 				goto free_req_and_continue;
1219 			}
1220 
1221 			/*
1222 			 * Once we have done the operation, there is no
1223 			 * guarantee the request slot is going to be valid,
1224 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1225 			 * TRAP).
1226 			 *
1227 			 * Perform the request.
1228 			 */
1229 			switch (ptr->rq_op) {
1230 				case IBD_ASYNC_GETAH:
1231 					ibd_async_acache(state, &ptr->rq_mac);
1232 					break;
1233 				case IBD_ASYNC_JOIN:
1234 				case IBD_ASYNC_LEAVE:
1235 					ibd_async_multicast(state,
1236 					    ptr->rq_gid, ptr->rq_op);
1237 					break;
1238 				case IBD_ASYNC_PROMON:
1239 					ibd_async_setprom(state);
1240 					break;
1241 				case IBD_ASYNC_PROMOFF:
1242 					ibd_async_unsetprom(state);
1243 					break;
1244 				case IBD_ASYNC_REAP:
1245 					ibd_async_reap_group(state,
1246 					    ptr->rq_ptr, ptr->rq_gid,
1247 					    IB_MC_JSTATE_FULL);
1248 					/*
1249 					 * the req buf contains in mce
1250 					 * structure, so we do not need
1251 					 * to free it here.
1252 					 */
1253 					ptr = NULL;
1254 					break;
1255 				case IBD_ASYNC_TRAP:
1256 					ibd_async_trap(state, ptr);
1257 					break;
1258 				case IBD_ASYNC_SCHED:
1259 					ibd_async_txsched(state);
1260 					break;
1261 				case IBD_ASYNC_LINK:
1262 					ibd_async_link(state, ptr);
1263 					break;
1264 				case IBD_ASYNC_EXIT:
1265 					mutex_enter(&state->id_acache_req_lock);
1266 #ifndef __lock_lint
1267 					CALLB_CPR_EXIT(&cprinfo);
1268 #else
1269 					mutex_exit(&state->id_acache_req_lock);
1270 #endif
1271 					return;
1272 				case IBD_ASYNC_RC_TOO_BIG:
1273 					ibd_async_rc_process_too_big(state,
1274 					    ptr);
1275 					break;
1276 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1277 					ibd_async_rc_close_act_chan(state, ptr);
1278 					break;
1279 				case IBD_ASYNC_RC_RECYCLE_ACE:
1280 					ibd_async_rc_recycle_ace(state, ptr);
1281 					break;
1282 				case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1283 					(void) ibd_rc_pas_close(ptr->rq_ptr,
1284 					    B_TRUE, B_TRUE);
1285 					break;
1286 			}
1287 free_req_and_continue:
1288 			if (ptr != NULL)
1289 				kmem_cache_free(state->id_req_kmc, ptr);
1290 
1291 			mutex_enter(&state->id_acache_req_lock);
1292 		} else {
1293 #ifndef __lock_lint
1294 			/*
1295 			 * Nothing to do: wait till new request arrives.
1296 			 */
1297 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298 			cv_wait(&state->id_acache_req_cv,
1299 			    &state->id_acache_req_lock);
1300 			CALLB_CPR_SAFE_END(&cprinfo,
1301 			    &state->id_acache_req_lock);
1302 #endif
1303 		}
1304 	}
1305 
1306 	/*NOTREACHED*/
1307 	_NOTE(NOT_REACHED)
1308 }
1309 
1310 /*
1311  * Return when it is safe to queue requests to the async daemon; primarily
1312  * for subnet trap and async event handling. Disallow requests before the
1313  * daemon is created, and when interface deinitilization starts.
1314  */
1315 static boolean_t
ibd_async_safe(ibd_state_t * state)1316 ibd_async_safe(ibd_state_t *state)
1317 {
1318 	mutex_enter(&state->id_trap_lock);
1319 	if (state->id_trap_stop) {
1320 		mutex_exit(&state->id_trap_lock);
1321 		return (B_FALSE);
1322 	}
1323 	state->id_trap_inprog++;
1324 	mutex_exit(&state->id_trap_lock);
1325 	return (B_TRUE);
1326 }
1327 
1328 /*
1329  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1330  * trap or event handling to complete to kill the async thread and deconstruct
1331  * the mcg/ace list.
1332  */
1333 static void
ibd_async_done(ibd_state_t * state)1334 ibd_async_done(ibd_state_t *state)
1335 {
1336 	mutex_enter(&state->id_trap_lock);
1337 	if (--state->id_trap_inprog == 0)
1338 		cv_signal(&state->id_trap_cv);
1339 	mutex_exit(&state->id_trap_lock);
1340 }
1341 
1342 /*
1343  * Hash functions:
1344  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1345  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1346  * These operate on mac addresses input into ibd_send, but there is no
1347  * guarantee on the alignment of the ipoib_mac_t structure.
1348  */
1349 /*ARGSUSED*/
1350 static uint_t
ibd_hash_by_id(void * hash_data,mod_hash_key_t key)1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1352 {
1353 	ulong_t ptraddr = (ulong_t)key;
1354 	uint_t hval;
1355 
1356 	/*
1357 	 * If the input address is 4 byte aligned, we can just dereference
1358 	 * it. This is most common, since IP will send in a 4 byte aligned
1359 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1360 	 * 4 byte aligned too.
1361 	 */
1362 	if ((ptraddr & 3) == 0)
1363 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1364 
1365 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1366 	return (hval);
1367 }
1368 
1369 static int
ibd_hash_key_cmp(mod_hash_key_t key1,mod_hash_key_t key2)1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1371 {
1372 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1373 		return (0);
1374 	else
1375 		return (1);
1376 }
1377 
1378 /*
1379  * Initialize all the per interface caches and lists; AH cache,
1380  * MCG list etc.
1381  */
1382 static int
ibd_acache_init(ibd_state_t * state)1383 ibd_acache_init(ibd_state_t *state)
1384 {
1385 	ibd_ace_t *ce;
1386 	int i;
1387 
1388 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1389 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1390 	mutex_enter(&state->id_ac_mutex);
1391 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1392 	    offsetof(ibd_ace_t, ac_list));
1393 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1394 	    offsetof(ibd_ace_t, ac_list));
1395 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1396 	    state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1397 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1398 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1399 	    offsetof(ibd_mce_t, mc_list));
1400 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1401 	    offsetof(ibd_mce_t, mc_list));
1402 	state->id_ac_hot_ace = NULL;
1403 
1404 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1405 	    state->id_num_ah, KM_SLEEP);
1406 	for (i = 0; i < state->id_num_ah; i++, ce++) {
1407 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1408 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1409 			mutex_exit(&state->id_ac_mutex);
1410 			ibd_acache_fini(state);
1411 			return (DDI_FAILURE);
1412 		} else {
1413 			CLEAR_REFCYCLE(ce);
1414 			ce->ac_mce = NULL;
1415 			mutex_init(&ce->tx_too_big_mutex, NULL,
1416 			    MUTEX_DRIVER, NULL);
1417 			IBD_ACACHE_INSERT_FREE(state, ce);
1418 		}
1419 	}
1420 	mutex_exit(&state->id_ac_mutex);
1421 	return (DDI_SUCCESS);
1422 }
1423 
1424 static void
ibd_acache_fini(ibd_state_t * state)1425 ibd_acache_fini(ibd_state_t *state)
1426 {
1427 	ibd_ace_t *ptr;
1428 
1429 	mutex_enter(&state->id_ac_mutex);
1430 
1431 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1432 		ASSERT(GET_REF(ptr) == 0);
1433 		mutex_destroy(&ptr->tx_too_big_mutex);
1434 		(void) ibt_free_ud_dest(ptr->ac_dest);
1435 	}
1436 
1437 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1438 		ASSERT(GET_REF(ptr) == 0);
1439 		mutex_destroy(&ptr->tx_too_big_mutex);
1440 		(void) ibt_free_ud_dest(ptr->ac_dest);
1441 	}
1442 
1443 	list_destroy(&state->id_ah_free);
1444 	list_destroy(&state->id_ah_active);
1445 	list_destroy(&state->id_mc_full);
1446 	list_destroy(&state->id_mc_non);
1447 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1448 	mutex_exit(&state->id_ac_mutex);
1449 	mutex_destroy(&state->id_ac_mutex);
1450 	mutex_destroy(&state->id_mc_mutex);
1451 }
1452 
1453 /*
1454  * Search AH active hash list for a cached path to input destination.
1455  * If we are "just looking", hold == F. When we are in the Tx path,
1456  * we set hold == T to grab a reference on the AH so that it can not
1457  * be recycled to a new destination while the Tx request is posted.
1458  */
1459 ibd_ace_t *
ibd_acache_find(ibd_state_t * state,ipoib_mac_t * mac,boolean_t hold,int num)1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1461 {
1462 	ibd_ace_t *ptr;
1463 
1464 	ASSERT(mutex_owned(&state->id_ac_mutex));
1465 
1466 	/*
1467 	 * Do hash search.
1468 	 */
1469 	if (mod_hash_find(state->id_ah_active_hash,
1470 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1471 		if (hold)
1472 			INC_REF(ptr, num);
1473 		return (ptr);
1474 	}
1475 	return (NULL);
1476 }
1477 
1478 /*
1479  * This is called by the tx side; if an initialized AH is found in
1480  * the active list, it is locked down and can be used; if no entry
1481  * is found, an async request is queued to do path resolution.
1482  */
1483 static ibd_ace_t *
ibd_acache_lookup(ibd_state_t * state,ipoib_mac_t * mac,int * err,int numwqe)1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1485 {
1486 	ibd_ace_t *ptr;
1487 	ibd_req_t *req;
1488 
1489 	/*
1490 	 * Only attempt to print when we can; in the mdt pattr case, the
1491 	 * address is not aligned properly.
1492 	 */
1493 	if (((ulong_t)mac & 3) == 0) {
1494 		DPRINT(4,
1495 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1496 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1497 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1498 		    htonl(mac->ipoib_gidsuff[1]));
1499 	}
1500 
1501 	mutex_enter(&state->id_ac_mutex);
1502 
1503 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1504 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1505 		INC_REF(ptr, numwqe);
1506 		mutex_exit(&state->id_ac_mutex);
1507 		return (ptr);
1508 	}
1509 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1510 		state->id_ac_hot_ace = ptr;
1511 		mutex_exit(&state->id_ac_mutex);
1512 		return (ptr);
1513 	}
1514 
1515 	/*
1516 	 * Implementation of a single outstanding async request; if
1517 	 * the operation is not started yet, queue a request and move
1518 	 * to ongoing state. Remember in id_ah_addr for which address
1519 	 * we are queueing the request, in case we need to flag an error;
1520 	 * Any further requests, for the same or different address, until
1521 	 * the operation completes, is sent back to GLDv3 to be retried.
1522 	 * The async thread will update id_ah_op with an error indication
1523 	 * or will set it to indicate the next look up can start; either
1524 	 * way, it will mac_tx_update() so that all blocked requests come
1525 	 * back here.
1526 	 */
1527 	*err = EAGAIN;
1528 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1529 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1530 		if (req != NULL) {
1531 			/*
1532 			 * We did not even find the entry; queue a request
1533 			 * for it.
1534 			 */
1535 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1536 			state->id_ah_op = IBD_OP_ONGOING;
1537 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1538 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1539 		}
1540 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1541 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1542 		/*
1543 		 * Check the status of the pathrecord lookup request
1544 		 * we had queued before.
1545 		 */
1546 		if (state->id_ah_op == IBD_OP_ERRORED) {
1547 			*err = EFAULT;
1548 			state->id_ah_error++;
1549 		} else {
1550 			/*
1551 			 * IBD_OP_ROUTERED case: We need to send to the
1552 			 * all-router MCG. If we can find the AH for
1553 			 * the mcg, the Tx will be attempted. If we
1554 			 * do not find the AH, we return NORESOURCES
1555 			 * to retry.
1556 			 */
1557 			ipoib_mac_t routermac;
1558 
1559 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1560 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1561 			    numwqe);
1562 		}
1563 		state->id_ah_op = IBD_OP_NOTSTARTED;
1564 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1565 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1566 		/*
1567 		 * This case can happen when we get a higher band
1568 		 * packet. The easiest way is to reset the state machine
1569 		 * to accommodate the higher priority packet.
1570 		 */
1571 		state->id_ah_op = IBD_OP_NOTSTARTED;
1572 	}
1573 	mutex_exit(&state->id_ac_mutex);
1574 
1575 	return (ptr);
1576 }
1577 
1578 /*
1579  * Grab a not-currently-in-use AH/PathRecord from the active
1580  * list to recycle to a new destination. Only the async thread
1581  * executes this code.
1582  */
1583 static ibd_ace_t *
ibd_acache_get_unref(ibd_state_t * state)1584 ibd_acache_get_unref(ibd_state_t *state)
1585 {
1586 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1587 	boolean_t try_rc_chan_recycle = B_FALSE;
1588 
1589 	ASSERT(mutex_owned(&state->id_ac_mutex));
1590 
1591 	/*
1592 	 * Do plain linear search.
1593 	 */
1594 	while (ptr != NULL) {
1595 		/*
1596 		 * Note that it is possible that the "cycle" bit
1597 		 * is set on the AH w/o any reference count. The
1598 		 * mcg must have been deleted, and the tx cleanup
1599 		 * just decremented the reference count to 0, but
1600 		 * hasn't gotten around to grabbing the id_ac_mutex
1601 		 * to move the AH into the free list.
1602 		 */
1603 		if (GET_REF(ptr) == 0) {
1604 			if (ptr->ac_chan != NULL) {
1605 				ASSERT(state->id_enable_rc == B_TRUE);
1606 				if (!try_rc_chan_recycle) {
1607 					try_rc_chan_recycle = B_TRUE;
1608 					ibd_rc_signal_ace_recycle(state, ptr);
1609 				}
1610 			} else {
1611 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1612 				break;
1613 			}
1614 		}
1615 		ptr = list_prev(&state->id_ah_active, ptr);
1616 	}
1617 	return (ptr);
1618 }
1619 
1620 /*
1621  * Invoked to clean up AH from active list in case of multicast
1622  * disable and to handle sendonly memberships during mcg traps.
1623  * And for port up processing for multicast and unicast AHs.
1624  * Normally, the AH is taken off the active list, and put into
1625  * the free list to be recycled for a new destination. In case
1626  * Tx requests on the AH have not completed yet, the AH is marked
1627  * for reaping (which will put the AH on the free list) once the Tx's
1628  * complete; in this case, depending on the "force" input, we take
1629  * out the AH from the active list right now, or leave it also for
1630  * the reap operation. Returns TRUE if the AH is taken off the active
1631  * list (and either put into the free list right now, or arranged for
1632  * later), FALSE otherwise.
1633  */
1634 boolean_t
ibd_acache_recycle(ibd_state_t * state,ipoib_mac_t * acmac,boolean_t force)1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1636 {
1637 	ibd_ace_t *acactive;
1638 	boolean_t ret = B_TRUE;
1639 
1640 	ASSERT(mutex_owned(&state->id_ac_mutex));
1641 
1642 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1643 
1644 		/*
1645 		 * Note that the AH might already have the cycle bit set
1646 		 * on it; this might happen if sequences of multicast
1647 		 * enables and disables are coming so fast, that posted
1648 		 * Tx's to the mcg have not completed yet, and the cycle
1649 		 * bit is set successively by each multicast disable.
1650 		 */
1651 		if (SET_CYCLE_IF_REF(acactive)) {
1652 			if (!force) {
1653 				/*
1654 				 * The ace is kept on the active list, further
1655 				 * Tx's can still grab a reference on it; the
1656 				 * ace is reaped when all pending Tx's
1657 				 * referencing the AH complete.
1658 				 */
1659 				ret = B_FALSE;
1660 			} else {
1661 				/*
1662 				 * In the mcg trap case, we always pull the
1663 				 * AH from the active list. And also the port
1664 				 * up multi/unicast case.
1665 				 */
1666 				ASSERT(acactive->ac_chan == NULL);
1667 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1668 				acactive->ac_mce = NULL;
1669 			}
1670 		} else {
1671 			/*
1672 			 * Determined the ref count is 0, thus reclaim
1673 			 * immediately after pulling out the ace from
1674 			 * the active list.
1675 			 */
1676 			ASSERT(acactive->ac_chan == NULL);
1677 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1678 			acactive->ac_mce = NULL;
1679 			IBD_ACACHE_INSERT_FREE(state, acactive);
1680 		}
1681 
1682 	}
1683 	return (ret);
1684 }
1685 
1686 /*
1687  * Helper function for async path record lookup. If we are trying to
1688  * Tx to a MCG, check our membership, possibly trying to join the
1689  * group if required. If that fails, try to send the packet to the
1690  * all router group (indicated by the redirect output), pointing
1691  * the input mac address to the router mcg address.
1692  */
1693 static ibd_mce_t *
ibd_async_mcache(ibd_state_t * state,ipoib_mac_t * mac,boolean_t * redirect)1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1695 {
1696 	ib_gid_t mgid;
1697 	ibd_mce_t *mce;
1698 	ipoib_mac_t routermac;
1699 
1700 	*redirect = B_FALSE;
1701 	ibd_n2h_gid(mac, &mgid);
1702 
1703 	/*
1704 	 * Check the FullMember+SendOnlyNonMember list.
1705 	 * Since we are the only one who manipulates the
1706 	 * id_mc_full list, no locks are needed.
1707 	 */
1708 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1709 	if (mce != NULL) {
1710 		DPRINT(4, "ibd_async_mcache : already joined to group");
1711 		return (mce);
1712 	}
1713 
1714 	/*
1715 	 * Not found; try to join(SendOnlyNonMember) and attach.
1716 	 */
1717 	DPRINT(4, "ibd_async_mcache : not joined to group");
1718 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1719 	    NULL) {
1720 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1721 		return (mce);
1722 	}
1723 
1724 	/*
1725 	 * MCGroup not present; try to join the all-router group. If
1726 	 * any of the following steps succeed, we will be redirecting
1727 	 * to the all router group.
1728 	 */
1729 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1730 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1731 		return (NULL);
1732 	*redirect = B_TRUE;
1733 	ibd_n2h_gid(&routermac, &mgid);
1734 	bcopy(&routermac, mac, IPOIB_ADDRL);
1735 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1736 	    mgid.gid_prefix, mgid.gid_guid);
1737 
1738 	/*
1739 	 * Are we already joined to the router group?
1740 	 */
1741 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1742 		DPRINT(4, "ibd_async_mcache : using already joined router"
1743 		    "group\n");
1744 		return (mce);
1745 	}
1746 
1747 	/*
1748 	 * Can we join(SendOnlyNonMember) the router group?
1749 	 */
1750 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1751 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1752 	    NULL) {
1753 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1754 		return (mce);
1755 	}
1756 
1757 	return (NULL);
1758 }
1759 
1760 /*
1761  * Async path record lookup code.
1762  */
1763 static void
ibd_async_acache(ibd_state_t * state,ipoib_mac_t * mac)1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1765 {
1766 	ibd_ace_t *ce;
1767 	ibd_mce_t *mce = NULL;
1768 	ibt_path_attr_t path_attr;
1769 	ibt_path_info_t path_info;
1770 	ib_gid_t destgid;
1771 	char ret = IBD_OP_NOTSTARTED;
1772 
1773 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1774 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1775 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1776 	    htonl(mac->ipoib_gidsuff[1]));
1777 
1778 	/*
1779 	 * Check whether we are trying to transmit to a MCG.
1780 	 * In that case, we need to make sure we are a member of
1781 	 * the MCG.
1782 	 */
1783 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1784 		boolean_t redirected;
1785 
1786 		/*
1787 		 * If we can not find or join the group or even
1788 		 * redirect, error out.
1789 		 */
1790 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1791 		    NULL) {
1792 			state->id_ah_op = IBD_OP_ERRORED;
1793 			return;
1794 		}
1795 
1796 		/*
1797 		 * If we got redirected, we need to determine whether
1798 		 * the AH for the new mcg is in the cache already, and
1799 		 * not pull it in then; otherwise proceed to get the
1800 		 * path for the new mcg. There is no guarantee that
1801 		 * if the AH is currently in the cache, it will still be
1802 		 * there when we look in ibd_acache_lookup(), but that's
1803 		 * okay, we will come back here.
1804 		 */
1805 		if (redirected) {
1806 			ret = IBD_OP_ROUTERED;
1807 			DPRINT(4, "ibd_async_acache :  redirected to "
1808 			    "%08X:%08X:%08X:%08X:%08X",
1809 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1810 			    htonl(mac->ipoib_gidpref[1]),
1811 			    htonl(mac->ipoib_gidsuff[0]),
1812 			    htonl(mac->ipoib_gidsuff[1]));
1813 
1814 			mutex_enter(&state->id_ac_mutex);
1815 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1816 				state->id_ah_op = IBD_OP_ROUTERED;
1817 				mutex_exit(&state->id_ac_mutex);
1818 				DPRINT(4, "ibd_async_acache : router AH found");
1819 				return;
1820 			}
1821 			mutex_exit(&state->id_ac_mutex);
1822 		}
1823 	}
1824 
1825 	/*
1826 	 * Get an AH from the free list.
1827 	 */
1828 	mutex_enter(&state->id_ac_mutex);
1829 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1830 		/*
1831 		 * No free ones; try to grab an unreferenced active
1832 		 * one. Maybe we need to make the active list LRU,
1833 		 * but that will create more work for Tx callbacks.
1834 		 * Is there a way of not having to pull out the
1835 		 * entry from the active list, but just indicate it
1836 		 * is being recycled? Yes, but that creates one more
1837 		 * check in the fast lookup path.
1838 		 */
1839 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1840 			/*
1841 			 * Pretty serious shortage now.
1842 			 */
1843 			state->id_ah_op = IBD_OP_NOTSTARTED;
1844 			mutex_exit(&state->id_ac_mutex);
1845 			DPRINT(10, "ibd_async_acache : failed to find AH "
1846 			    "slot\n");
1847 			return;
1848 		}
1849 		/*
1850 		 * We could check whether ac_mce points to a SendOnly
1851 		 * member and drop that membership now. Or do it lazily
1852 		 * at detach time.
1853 		 */
1854 		ce->ac_mce = NULL;
1855 	}
1856 	mutex_exit(&state->id_ac_mutex);
1857 	ASSERT(ce->ac_mce == NULL);
1858 
1859 	/*
1860 	 * Update the entry.
1861 	 */
1862 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1863 
1864 	bzero(&path_info, sizeof (path_info));
1865 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1866 	path_attr.pa_sgid = state->id_sgid;
1867 	path_attr.pa_num_dgids = 1;
1868 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1869 	path_attr.pa_dgids = &destgid;
1870 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1871 	path_attr.pa_pkey = state->id_pkey;
1872 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1873 	    &path_info, NULL) != IBT_SUCCESS) {
1874 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1875 		goto error;
1876 	}
1877 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1878 	    ntohl(ce->ac_mac.ipoib_qpn),
1879 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1880 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1881 		goto error;
1882 	}
1883 
1884 	/*
1885 	 * mce is set whenever an AH is being associated with a
1886 	 * MCG; this will come in handy when we leave the MCG. The
1887 	 * lock protects Tx fastpath from scanning the active list.
1888 	 */
1889 	if (mce != NULL)
1890 		ce->ac_mce = mce;
1891 
1892 	/*
1893 	 * initiate a RC mode connection for unicast address
1894 	 */
1895 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1896 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1897 		ASSERT(ce->ac_chan == NULL);
1898 		DPRINT(10, "ibd_async_acache: call "
1899 		    "ibd_rc_try_connect(ace=%p)", ce);
1900 		ibd_rc_try_connect(state, ce, &path_info);
1901 		if (ce->ac_chan == NULL) {
1902 			DPRINT(10, "ibd_async_acache: fail to setup RC"
1903 			    " channel");
1904 			state->rc_conn_fail++;
1905 			goto error;
1906 		}
1907 	}
1908 
1909 	mutex_enter(&state->id_ac_mutex);
1910 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1911 	state->id_ah_op = ret;
1912 	mutex_exit(&state->id_ac_mutex);
1913 	return;
1914 error:
1915 	/*
1916 	 * We might want to drop SendOnly membership here if we
1917 	 * joined above. The lock protects Tx callbacks inserting
1918 	 * into the free list.
1919 	 */
1920 	mutex_enter(&state->id_ac_mutex);
1921 	state->id_ah_op = IBD_OP_ERRORED;
1922 	IBD_ACACHE_INSERT_FREE(state, ce);
1923 	mutex_exit(&state->id_ac_mutex);
1924 }
1925 
1926 /*
1927  * While restoring port's presence on the subnet on a port up, it is possible
1928  * that the port goes down again.
1929  */
1930 static void
ibd_async_link(ibd_state_t * state,ibd_req_t * req)1931 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1932 {
1933 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1934 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1935 	    LINK_STATE_UP;
1936 	ibd_mce_t *mce, *pmce;
1937 	ibd_ace_t *ace, *pace;
1938 
1939 	DPRINT(10, "ibd_async_link(): %d", opcode);
1940 
1941 	/*
1942 	 * On a link up, revalidate the link speed/width. No point doing
1943 	 * this on a link down, since we will be unable to do SA operations,
1944 	 * defaulting to the lowest speed. Also notice that we update our
1945 	 * notion of speed before calling mac_link_update(), which will do
1946 	 * necessary higher level notifications for speed changes.
1947 	 */
1948 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1949 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1950 		state->id_link_speed = ibd_get_portspeed(state);
1951 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1952 	}
1953 
1954 	/*
1955 	 * Do all the work required to establish our presence on
1956 	 * the subnet.
1957 	 */
1958 	if (opcode == IBD_LINK_UP_ABSENT) {
1959 		/*
1960 		 * If in promiscuous mode ...
1961 		 */
1962 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1963 			/*
1964 			 * Drop all nonmembership.
1965 			 */
1966 			ibd_async_unsetprom(state);
1967 
1968 			/*
1969 			 * Then, try to regain nonmembership to all mcg's.
1970 			 */
1971 			ibd_async_setprom(state);
1972 
1973 		}
1974 
1975 		/*
1976 		 * Drop all sendonly membership (which also gets rid of the
1977 		 * AHs); try to reacquire all full membership.
1978 		 */
1979 		mce = list_head(&state->id_mc_full);
1980 		while ((pmce = mce) != NULL) {
1981 			mce = list_next(&state->id_mc_full, mce);
1982 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1983 				ibd_leave_group(state,
1984 				    pmce->mc_info.mc_adds_vect.av_dgid,
1985 				    IB_MC_JSTATE_SEND_ONLY_NON);
1986 			else
1987 				ibd_reacquire_group(state, pmce);
1988 		}
1989 
1990 		/*
1991 		 * Recycle all active AHs to free list (and if there are
1992 		 * pending posts, make sure they will go into the free list
1993 		 * once the Tx's complete). Grab the lock to prevent
1994 		 * concurrent Tx's as well as Tx cleanups.
1995 		 */
1996 		mutex_enter(&state->id_ac_mutex);
1997 		ace = list_head(&state->id_ah_active);
1998 		while ((pace = ace) != NULL) {
1999 			boolean_t cycled;
2000 
2001 			ace = list_next(&state->id_ah_active, ace);
2002 			mce = pace->ac_mce;
2003 			if (pace->ac_chan != NULL) {
2004 				ASSERT(mce == NULL);
2005 				ASSERT(state->id_enable_rc == B_TRUE);
2006 				if (pace->ac_chan->chan_state ==
2007 				    IBD_RC_STATE_ACT_ESTAB) {
2008 					INC_REF(pace, 1);
2009 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2010 					pace->ac_chan->chan_state =
2011 					    IBD_RC_STATE_ACT_CLOSING;
2012 					ibd_rc_signal_act_close(state, pace);
2013 				} else {
2014 					state->rc_act_close_simultaneous++;
2015 					DPRINT(40, "ibd_async_link: other "
2016 					    "thread is closing it, ace=%p, "
2017 					    "ac_chan=%p, chan_state=%d",
2018 					    pace, pace->ac_chan,
2019 					    pace->ac_chan->chan_state);
2020 				}
2021 			} else {
2022 				cycled = ibd_acache_recycle(state,
2023 				    &pace->ac_mac, B_TRUE);
2024 			}
2025 			/*
2026 			 * If this is for an mcg, it must be for a fullmember,
2027 			 * since we got rid of send-only members above when
2028 			 * processing the mce list.
2029 			 */
2030 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2031 			    IB_MC_JSTATE_FULL)));
2032 
2033 			/*
2034 			 * Check if the fullmember mce needs to be torn down,
2035 			 * ie whether the DLPI disable has already been done.
2036 			 * If so, do some of the work of tx_cleanup, namely
2037 			 * causing leave (which will fail), detach and
2038 			 * mce-freeing. tx_cleanup will put the AH into free
2039 			 * list. The reason to duplicate some of this
2040 			 * tx_cleanup work is because we want to delete the
2041 			 * AH right now instead of waiting for tx_cleanup, to
2042 			 * force subsequent Tx's to reacquire an AH.
2043 			 */
2044 			if ((mce != NULL) && (mce->mc_fullreap))
2045 				ibd_async_reap_group(state, mce,
2046 				    mce->mc_info.mc_adds_vect.av_dgid,
2047 				    mce->mc_jstate);
2048 		}
2049 		mutex_exit(&state->id_ac_mutex);
2050 	}
2051 
2052 	/*
2053 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2054 	 * (which stops further events from being delivered) before
2055 	 * mac_unregister(). At this point, it is guaranteed that mac_register
2056 	 * has already been done.
2057 	 */
2058 	mutex_enter(&state->id_link_mutex);
2059 	state->id_link_state = lstate;
2060 	mac_link_update(state->id_mh, lstate);
2061 	mutex_exit(&state->id_link_mutex);
2062 
2063 	ibd_async_done(state);
2064 }
2065 
2066 /*
2067  * Check the pkey table to see if we can find the pkey we're looking for.
2068  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2069  * failure.
2070  */
2071 static int
ibd_locate_pkey(ib_pkey_t * pkey_tbl,uint16_t pkey_tbl_sz,ib_pkey_t pkey,uint16_t * pkix)2072 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2073     uint16_t *pkix)
2074 {
2075 	uint16_t ndx;
2076 
2077 	ASSERT(pkix != NULL);
2078 
2079 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2080 		if (pkey_tbl[ndx] == pkey) {
2081 			*pkix = ndx;
2082 			return (0);
2083 		}
2084 	}
2085 	return (-1);
2086 }
2087 
2088 /*
2089  * Late HCA Initialization:
2090  * If plumb had succeeded without the availability of an active port or the
2091  * pkey, and either of their availability is now being indicated via PORT_UP
2092  * or PORT_CHANGE respectively, try a start of the interface.
2093  *
2094  * Normal Operation:
2095  * When the link is notified up, we need to do a few things, based
2096  * on the port's current p_init_type_reply claiming a reinit has been
2097  * done or not. The reinit steps are:
2098  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2099  *    the old Pkey and GID0 are correct.
2100  * 2. Register for mcg traps (already done by ibmf).
2101  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2102  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2103  * 4. Give up all sendonly memberships.
2104  * 5. Acquire all full memberships.
2105  * 6. In promiscuous mode, acquire all non memberships.
2106  * 7. Recycle all AHs to free list.
2107  */
2108 static void
ibd_link_mod(ibd_state_t * state,ibt_async_code_t code)2109 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2110 {
2111 	ibt_hca_portinfo_t *port_infop = NULL;
2112 	ibt_status_t ibt_status;
2113 	uint_t psize, port_infosz;
2114 	ibd_link_op_t opcode;
2115 	ibd_req_t *req;
2116 	link_state_t new_link_state = LINK_STATE_UP;
2117 	uint8_t itreply;
2118 	uint16_t pkix;
2119 	int ret;
2120 
2121 	/*
2122 	 * Let's not race with a plumb or an unplumb; if we detect a
2123 	 * pkey relocation event later on here, we may have to restart.
2124 	 */
2125 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2126 
2127 	mutex_enter(&state->id_link_mutex);
2128 
2129 	/*
2130 	 * If the link state is unknown, a plumb has not yet been attempted
2131 	 * on the interface. Nothing to do.
2132 	 */
2133 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2134 		mutex_exit(&state->id_link_mutex);
2135 		goto link_mod_return;
2136 	}
2137 
2138 	/*
2139 	 * If link state is down because of plumb failure, and we are not in
2140 	 * late HCA init, and we were not successfully plumbed, nothing to do.
2141 	 */
2142 	if ((state->id_link_state == LINK_STATE_DOWN) &&
2143 	    ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2144 	    ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2145 		mutex_exit(&state->id_link_mutex);
2146 		goto link_mod_return;
2147 	}
2148 
2149 	/*
2150 	 * If this routine was called in response to a port down event,
2151 	 * we just need to see if this should be informed.
2152 	 */
2153 	if (code == IBT_ERROR_PORT_DOWN) {
2154 		new_link_state = LINK_STATE_DOWN;
2155 		goto update_link_state;
2156 	}
2157 
2158 	/*
2159 	 * If it's not a port down event we've received, try to get the port
2160 	 * attributes first. If we fail here, the port is as good as down.
2161 	 * Otherwise, if the link went down by the time the handler gets
2162 	 * here, give up - we cannot even validate the pkey/gid since those
2163 	 * are not valid and this is as bad as a port down anyway.
2164 	 */
2165 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2166 	    &port_infop, &psize, &port_infosz);
2167 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2168 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2169 		new_link_state = LINK_STATE_DOWN;
2170 		goto update_link_state;
2171 	}
2172 
2173 	/*
2174 	 * If in the previous attempt, the pkey was not found either due to the
2175 	 * port state being down, or due to it's absence in the pkey table,
2176 	 * look for it now and try to start the interface.
2177 	 */
2178 	if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2179 		mutex_exit(&state->id_link_mutex);
2180 		if ((ret = ibd_start(state)) != 0) {
2181 			DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2182 			    "init, ret=%d", ret);
2183 		}
2184 		ibt_free_portinfo(port_infop, port_infosz);
2185 		goto link_mod_return;
2186 	}
2187 
2188 	/*
2189 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2190 	 * PreserveContentReply are 0, we don't know anything about the
2191 	 * data loaded into the port attributes, so we need to verify
2192 	 * if gid0 and pkey are still valid.
2193 	 */
2194 	itreply = port_infop->p_init_type_reply;
2195 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2196 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2197 		/*
2198 		 * Check to see if the subnet part of GID0 has changed. If
2199 		 * not, check the simple case first to see if the pkey
2200 		 * index is the same as before; finally check to see if the
2201 		 * pkey has been relocated to a different index in the table.
2202 		 */
2203 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2204 		if (bcmp(port_infop->p_sgid_tbl,
2205 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2206 
2207 			new_link_state = LINK_STATE_DOWN;
2208 
2209 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2210 		    state->id_pkey) {
2211 
2212 			new_link_state = LINK_STATE_UP;
2213 
2214 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2215 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2216 
2217 			ibt_free_portinfo(port_infop, port_infosz);
2218 			mutex_exit(&state->id_link_mutex);
2219 
2220 			/*
2221 			 * Currently a restart is required if our pkey has moved
2222 			 * in the pkey table. If we get the ibt_recycle_ud() to
2223 			 * work as documented (expected), we may be able to
2224 			 * avoid a complete restart.  Note that we've already
2225 			 * marked both the start and stop 'in-progress' flags,
2226 			 * so it is ok to go ahead and do this restart.
2227 			 */
2228 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2229 			if ((ret = ibd_start(state)) != 0) {
2230 				DPRINT(10, "ibd_restart: cannot restart, "
2231 				    "ret=%d", ret);
2232 			}
2233 
2234 			goto link_mod_return;
2235 		} else {
2236 			new_link_state = LINK_STATE_DOWN;
2237 		}
2238 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2239 	}
2240 
2241 update_link_state:
2242 	if (port_infop) {
2243 		ibt_free_portinfo(port_infop, port_infosz);
2244 	}
2245 
2246 	/*
2247 	 * If we're reporting a link up, check InitTypeReply to see if
2248 	 * the SM has ensured that the port's presence in mcg, traps,
2249 	 * etc. is intact.
2250 	 */
2251 	if (new_link_state == LINK_STATE_DOWN) {
2252 		opcode = IBD_LINK_DOWN;
2253 	} else {
2254 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2255 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2256 			opcode = IBD_LINK_UP;
2257 		} else {
2258 			opcode = IBD_LINK_UP_ABSENT;
2259 		}
2260 	}
2261 
2262 	/*
2263 	 * If the old state is the same as the new state, and the SM indicated
2264 	 * no change in the port parameters, nothing to do.
2265 	 */
2266 	if ((state->id_link_state == new_link_state) && (opcode !=
2267 	    IBD_LINK_UP_ABSENT)) {
2268 		mutex_exit(&state->id_link_mutex);
2269 		goto link_mod_return;
2270 	}
2271 
2272 	/*
2273 	 * Ok, so there was a link state change; see if it's safe to ask
2274 	 * the async thread to do the work
2275 	 */
2276 	if (!ibd_async_safe(state)) {
2277 		state->id_link_state = new_link_state;
2278 		mutex_exit(&state->id_link_mutex);
2279 		goto link_mod_return;
2280 	}
2281 
2282 	mutex_exit(&state->id_link_mutex);
2283 
2284 	/*
2285 	 * Queue up a request for ibd_async_link() to handle this link
2286 	 * state change event
2287 	 */
2288 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2289 	req->rq_ptr = (void *)opcode;
2290 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2291 
2292 link_mod_return:
2293 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2294 }
2295 
2296 /*
2297  * For the port up/down events, IBTL guarantees there will not be concurrent
2298  * invocations of the handler. IBTL might coalesce link transition events,
2299  * and not invoke the handler for _each_ up/down transition, but it will
2300  * invoke the handler with last known state
2301  */
2302 static void
ibd_async_handler(void * clnt_private,ibt_hca_hdl_t hca_hdl,ibt_async_code_t code,ibt_async_event_t * event)2303 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2304     ibt_async_code_t code, ibt_async_event_t *event)
2305 {
2306 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2307 
2308 	switch (code) {
2309 	case IBT_ERROR_CATASTROPHIC_CHAN:
2310 		ibd_print_warn(state, "catastrophic channel error");
2311 		break;
2312 	case IBT_ERROR_CQ:
2313 		ibd_print_warn(state, "completion queue error");
2314 		break;
2315 	case IBT_PORT_CHANGE_EVENT:
2316 		/*
2317 		 * Events will be delivered to all instances that have
2318 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2319 		 * Only need to do work for our port; IBTF will deliver
2320 		 * events for other ports on the hca we have ibt_open_hca'ed
2321 		 * too. Note that id_port is initialized in ibd_attach()
2322 		 * before we do an ibt_open_hca() in ibd_attach().
2323 		 */
2324 		ASSERT(state->id_hca_hdl == hca_hdl);
2325 		if (state->id_port != event->ev_port)
2326 			break;
2327 
2328 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2329 		    IBT_PORT_CHANGE_PKEY) {
2330 			ibd_link_mod(state, code);
2331 		}
2332 		break;
2333 	case IBT_ERROR_PORT_DOWN:
2334 	case IBT_CLNT_REREG_EVENT:
2335 	case IBT_EVENT_PORT_UP:
2336 		/*
2337 		 * Events will be delivered to all instances that have
2338 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2339 		 * Only need to do work for our port; IBTF will deliver
2340 		 * events for other ports on the hca we have ibt_open_hca'ed
2341 		 * too. Note that id_port is initialized in ibd_attach()
2342 		 * before we do an ibt_open_hca() in ibd_attach().
2343 		 */
2344 		ASSERT(state->id_hca_hdl == hca_hdl);
2345 		if (state->id_port != event->ev_port)
2346 			break;
2347 
2348 		ibd_link_mod(state, code);
2349 		break;
2350 
2351 	case IBT_HCA_ATTACH_EVENT:
2352 	case IBT_HCA_DETACH_EVENT:
2353 		/*
2354 		 * When a new card is plugged to the system, attach_event is
2355 		 * invoked. Additionally, a cfgadm needs to be run to make the
2356 		 * card known to the system, and an ifconfig needs to be run to
2357 		 * plumb up any ibd interfaces on the card. In the case of card
2358 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2359 		 * unplumb the ibd interfaces on the card; when the card is
2360 		 * actually unplugged, the detach_event is invoked;
2361 		 * additionally, if any ibd instances are still active on the
2362 		 * card (eg there were no associated RCM scripts), driver's
2363 		 * detach routine is invoked.
2364 		 */
2365 		break;
2366 	default:
2367 		break;
2368 	}
2369 }
2370 
2371 static int
ibd_register_mac(ibd_state_t * state,dev_info_t * dip)2372 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2373 {
2374 	mac_register_t *macp;
2375 	int ret;
2376 
2377 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2378 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2379 		return (DDI_FAILURE);
2380 	}
2381 
2382 	/*
2383 	 * Note that when we register with mac during attach, we don't
2384 	 * have the id_macaddr yet, so we'll simply be registering a
2385 	 * zero macaddr that we'll overwrite later during plumb (in
2386 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2387 	 * update the mac layer with the correct mtu during plumb.
2388 	 */
2389 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2390 	macp->m_driver = state;
2391 	macp->m_dip = dip;
2392 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2393 	macp->m_callbacks = &ibd_m_callbacks;
2394 	macp->m_min_sdu = 0;
2395 	macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
2396 	if (state->id_type == IBD_PORT_DRIVER) {
2397 		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2398 	} else if (state->id_enable_rc) {
2399 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2400 	} else {
2401 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2402 	}
2403 	macp->m_priv_props = ibd_priv_props;
2404 
2405 	/*
2406 	 *  Register ourselves with the GLDv3 interface
2407 	 */
2408 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2409 		mac_free(macp);
2410 		DPRINT(10,
2411 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2412 		return (DDI_FAILURE);
2413 	}
2414 
2415 	mac_free(macp);
2416 	return (DDI_SUCCESS);
2417 }
2418 
2419 static int
ibd_record_capab(ibd_state_t * state)2420 ibd_record_capab(ibd_state_t *state)
2421 {
2422 	ibt_hca_attr_t hca_attrs;
2423 	ibt_status_t ibt_status;
2424 
2425 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2426 
2427 	/*
2428 	 * Query the HCA and fetch its attributes
2429 	 */
2430 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2431 	ASSERT(ibt_status == IBT_SUCCESS);
2432 
2433 	/*
2434 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2435 	 *    full checksum offload.
2436 	 */
2437 	if (state->id_enable_rc) {
2438 			state->id_hwcksum_capab = 0;
2439 	} else {
2440 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2441 		    == IBT_HCA_CKSUM_FULL) {
2442 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2443 		}
2444 	}
2445 
2446 	/*
2447 	 * 2. Set LSO policy, capability and maximum length
2448 	 */
2449 	if (state->id_enable_rc) {
2450 		state->id_lso_capable = B_FALSE;
2451 		state->id_lso_maxlen = 0;
2452 	} else {
2453 		if (hca_attrs.hca_max_lso_size > 0) {
2454 			state->id_lso_capable = B_TRUE;
2455 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2456 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2457 			else
2458 				state->id_lso_maxlen =
2459 				    hca_attrs.hca_max_lso_size;
2460 		} else {
2461 			state->id_lso_capable = B_FALSE;
2462 			state->id_lso_maxlen = 0;
2463 		}
2464 	}
2465 
2466 	/*
2467 	 * 3. Set Reserved L_Key capability
2468 	 */
2469 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2470 		state->id_hca_res_lkey_capab = 1;
2471 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2472 		state->rc_enable_iov_map = B_TRUE;
2473 	} else {
2474 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2475 		state->rc_enable_iov_map = B_FALSE;
2476 	}
2477 
2478 	/*
2479 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2480 	 *    size information is provided by the hca
2481 	 */
2482 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2483 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2484 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2485 	} else {
2486 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2487 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2488 	}
2489 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2490 		state->id_max_sqseg = IBD_MAX_SQSEG;
2491 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2492 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2493 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2494 	}
2495 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2496 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2497 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2498 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2499 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2500 	}
2501 
2502 	/*
2503 	 * Translating the virtual address regions into physical regions
2504 	 * for using the Reserved LKey feature results in a wr sgl that
2505 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2506 	 * we'll fix a high-water mark (65%) for when we should stop.
2507 	 */
2508 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2509 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2510 
2511 	/*
2512 	 * 5. Set number of recv and send wqes after checking hca maximum
2513 	 *    channel size. Store the max channel size in the state so that it
2514 	 *    can be referred to when the swqe/rwqe change is requested via
2515 	 *    dladm.
2516 	 */
2517 
2518 	state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2519 
2520 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2521 		state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2522 
2523 	state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2524 	    IBD_RWQE_MIN;
2525 
2526 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2527 		state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2528 
2529 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2530 
2531 	return (DDI_SUCCESS);
2532 }
2533 
2534 static int
ibd_part_busy(ibd_state_t * state)2535 ibd_part_busy(ibd_state_t *state)
2536 {
2537 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2538 		DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2539 		return (DDI_FAILURE);
2540 	}
2541 
2542 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2543 		DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2544 		return (DDI_FAILURE);
2545 	}
2546 
2547 	/*
2548 	 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2549 	 * connecting to a remote IPoIB port. We can't remove this port.
2550 	 */
2551 	if (state->id_ah_op == IBD_OP_ONGOING) {
2552 		DPRINT(10, "ibd_part_busy: failed: connecting\n");
2553 		return (DDI_FAILURE);
2554 	}
2555 
2556 	return (DDI_SUCCESS);
2557 }
2558 
2559 
2560 static void
ibd_part_unattach(ibd_state_t * state)2561 ibd_part_unattach(ibd_state_t *state)
2562 {
2563 	uint32_t progress = state->id_mac_state;
2564 	ibt_status_t ret;
2565 
2566 	/* make sure rx resources are freed */
2567 	ibd_free_rx_rsrcs(state);
2568 
2569 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2570 		ASSERT(state->id_enable_rc);
2571 		ibd_rc_fini_srq_list(state);
2572 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2573 	}
2574 
2575 	if (progress & IBD_DRV_MAC_REGISTERED) {
2576 		(void) mac_unregister(state->id_mh);
2577 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2578 	}
2579 
2580 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2581 		/*
2582 		 * No new async requests will be posted since the device
2583 		 * link state has been marked as unknown; completion handlers
2584 		 * have been turned off, so Tx handler will not cause any
2585 		 * more IBD_ASYNC_REAP requests.
2586 		 *
2587 		 * Queue a request for the async thread to exit, which will
2588 		 * be serviced after any pending ones. This can take a while,
2589 		 * specially if the SM is unreachable, since IBMF will slowly
2590 		 * timeout each SM request issued by the async thread.  Reap
2591 		 * the thread before continuing on, we do not want it to be
2592 		 * lingering in modunloaded code.
2593 		 */
2594 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2595 		thread_join(state->id_async_thrid);
2596 
2597 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2598 	}
2599 
2600 	if (progress & IBD_DRV_REQ_LIST_INITED) {
2601 		list_destroy(&state->id_req_list);
2602 		mutex_destroy(&state->id_acache_req_lock);
2603 		cv_destroy(&state->id_acache_req_cv);
2604 		state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2605 	}
2606 
2607 	if (progress & IBD_DRV_PD_ALLOCD) {
2608 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2609 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2610 			ibd_print_warn(state, "failed to free "
2611 			    "protection domain, ret=%d", ret);
2612 		}
2613 		state->id_pd_hdl = NULL;
2614 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2615 	}
2616 
2617 	if (progress & IBD_DRV_HCA_OPENED) {
2618 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2619 		    IBT_SUCCESS) {
2620 			ibd_print_warn(state, "failed to close "
2621 			    "HCA device, ret=%d", ret);
2622 		}
2623 		state->id_hca_hdl = NULL;
2624 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2625 	}
2626 
2627 	mutex_enter(&ibd_gstate.ig_mutex);
2628 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2629 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2630 		    IBT_SUCCESS) {
2631 			ibd_print_warn(state,
2632 			    "ibt_detach() failed, ret=%d", ret);
2633 		}
2634 		state->id_ibt_hdl = NULL;
2635 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2636 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2637 	}
2638 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2639 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2640 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2641 		    IBT_SUCCESS) {
2642 			ibd_print_warn(state, "ibt_detach(): global "
2643 			    "failed, ret=%d", ret);
2644 		}
2645 		ibd_gstate.ig_ibt_hdl = NULL;
2646 	}
2647 	mutex_exit(&ibd_gstate.ig_mutex);
2648 
2649 	if (progress & IBD_DRV_TXINTR_ADDED) {
2650 		ddi_remove_softintr(state->id_tx);
2651 		state->id_tx = NULL;
2652 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2653 	}
2654 
2655 	if (progress & IBD_DRV_RXINTR_ADDED) {
2656 		ddi_remove_softintr(state->id_rx);
2657 		state->id_rx = NULL;
2658 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2659 	}
2660 
2661 #ifdef DEBUG
2662 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2663 		kstat_delete(state->rc_ksp);
2664 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2665 	}
2666 #endif
2667 
2668 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2669 		ibd_state_fini(state);
2670 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2671 	}
2672 }
2673 
2674 int
ibd_part_attach(ibd_state_t * state,dev_info_t * dip)2675 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2676 {
2677 	ibt_status_t ret;
2678 	int rv;
2679 	kthread_t *kht;
2680 
2681 	/*
2682 	 * Initialize mutexes and condition variables
2683 	 */
2684 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2685 		DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2686 		return (DDI_FAILURE);
2687 	}
2688 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2689 
2690 	/*
2691 	 * Allocate rx,tx softintr
2692 	 */
2693 	if (ibd_rx_softintr == 1) {
2694 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2695 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2696 			DPRINT(10, "ibd_part_attach: failed in "
2697 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2698 			return (DDI_FAILURE);
2699 		}
2700 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2701 	}
2702 	if (ibd_tx_softintr == 1) {
2703 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2704 		    NULL, NULL, ibd_tx_recycle,
2705 		    (caddr_t)state)) != DDI_SUCCESS) {
2706 			DPRINT(10, "ibd_part_attach: failed in "
2707 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2708 			return (DDI_FAILURE);
2709 		}
2710 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2711 	}
2712 
2713 	/*
2714 	 * Attach to IBTL
2715 	 */
2716 	mutex_enter(&ibd_gstate.ig_mutex);
2717 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2718 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2719 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2720 			DPRINT(10, "ibd_part_attach: global: failed in "
2721 			    "ibt_attach(), ret=%d", ret);
2722 			mutex_exit(&ibd_gstate.ig_mutex);
2723 			return (DDI_FAILURE);
2724 		}
2725 	}
2726 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2727 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2728 		DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2729 		    ret);
2730 		mutex_exit(&ibd_gstate.ig_mutex);
2731 		return (DDI_FAILURE);
2732 	}
2733 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2734 	mutex_exit(&ibd_gstate.ig_mutex);
2735 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2736 
2737 	/*
2738 	 * Open the HCA
2739 	 */
2740 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2741 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2742 		DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2743 		    ret);
2744 		return (DDI_FAILURE);
2745 	}
2746 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2747 
2748 #ifdef DEBUG
2749 	/* Initialize Driver Counters for Reliable Connected Mode */
2750 	if (state->id_enable_rc) {
2751 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2752 			DPRINT(10, "ibd_part_attach: failed in "
2753 			    "ibd_rc_init_stats");
2754 			return (DDI_FAILURE);
2755 		}
2756 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2757 	}
2758 #endif
2759 
2760 	/*
2761 	 * Record capabilities
2762 	 */
2763 	(void) ibd_record_capab(state);
2764 
2765 	/*
2766 	 * Allocate a protection domain on the HCA
2767 	 */
2768 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2769 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2770 		DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2771 		    ret);
2772 		return (DDI_FAILURE);
2773 	}
2774 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2775 
2776 
2777 	/*
2778 	 * We need to initialise the req_list that is required for the
2779 	 * operation of the async_thread.
2780 	 */
2781 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2782 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2783 	list_create(&state->id_req_list, sizeof (ibd_req_t),
2784 	    offsetof(ibd_req_t, rq_list));
2785 	state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2786 
2787 	/*
2788 	 * Create the async thread; thread_create never fails.
2789 	 */
2790 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2791 	    TS_RUN, minclsyspri);
2792 	state->id_async_thrid = kht->t_did;
2793 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2794 
2795 	return (DDI_SUCCESS);
2796 }
2797 
2798 /*
2799  * Attach device to the IO framework.
2800  */
2801 static int
ibd_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)2802 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2803 {
2804 	int ret;
2805 
2806 	switch (cmd) {
2807 		case DDI_ATTACH:
2808 			ret = ibd_port_attach(dip);
2809 			break;
2810 		default:
2811 			ret = DDI_FAILURE;
2812 			break;
2813 	}
2814 	return (ret);
2815 }
2816 
2817 /*
2818  * Detach device from the IO framework.
2819  */
2820 static int
ibd_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)2821 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2822 {
2823 	ibd_state_t *state;
2824 	int instance;
2825 
2826 	/*
2827 	 * IBD doesn't support suspend/resume
2828 	 */
2829 	if (cmd != DDI_DETACH)
2830 		return (DDI_FAILURE);
2831 
2832 	/*
2833 	 * Get the instance softstate
2834 	 */
2835 	instance = ddi_get_instance(dip);
2836 	state = ddi_get_soft_state(ibd_list, instance);
2837 
2838 	/*
2839 	 * Release all resources we're holding still.  Note that if we'd
2840 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2841 	 * so far, we should find all the flags we need in id_mac_state.
2842 	 */
2843 	return (ibd_port_unattach(state, dip));
2844 }
2845 
2846 /*
2847  * Pre ibt_attach() driver initialization
2848  */
2849 static int
ibd_state_init(ibd_state_t * state,dev_info_t * dip)2850 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2851 {
2852 	char buf[64];
2853 
2854 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2855 	state->id_link_state = LINK_STATE_UNKNOWN;
2856 
2857 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2858 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2859 	state->id_trap_stop = B_TRUE;
2860 	state->id_trap_inprog = 0;
2861 
2862 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2863 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2864 	state->id_dip = dip;
2865 
2866 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2867 
2868 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2869 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2870 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2871 	state->id_tx_busy = 0;
2872 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2873 
2874 	state->id_rx_list.dl_bufs_outstanding = 0;
2875 	state->id_rx_list.dl_cnt = 0;
2876 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2877 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2878 	(void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2879 	    state->id_pkey, state->id_plinkid);
2880 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2881 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2882 
2883 	/* For Reliable Connected Mode */
2884 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2885 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2886 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2887 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2888 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2889 	    MUTEX_DRIVER, NULL);
2890 	mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2891 
2892 	/*
2893 	 * Make the default link mode as RC. If this fails during connection
2894 	 * setup, the link mode is automatically transitioned to UD.
2895 	 * Also set the RC MTU.
2896 	 */
2897 	state->id_enable_rc = IBD_DEF_LINK_MODE;
2898 	state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2899 	state->id_mtu = IBD_DEF_MAX_MTU;
2900 
2901 	/* Iniatialize all tunables to default */
2902 	state->id_lso_policy = IBD_DEF_LSO_POLICY;
2903 	state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2904 	state->id_num_ah = IBD_DEF_NUM_AH;
2905 	state->id_hash_size = IBD_DEF_HASH_SIZE;
2906 	state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2907 	state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2908 	state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2909 	state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2910 	state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2911 	state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2912 	state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2913 	state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2914 	state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2915 	state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2916 	state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2917 	state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2918 	state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2919 	state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2920 	state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2921 	state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2922 	state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2923 	state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2924 	state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2925 	state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2926 
2927 	return (DDI_SUCCESS);
2928 }
2929 
2930 /*
2931  * Post ibt_detach() driver deconstruction
2932  */
2933 static void
ibd_state_fini(ibd_state_t * state)2934 ibd_state_fini(ibd_state_t *state)
2935 {
2936 	kmem_cache_destroy(state->id_req_kmc);
2937 
2938 	mutex_destroy(&state->id_rx_list.dl_mutex);
2939 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2940 
2941 	mutex_destroy(&state->id_txpost_lock);
2942 	mutex_destroy(&state->id_tx_list.dl_mutex);
2943 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2944 	mutex_destroy(&state->id_lso_lock);
2945 
2946 	mutex_destroy(&state->id_sched_lock);
2947 	mutex_destroy(&state->id_scq_poll_lock);
2948 	mutex_destroy(&state->id_rcq_poll_lock);
2949 
2950 	cv_destroy(&state->id_trap_cv);
2951 	mutex_destroy(&state->id_trap_lock);
2952 	mutex_destroy(&state->id_link_mutex);
2953 
2954 	/* For Reliable Connected Mode */
2955 	mutex_destroy(&state->rc_timeout_lock);
2956 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2957 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2958 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2959 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2960 	mutex_destroy(&state->rc_rx_lock);
2961 }
2962 
2963 /*
2964  * Fetch link speed from SA for snmp ifspeed reporting.
2965  */
2966 static uint64_t
ibd_get_portspeed(ibd_state_t * state)2967 ibd_get_portspeed(ibd_state_t *state)
2968 {
2969 	int			ret;
2970 	ibt_path_info_t		path;
2971 	ibt_path_attr_t		path_attr;
2972 	uint8_t			num_paths;
2973 	uint64_t		ifspeed;
2974 
2975 	/*
2976 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2977 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2978 	 * 2000000000. Start with that as default.
2979 	 */
2980 	ifspeed = 2000000000;
2981 
2982 	bzero(&path_attr, sizeof (path_attr));
2983 
2984 	/*
2985 	 * Get the port speed from Loopback path information.
2986 	 */
2987 	path_attr.pa_dgids = &state->id_sgid;
2988 	path_attr.pa_num_dgids = 1;
2989 	path_attr.pa_sgid = state->id_sgid;
2990 
2991 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2992 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2993 		goto earlydone;
2994 
2995 	if (num_paths < 1)
2996 		goto earlydone;
2997 
2998 	/*
2999 	 * In case SA does not return an expected value, report the default
3000 	 * speed as 1X.
3001 	 */
3002 	ret = 1;
3003 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3004 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
3005 			ret = 1;
3006 			break;
3007 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
3008 			ret = 4;
3009 			break;
3010 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
3011 			ret = 12;
3012 			break;
3013 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
3014 			ret = 2;
3015 			break;
3016 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
3017 			ret = 8;
3018 			break;
3019 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
3020 			ret = 16;
3021 			break;
3022 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
3023 			ret = 24;
3024 			break;
3025 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
3026 			ret = 32;
3027 			break;
3028 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
3029 			ret = 48;
3030 			break;
3031 	}
3032 
3033 	ifspeed *= ret;
3034 
3035 earlydone:
3036 	return (ifspeed);
3037 }
3038 
3039 /*
3040  * Search input mcg list (id_mc_full or id_mc_non) for an entry
3041  * representing the input mcg mgid.
3042  */
3043 static ibd_mce_t *
ibd_mcache_find(ib_gid_t mgid,struct list * mlist)3044 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3045 {
3046 	ibd_mce_t *ptr = list_head(mlist);
3047 
3048 	/*
3049 	 * Do plain linear search.
3050 	 */
3051 	while (ptr != NULL) {
3052 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3053 		    sizeof (ib_gid_t)) == 0)
3054 			return (ptr);
3055 		ptr = list_next(mlist, ptr);
3056 	}
3057 	return (NULL);
3058 }
3059 
3060 /*
3061  * Execute IBA JOIN.
3062  */
3063 static ibt_status_t
ibd_iba_join(ibd_state_t * state,ib_gid_t mgid,ibd_mce_t * mce)3064 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3065 {
3066 	ibt_mcg_attr_t mcg_attr;
3067 
3068 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3069 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3070 	mcg_attr.mc_mgid = mgid;
3071 	mcg_attr.mc_join_state = mce->mc_jstate;
3072 	mcg_attr.mc_scope = state->id_scope;
3073 	mcg_attr.mc_pkey = state->id_pkey;
3074 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3075 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3076 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3077 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3078 	    NULL, NULL));
3079 }
3080 
3081 /*
3082  * This code JOINs the port in the proper way (depending on the join
3083  * state) so that IBA fabric will forward mcg packets to/from the port.
3084  * It also attaches the QPN to the mcg so it can receive those mcg
3085  * packets. This code makes sure not to attach the mcg to the QP if
3086  * that has been previously done due to the mcg being joined with a
3087  * different join state, even though this is not required by SWG_0216,
3088  * refid 3610.
3089  */
3090 static ibd_mce_t *
ibd_join_group(ibd_state_t * state,ib_gid_t mgid,uint8_t jstate)3091 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3092 {
3093 	ibt_status_t ibt_status;
3094 	ibd_mce_t *mce, *tmce, *omce = NULL;
3095 	boolean_t do_attach = B_TRUE;
3096 
3097 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3098 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3099 
3100 	/*
3101 	 * For enable_multicast Full member joins, we need to do some
3102 	 * extra work. If there is already an mce on the list that
3103 	 * indicates full membership, that means the membership has
3104 	 * not yet been dropped (since the disable_multicast was issued)
3105 	 * because there are pending Tx's to the mcg; in that case, just
3106 	 * mark the mce not to be reaped when the Tx completion queues
3107 	 * an async reap operation.
3108 	 *
3109 	 * If there is already an mce on the list indicating sendonly
3110 	 * membership, try to promote to full membership. Be careful
3111 	 * not to deallocate the old mce, since there might be an AH
3112 	 * pointing to it; instead, update the old mce with new data
3113 	 * that tracks the full membership.
3114 	 */
3115 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3116 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3117 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3118 			ASSERT(omce->mc_fullreap);
3119 			omce->mc_fullreap = B_FALSE;
3120 			return (omce);
3121 		} else {
3122 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3123 		}
3124 	}
3125 
3126 	/*
3127 	 * Allocate the ibd_mce_t to track this JOIN.
3128 	 */
3129 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3130 	mce->mc_fullreap = B_FALSE;
3131 	mce->mc_jstate = jstate;
3132 
3133 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3134 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3135 		    ibt_status);
3136 		kmem_free(mce, sizeof (ibd_mce_t));
3137 		return (NULL);
3138 	}
3139 
3140 	/*
3141 	 * Is an IBA attach required? Not if the interface is already joined
3142 	 * to the mcg in a different appropriate join state.
3143 	 */
3144 	if (jstate == IB_MC_JSTATE_NON) {
3145 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3146 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3147 			do_attach = B_FALSE;
3148 	} else if (jstate == IB_MC_JSTATE_FULL) {
3149 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3150 			do_attach = B_FALSE;
3151 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3152 		do_attach = B_FALSE;
3153 	}
3154 
3155 	if (do_attach) {
3156 		/*
3157 		 * Do the IBA attach.
3158 		 */
3159 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3160 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3161 		    &mce->mc_info)) != IBT_SUCCESS) {
3162 			DPRINT(10, "ibd_join_group : failed qp attachment "
3163 			    "%d\n", ibt_status);
3164 			/*
3165 			 * NOTE that we should probably preserve the join info
3166 			 * in the list and later try to leave again at detach
3167 			 * time.
3168 			 */
3169 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3170 			    state->id_sgid, jstate);
3171 			kmem_free(mce, sizeof (ibd_mce_t));
3172 			return (NULL);
3173 		}
3174 	}
3175 
3176 	/*
3177 	 * Insert the ibd_mce_t in the proper list.
3178 	 */
3179 	if (jstate == IB_MC_JSTATE_NON) {
3180 		IBD_MCACHE_INSERT_NON(state, mce);
3181 	} else {
3182 		/*
3183 		 * Set up the mc_req fields used for reaping the
3184 		 * mcg in case of delayed tx completion (see
3185 		 * ibd_tx_cleanup()). Also done for sendonly join in
3186 		 * case we are promoted to fullmembership later and
3187 		 * keep using the same mce.
3188 		 */
3189 		mce->mc_req.rq_gid = mgid;
3190 		mce->mc_req.rq_ptr = mce;
3191 		/*
3192 		 * Check whether this is the case of trying to join
3193 		 * full member, and we were already joined send only.
3194 		 * We try to drop our SendOnly membership, but it is
3195 		 * possible that the mcg does not exist anymore (and
3196 		 * the subnet trap never reached us), so the leave
3197 		 * operation might fail.
3198 		 */
3199 		if (omce != NULL) {
3200 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3201 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3202 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3203 			bcopy(&mce->mc_info, &omce->mc_info,
3204 			    sizeof (ibt_mcg_info_t));
3205 			kmem_free(mce, sizeof (ibd_mce_t));
3206 			return (omce);
3207 		}
3208 		mutex_enter(&state->id_mc_mutex);
3209 		IBD_MCACHE_INSERT_FULL(state, mce);
3210 		mutex_exit(&state->id_mc_mutex);
3211 	}
3212 
3213 	return (mce);
3214 }
3215 
3216 /*
3217  * Called during port up event handling to attempt to reacquire full
3218  * membership to an mcg. Stripped down version of ibd_join_group().
3219  * Note that it is possible that the mcg might have gone away, and
3220  * gets recreated at this point.
3221  */
3222 static void
ibd_reacquire_group(ibd_state_t * state,ibd_mce_t * mce)3223 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3224 {
3225 	ib_gid_t mgid;
3226 
3227 	/*
3228 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3229 	 * reap/leave is going to try to leave the group. We could prevent
3230 	 * that by adding a boolean flag into ibd_mce_t, if required.
3231 	 */
3232 	if (mce->mc_fullreap)
3233 		return;
3234 
3235 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3236 
3237 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3238 	    mgid.gid_guid);
3239 
3240 	/* While reacquiring, leave and then join the MCG */
3241 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3242 	    mce->mc_jstate);
3243 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3244 		ibd_print_warn(state, "Failure on port up to rejoin "
3245 		    "multicast gid %016llx:%016llx",
3246 		    (u_longlong_t)mgid.gid_prefix,
3247 		    (u_longlong_t)mgid.gid_guid);
3248 }
3249 
3250 /*
3251  * This code handles delayed Tx completion cleanups for mcg's to which
3252  * disable_multicast has been issued, regular mcg related cleanups during
3253  * disable_multicast, disable_promiscuous and mcg traps, as well as
3254  * cleanups during driver detach time. Depending on the join state,
3255  * it deletes the mce from the appropriate list and issues the IBA
3256  * leave/detach; except in the disable_multicast case when the mce
3257  * is left on the active list for a subsequent Tx completion cleanup.
3258  */
3259 static void
ibd_async_reap_group(ibd_state_t * state,ibd_mce_t * mce,ib_gid_t mgid,uint8_t jstate)3260 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3261     uint8_t jstate)
3262 {
3263 	ibd_mce_t *tmce;
3264 	boolean_t do_detach = B_TRUE;
3265 
3266 	/*
3267 	 * Before detaching, we must check whether the other list
3268 	 * contains the mcg; if we detach blindly, the consumer
3269 	 * who set up the other list will also stop receiving
3270 	 * traffic.
3271 	 */
3272 	if (jstate == IB_MC_JSTATE_FULL) {
3273 		/*
3274 		 * The following check is only relevant while coming
3275 		 * from the Tx completion path in the reap case.
3276 		 */
3277 		if (!mce->mc_fullreap)
3278 			return;
3279 		mutex_enter(&state->id_mc_mutex);
3280 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3281 		mutex_exit(&state->id_mc_mutex);
3282 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3283 			do_detach = B_FALSE;
3284 	} else if (jstate == IB_MC_JSTATE_NON) {
3285 		IBD_MCACHE_PULLOUT_NON(state, mce);
3286 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3287 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3288 			do_detach = B_FALSE;
3289 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3290 		mutex_enter(&state->id_mc_mutex);
3291 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3292 		mutex_exit(&state->id_mc_mutex);
3293 		do_detach = B_FALSE;
3294 	}
3295 
3296 	/*
3297 	 * If we are reacting to a mcg trap and leaving our sendonly or
3298 	 * non membership, the mcg is possibly already gone, so attempting
3299 	 * to leave might fail. On the other hand, we must try to leave
3300 	 * anyway, since this might be a trap from long ago, and we could
3301 	 * have potentially sendonly joined to a recent incarnation of
3302 	 * the mcg and are about to loose track of this information.
3303 	 */
3304 	if (do_detach) {
3305 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3306 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3307 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3308 	}
3309 
3310 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3311 	kmem_free(mce, sizeof (ibd_mce_t));
3312 }
3313 
3314 /*
3315  * Async code executed due to multicast and promiscuous disable requests
3316  * and mcg trap handling; also executed during driver detach. Mostly, a
3317  * leave and detach is done; except for the fullmember case when Tx
3318  * requests are pending, whence arrangements are made for subsequent
3319  * cleanup on Tx completion.
3320  */
3321 static void
ibd_leave_group(ibd_state_t * state,ib_gid_t mgid,uint8_t jstate)3322 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3323 {
3324 	ipoib_mac_t mcmac;
3325 	boolean_t recycled;
3326 	ibd_mce_t *mce;
3327 
3328 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3329 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3330 
3331 	if (jstate == IB_MC_JSTATE_NON) {
3332 		recycled = B_TRUE;
3333 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3334 		/*
3335 		 * In case we are handling a mcg trap, we might not find
3336 		 * the mcg in the non list.
3337 		 */
3338 		if (mce == NULL) {
3339 			return;
3340 		}
3341 	} else {
3342 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3343 
3344 		/*
3345 		 * In case we are handling a mcg trap, make sure the trap
3346 		 * is not arriving late; if we have an mce that indicates
3347 		 * that we are already a fullmember, that would be a clear
3348 		 * indication that the trap arrived late (ie, is for a
3349 		 * previous incarnation of the mcg).
3350 		 */
3351 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3352 			if ((mce == NULL) || (mce->mc_jstate ==
3353 			    IB_MC_JSTATE_FULL)) {
3354 				return;
3355 			}
3356 		} else {
3357 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3358 
3359 			/*
3360 			 * If join group failed, mce will be NULL here.
3361 			 * This is because in GLDv3 driver, set multicast
3362 			 *  will always return success.
3363 			 */
3364 			if (mce == NULL) {
3365 				return;
3366 			}
3367 
3368 			mce->mc_fullreap = B_TRUE;
3369 		}
3370 
3371 		/*
3372 		 * If no pending Tx's remain that reference the AH
3373 		 * for the mcg, recycle it from active to free list.
3374 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3375 		 * so the last completing Tx will cause an async reap
3376 		 * operation to be invoked, at which time we will drop our
3377 		 * membership to the mcg so that the pending Tx's complete
3378 		 * successfully. Refer to comments on "AH and MCE active
3379 		 * list manipulation" at top of this file. The lock protects
3380 		 * against Tx fast path and Tx cleanup code.
3381 		 */
3382 		mutex_enter(&state->id_ac_mutex);
3383 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3384 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3385 		    IB_MC_JSTATE_SEND_ONLY_NON));
3386 		mutex_exit(&state->id_ac_mutex);
3387 	}
3388 
3389 	if (recycled) {
3390 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3391 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3392 		ibd_async_reap_group(state, mce, mgid, jstate);
3393 	}
3394 }
3395 
3396 /*
3397  * Find the broadcast address as defined by IPoIB; implicitly
3398  * determines the IBA scope, mtu, tclass etc of the link the
3399  * interface is going to be a member of.
3400  */
3401 static ibt_status_t
ibd_find_bgroup(ibd_state_t * state)3402 ibd_find_bgroup(ibd_state_t *state)
3403 {
3404 	ibt_mcg_attr_t mcg_attr;
3405 	uint_t numg;
3406 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3407 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3408 	    IB_MC_SCOPE_GLOBAL };
3409 	int i, mcgmtu;
3410 	boolean_t found = B_FALSE;
3411 	int ret;
3412 	ibt_mcg_info_t mcg_info;
3413 
3414 	state->id_bgroup_created = B_FALSE;
3415 	state->id_bgroup_present = B_FALSE;
3416 
3417 query_bcast_grp:
3418 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3419 	mcg_attr.mc_pkey = state->id_pkey;
3420 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3421 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3422 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3423 
3424 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3425 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3426 
3427 		/*
3428 		 * Look for the IPoIB broadcast group.
3429 		 */
3430 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3431 		state->id_mgid.gid_prefix =
3432 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3433 		    ((uint64_t)state->id_scope << 48) |
3434 		    ((uint32_t)(state->id_pkey << 16)));
3435 		mcg_attr.mc_mgid = state->id_mgid;
3436 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3437 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3438 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3439 			found = B_TRUE;
3440 			break;
3441 		}
3442 	}
3443 
3444 	if (!found) {
3445 		if (state->id_create_broadcast_group) {
3446 			/*
3447 			 * If we created the broadcast group, but failed to
3448 			 * find it, we can't do anything except leave the
3449 			 * one we created and return failure.
3450 			 */
3451 			if (state->id_bgroup_created) {
3452 				ibd_print_warn(state, "IPoIB broadcast group "
3453 				    "absent. Unable to query after create.");
3454 				goto find_bgroup_fail;
3455 			}
3456 
3457 			/*
3458 			 * Create the ipoib broadcast group if it didn't exist
3459 			 */
3460 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3461 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3462 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3463 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3464 			mcg_attr.mc_pkey = state->id_pkey;
3465 			mcg_attr.mc_flow = 0;
3466 			mcg_attr.mc_sl = 0;
3467 			mcg_attr.mc_tclass = 0;
3468 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3469 			state->id_mgid.gid_prefix =
3470 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3471 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3472 			    ((uint32_t)(state->id_pkey << 16)));
3473 			mcg_attr.mc_mgid = state->id_mgid;
3474 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3475 
3476 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3477 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3478 				ibd_print_warn(state, "IPoIB broadcast group "
3479 				    "absent, create failed: ret = %d\n", ret);
3480 				state->id_bgroup_created = B_FALSE;
3481 				return (IBT_FAILURE);
3482 			}
3483 			state->id_bgroup_created = B_TRUE;
3484 			goto query_bcast_grp;
3485 		} else {
3486 			ibd_print_warn(state, "IPoIB broadcast group absent");
3487 			return (IBT_FAILURE);
3488 		}
3489 	}
3490 
3491 	/*
3492 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3493 	 */
3494 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3495 	if (state->id_mtu < mcgmtu) {
3496 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3497 		    "greater than port's maximum MTU %d", mcgmtu,
3498 		    state->id_mtu);
3499 		ibt_free_mcg_info(state->id_mcinfo, 1);
3500 		goto find_bgroup_fail;
3501 	}
3502 	state->id_mtu = mcgmtu;
3503 	state->id_bgroup_present = B_TRUE;
3504 
3505 	return (IBT_SUCCESS);
3506 
3507 find_bgroup_fail:
3508 	if (state->id_bgroup_created) {
3509 		(void) ibt_leave_mcg(state->id_sgid,
3510 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3511 		    IB_MC_JSTATE_FULL);
3512 	}
3513 
3514 	return (IBT_FAILURE);
3515 }
3516 
3517 static int
ibd_alloc_tx_copybufs(ibd_state_t * state)3518 ibd_alloc_tx_copybufs(ibd_state_t *state)
3519 {
3520 	ibt_mr_attr_t mem_attr;
3521 
3522 	/*
3523 	 * Allocate one big chunk for all regular tx copy bufs
3524 	 */
3525 	state->id_tx_buf_sz = state->id_mtu;
3526 	if (state->id_lso_policy && state->id_lso_capable &&
3527 	    (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3528 		state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3529 	}
3530 
3531 	state->id_tx_bufs = kmem_zalloc(state->id_ud_num_