1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * An implementation of the IPoIB standard based on PSARC 2001/289. 28 */ 29 30 #include <sys/types.h> 31 #include <sys/conf.h> 32 #include <sys/ddi.h> 33 #include <sys/sunddi.h> 34 #include <sys/modctl.h> 35 #include <sys/stropts.h> 36 #include <sys/stream.h> 37 #include <sys/strsun.h> 38 #include <sys/strsubr.h> 39 #include <sys/dlpi.h> 40 #include <sys/mac_provider.h> 41 42 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 43 #include <sys/sysmacros.h> /* for offsetof */ 44 #include <sys/disp.h> /* for async thread pri */ 45 #include <sys/atomic.h> /* for atomic_add*() */ 46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 47 #include <netinet/in.h> /* for netinet/ip.h below */ 48 #include <netinet/ip.h> /* for struct ip */ 49 #include <netinet/udp.h> /* for struct udphdr */ 50 #include <inet/common.h> /* for inet/ip.h below */ 51 #include <inet/ip.h> /* for ipha_t */ 52 #include <inet/ip6.h> /* for ip6_t */ 53 #include <inet/tcp.h> /* for tcph_t */ 54 #include <netinet/icmp6.h> /* for icmp6_t */ 55 #include <sys/callb.h> 56 #include <sys/modhash.h> 57 58 #include <sys/ib/clients/ibd/ibd.h> 59 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 60 #include <sys/note.h> 61 #include <sys/multidata.h> 62 63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 64 65 #include <sys/priv_names.h> 66 #include <sys/dls.h> 67 #include <sys/dld_ioc.h> 68 #include <sys/policy.h> 69 #include <sys/ibpart.h> 70 #include <sys/file.h> 71 72 /* 73 * The write-up below includes details on the following: 74 * 1. The dladm administrative model. 75 * 2. Late HCA initialization feature. 76 * 3. Brussels support and its implications to the current architecture. 77 * 78 * 1. The dladm administrative model. 79 * ------------------------------------------ 80 * With the dladm model, ibnex will create one ibd instance per port. These 81 * instances will be created independent of the port state. 82 * 83 * The ibd driver is two faceted: One side of it working as the port driver and 84 * the other as the partition object driver. 85 * 86 * The port instance is a child of the HCA, and will have an entry in the devfs. 87 * A DDI attach only happens for the port driver, and its attach is 88 * handled in ibd_port_attach(). Similary, a DDI detach for the port driver is 89 * handled in ibd_port_unattach(). 90 * 91 * The partition object is only a registrant to the mac layer via mac_register() 92 * and does not have an entry in the device tree. There is no DDI softstate 93 * managed by the DDI framework for the partition objects. 
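 *
 * As a hedged illustration (a simplified sketch of what ibd_register_mac()
 * is expected to do, not a verbatim copy; the field names are those of
 * mac_register_t from mac_provider(9E)), registering a partition object
 * with the mac layer looks roughly like:
 *
 *	mac_register_t *macp;
 *	int err;
 *
 *	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
 *		return (DDI_FAILURE);
 *	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
 *	macp->m_driver = state;
 *	macp->m_dip = dip;
 *	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
 *	macp->m_callbacks = &ibd_m_callbacks;
 *	macp->m_min_sdu = 0;
 *	macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE;
 *	macp->m_priv_props = ibd_priv_props;
 *	err = mac_register(macp, &state->id_mh);
 *	mac_free(macp);
 *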
 * The partition object's state is instead managed inside the ibd driver, and
 * every partition object hangs off the "ibd_objlist_head".
 *
 * The partition object first comes into existence when a user runs the
 * 'create-part' subcommand of dladm. This is like invoking the attach entry
 * point of the partition object. The partition object goes away with the
 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 * point of the partition object.
 *
 * The create-part and delete-part subcommands result in dld ioctls that end up
 * calling ibd_create_partition() and ibd_delete_partition() respectively.
 * These ioctls are registered with the dld layer in _init() via a call to
 * dld_ioc_register().
 *
 * The port instance by itself cannot be plumbed. Only the partition objects
 * can be plumbed, and they alone participate in I/O; the port driver does not.
 *
 * There are some info ioctls supported in ibd which are used by dladm(1M) to
 * display useful information. The info entry point for ibd is
 * ibd_get_partition_info().
 *
 * 2. Late HCA initialization feature.
 * ------------------------------------
 * As mentioned in section 1, the user creates the partition objects via
 * dladm(1M). It is possible that:
 * a) The physical port itself is down and the SM cannot be reached.
 * b) The PKEY specified by the user has not been created in the SM yet.
 * c) An IPoIB broadcast group for the specified PKEY is not present.
 *
 * In all of the above cases, complete initialization of the partition object
 * is not possible. However, the new model allows the creation of partition
 * objects even in such cases but will defer the initialization for later.
 * When such a partition object is plumbed, the link state will be displayed
 * as "down".
 * The driver, at this point, is listening to events that herald the
 * availability of resources -
 * i) LINK_UP when the link becomes available
 * ii) PORT_CHANGE when the PKEY has been created
 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
 * created
 * via ibd_async_handler() for events i) and ii), and via
 * ibd_snet_notices_handler() for iii).
 * The driver handles these events (as and when they arrive) and completes the
 * initialization of the partition object and transitions it to a usable state.
 *
 * 3. Brussels support and its implications to the current architecture.
 * ---------------------------------------------------------------------
 * The Brussels support introduces two new interfaces to the ibd driver -
 * ibd_m_getprop() and ibd_m_setprop().
 * These interfaces allow setting and retrieval of certain properties.
 * Some of them are public properties while most others are private properties
 * meant to be used by developers. Tuning the latter kind can cause
 * performance issues and should not be done without understanding the
 * implications. All properties are specific to an instance of either the
 * partition object or the port driver.
 *
 * The public properties are: mtu and linkmode.
 * mtu is a read-only property.
 * linkmode can take two values - UD and CM.
 *
 * Changing the linkmode requires some bookkeeping in the driver. The
 * capabilities need to be re-reported to the mac layer. This is done by
 * calling mac_capab_update(). The maxsdu is updated by calling
 * mac_maxsdu_update2().
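 *
 * A minimal sketch of that bookkeeping (hedged: the real work happens in
 * ibd_m_setprop()/ibd_set_priv_prop(), and the argument order assumed for
 * mac_maxsdu_update2() here - multicast SDU first, then unicast SDU - should
 * be checked against mac_provider(9E)). A switch from UD to CM mode would do
 * something along the lines of:
 *
 *	state->id_enable_rc = B_TRUE;
 *	mac_capab_update(state->id_mh);
 *	(void) mac_maxsdu_update2(state->id_mh,
 *	    IBD_DEF_MAX_SDU, IBD_DEF_RC_MAX_SDU);
 *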
159 * The private properties retain their values across the change of linkmode. 160 * NOTE: 161 * - The port driver does not support any property apart from mtu. 162 * - All other properties are only meant for the partition object. 163 * - The properties cannot be set when an instance is plumbed. The 164 * instance has to be unplumbed to effect any setting. 165 */ 166 167 /* 168 * Driver wide tunables 169 * 170 * ibd_tx_softintr 171 * ibd_rx_softintr 172 * The softintr mechanism allows ibd to avoid event queue overflows if 173 * the receive/completion handlers are to be expensive. These are enabled 174 * by default. 175 * 176 * ibd_log_sz 177 * This specifies the size of the ibd log buffer in bytes. The buffer is 178 * allocated and logging is enabled only when IBD_LOGGING is defined. 179 * 180 */ 181 uint_t ibd_rx_softintr = 1; 182 uint_t ibd_tx_softintr = 1; 183 184 #ifdef IBD_LOGGING 185 uint_t ibd_log_sz = 0x20000; 186 #endif 187 188 #ifdef IBD_LOGGING 189 #define IBD_LOG_SZ ibd_log_sz 190 #endif 191 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */ 193 #define IBD_RX_POST_CNT 8 194 195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ 196 #define IBD_LOG_RX_POST 4 197 198 /* Minimum number of receive work requests driver needs to always have */ 199 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) 200 201 /* 202 * LSO parameters 203 */ 204 #define IBD_LSO_MAXLEN 65536 205 #define IBD_LSO_BUFSZ 8192 206 207 /* 208 * Async operation states 209 */ 210 #define IBD_OP_NOTSTARTED 0 211 #define IBD_OP_ONGOING 1 212 #define IBD_OP_COMPLETED 2 213 #define IBD_OP_ERRORED 3 214 #define IBD_OP_ROUTERED 4 215 216 /* 217 * Start/stop in-progress flags; note that restart must always remain 218 * the OR of start and stop flag values. 
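 * For example, with the values defined below, IBD_DRV_RESTART_IN_PROGRESS is
 * (0x10000000 | 0x20000000) == 0x30000000, so a single mask test against the
 * restart value catches a start, a stop, or both being in progress.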
219 */ 220 #define IBD_DRV_START_IN_PROGRESS 0x10000000 221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS 224 225 /* 226 * Miscellaneous constants 227 */ 228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 229 #define IBD_DEF_MAX_SDU 2044 230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) 231 #define IBD_DEF_RC_MAX_SDU 65520 232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) 233 #define IBD_DEFAULT_QKEY 0xB1B 234 #ifdef IBD_LOGGING 235 #define IBD_DMAX_LINE 100 236 #endif 237 238 /* 239 * Enumerations for link states 240 */ 241 typedef enum { 242 IBD_LINK_DOWN, 243 IBD_LINK_UP, 244 IBD_LINK_UP_ABSENT 245 } ibd_link_op_t; 246 247 /* 248 * Driver State Pointer 249 */ 250 void *ibd_list; 251 252 /* 253 * Driver Global Data 254 */ 255 ibd_global_state_t ibd_gstate; 256 257 /* 258 * Partition object list 259 */ 260 ibd_state_t *ibd_objlist_head = NULL; 261 kmutex_t ibd_objlist_lock; 262 263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */ 264 265 /* 266 * Logging 267 */ 268 #ifdef IBD_LOGGING 269 kmutex_t ibd_lbuf_lock; 270 uint8_t *ibd_lbuf; 271 uint32_t ibd_lbuf_ndx; 272 #endif 273 274 /* 275 * Required system entry points 276 */ 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 279 280 /* 281 * Required driver entry points for GLDv3 282 */ 283 static int ibd_m_stat(void *, uint_t, uint64_t *); 284 static int ibd_m_start(void *); 285 static void ibd_m_stop(void *); 286 static int ibd_m_promisc(void *, boolean_t); 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 288 static int ibd_m_unicst(void *, const uint8_t *); 289 static mblk_t *ibd_m_tx(void *, mblk_t *); 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 291 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 293 const void *); 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, 296 mac_prop_info_handle_t); 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, 298 const void *); 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); 300 301 /* 302 * Private driver entry points for GLDv3 303 */ 304 305 /* 306 * Initialization 307 */ 308 static int ibd_state_init(ibd_state_t *, dev_info_t *); 309 static int ibd_init_txlist(ibd_state_t *); 310 static int ibd_init_rxlist(ibd_state_t *); 311 static int ibd_acache_init(ibd_state_t *); 312 #ifdef IBD_LOGGING 313 static void ibd_log_init(void); 314 #endif 315 316 /* 317 * Termination/cleanup 318 */ 319 static void ibd_state_fini(ibd_state_t *); 320 static void ibd_fini_txlist(ibd_state_t *); 321 static void ibd_fini_rxlist(ibd_state_t *); 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 324 static void ibd_acache_fini(ibd_state_t *); 325 #ifdef IBD_LOGGING 326 static void ibd_log_fini(void); 327 #endif 328 329 /* 330 * Allocation/acquire/map routines 331 */ 332 static int ibd_alloc_tx_copybufs(ibd_state_t *); 333 static int ibd_alloc_rx_copybufs(ibd_state_t *); 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 337 uint32_t *); 338 339 /* 340 * Free/release/unmap 
routines 341 */ 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 343 static void ibd_free_tx_copybufs(ibd_state_t *); 344 static void ibd_free_rx_copybufs(ibd_state_t *); 345 static void ibd_free_rx_rsrcs(ibd_state_t *); 346 static void ibd_free_tx_lsobufs(ibd_state_t *); 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 350 351 /* 352 * Handlers/callback routines 353 */ 354 static uint_t ibd_intr(caddr_t); 355 static uint_t ibd_tx_recycle(caddr_t); 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 362 static void ibd_freemsg_cb(char *); 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 364 ibt_async_event_t *); 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 366 ibt_async_event_t *); 367 static void ibd_snet_notices_handler(void *, ib_gid_t, 368 ibt_subnet_event_code_t, ibt_subnet_event_t *); 369 370 /* 371 * Send/receive routines 372 */ 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 377 378 /* 379 * Threads 380 */ 381 static void ibd_async_work(ibd_state_t *); 382 383 /* 384 * Async tasks 385 */ 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 388 static void ibd_async_setprom(ibd_state_t *); 389 static void ibd_async_unsetprom(ibd_state_t *); 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 392 static void ibd_async_txsched(ibd_state_t *); 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 394 395 /* 396 * Async task helpers 397 */ 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 402 ipoib_mac_t *, ipoib_mac_t *); 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 409 static uint64_t ibd_get_portspeed(ibd_state_t *); 410 static boolean_t ibd_async_safe(ibd_state_t *); 411 static void ibd_async_done(ibd_state_t *); 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 416 417 /* 418 * Helpers for attach/start routines 419 */ 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 421 
static int ibd_record_capab(ibd_state_t *); 422 static int ibd_get_port_details(ibd_state_t *); 423 static int ibd_alloc_cqs(ibd_state_t *); 424 static int ibd_setup_ud_channel(ibd_state_t *); 425 static int ibd_start(ibd_state_t *); 426 static int ibd_undo_start(ibd_state_t *, link_state_t); 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); 430 static void ibd_part_unattach(ibd_state_t *state); 431 static int ibd_port_attach(dev_info_t *); 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *); 434 static int ibd_part_busy(ibd_state_t *); 435 436 /* 437 * Miscellaneous helpers 438 */ 439 static int ibd_sched_poll(ibd_state_t *, int, int); 440 static void ibd_resume_transmission(ibd_state_t *); 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 443 static void *list_get_head(list_t *); 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 446 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); 449 450 #ifdef IBD_LOGGING 451 static void ibd_log(const char *, ...); 452 #endif 453 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 456 457 /* Module Driver Info */ 458 static struct modldrv ibd_modldrv = { 459 &mod_driverops, /* This one is a driver */ 460 "InfiniBand GLDv3 Driver", /* short description */ 461 &ibd_dev_ops /* driver specific ops */ 462 }; 463 464 /* Module Linkage */ 465 static struct modlinkage ibd_modlinkage = { 466 MODREV_1, (void *)&ibd_modldrv, NULL 467 }; 468 469 /* 470 * Module (static) info passed to IBTL during ibt_attach 471 */ 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 473 IBTI_V_CURR, 474 IBT_NETWORK, 475 ibd_async_handler, 476 NULL, 477 "IBPART" 478 }; 479 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { 481 IBTI_V_CURR, 482 IBT_NETWORK, 483 ibdpd_async_handler, 484 NULL, 485 "IPIB" 486 }; 487 488 /* 489 * GLDv3 entry points 490 */ 491 #define IBD_M_CALLBACK_FLAGS \ 492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) 493 494 static mac_callbacks_t ibd_m_callbacks = { 495 IBD_M_CALLBACK_FLAGS, 496 ibd_m_stat, 497 ibd_m_start, 498 ibd_m_stop, 499 ibd_m_promisc, 500 ibd_m_multicst, 501 ibd_m_unicst, 502 ibd_m_tx, 503 NULL, 504 NULL, 505 ibd_m_getcapab, 506 NULL, 507 NULL, 508 ibd_m_setprop, 509 ibd_m_getprop, 510 ibd_m_propinfo 511 }; 512 513 /* Private properties */ 514 char *ibd_priv_props[] = { 515 "_ibd_broadcast_group", 516 "_ibd_coalesce_completions", 517 "_ibd_create_broadcast_group", 518 "_ibd_hash_size", 519 "_ibd_lso_enable", 520 "_ibd_num_ah", 521 "_ibd_num_lso_bufs", 522 "_ibd_rc_enable_srq", 523 "_ibd_rc_num_rwqe", 524 "_ibd_rc_num_srq", 525 "_ibd_rc_num_swqe", 526 "_ibd_rc_rx_comp_count", 527 "_ibd_rc_rx_comp_usec", 528 "_ibd_rc_rx_copy_thresh", 529 "_ibd_rc_rx_rwqe_thresh", 530 "_ibd_rc_tx_comp_count", 531 "_ibd_rc_tx_comp_usec", 532 "_ibd_rc_tx_copy_thresh", 533 "_ibd_ud_num_rwqe", 534 "_ibd_ud_num_swqe", 535 "_ibd_ud_rx_comp_count", 536 "_ibd_ud_rx_comp_usec", 537 "_ibd_ud_tx_comp_count", 538 "_ibd_ud_tx_comp_usec", 539 "_ibd_ud_tx_copy_thresh", 540 NULL 541 
}; 542 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *); 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); 546 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = { 548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), 549 ibd_create_partition, secpolicy_dl_config}, 550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), 551 ibd_delete_partition, secpolicy_dl_config}, 552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), 553 ibd_get_partition_info, NULL} 554 }; 555 556 /* 557 * Fill/clear <scope> and <p_key> in multicast/broadcast address 558 */ 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 560 { \ 561 *(uint32_t *)((char *)(maddr) + 4) |= \ 562 htonl((uint32_t)(scope) << 16); \ 563 *(uint32_t *)((char *)(maddr) + 8) |= \ 564 htonl((uint32_t)(pkey) << 16); \ 565 } 566 567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 568 { \ 569 *(uint32_t *)((char *)(maddr) + 4) &= \ 570 htonl(~((uint32_t)0xF << 16)); \ 571 *(uint32_t *)((char *)(maddr) + 8) &= \ 572 htonl(~((uint32_t)0xFFFF << 16)); \ 573 } 574 575 /* 576 * Rudimentary debugging support 577 */ 578 #ifdef DEBUG 579 int ibd_debuglevel = 100; 580 void 581 debug_print(int l, char *fmt, ...) 582 { 583 va_list ap; 584 585 if (l < ibd_debuglevel) 586 return; 587 va_start(ap, fmt); 588 vcmn_err(CE_CONT, fmt, ap); 589 va_end(ap); 590 } 591 #endif 592 593 /* 594 * Common routine to print warning messages; adds in hca guid, port number 595 * and pkey to be able to identify the IBA interface. 596 */ 597 void 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 599 { 600 ib_guid_t hca_guid; 601 char ibd_print_buf[MAXNAMELEN + 256]; 602 int len; 603 va_list ap; 604 char part_name[MAXNAMELEN]; 605 datalink_id_t linkid = state->id_plinkid; 606 607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 608 0, "hca-guid", 0); 609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL); 610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ", 612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey, 614 part_name); 615 va_start(ap, fmt); 616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 617 fmt, ap); 618 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 619 va_end(ap); 620 } 621 622 /* 623 * Warlock directives 624 */ 625 626 /* 627 * id_lso_lock 628 * 629 * state->id_lso->bkt_nfree may be accessed without a lock to 630 * determine the threshold at which we have to ask the nw layer 631 * to resume transmission (see ibd_resume_transmission()). 
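 *
 * A minimal sketch of such a lock-free check (the threshold name below is
 * hypothetical; the real logic lives in ibd_resume_transmission() and is
 * more involved):
 *
 *	if ((state->id_lso != NULL) &&
 *	    (state->id_lso->bkt_nfree >= IBD_LSO_FREE_THRESHOLD))
 *		mac_tx_update(state->id_mh);
 *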
632 */ 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 634 ibd_state_t::id_lso)) 635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 638 639 /* 640 * id_scq_poll_lock 641 */ 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 643 ibd_state_t::id_scq_poll_busy)) 644 645 /* 646 * id_txpost_lock 647 */ 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 649 ibd_state_t::id_tx_head)) 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 651 ibd_state_t::id_tx_busy)) 652 653 /* 654 * id_acache_req_lock 655 */ 656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 657 ibd_state_t::id_acache_req_cv)) 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 659 ibd_state_t::id_req_list)) 660 _NOTE(SCHEME_PROTECTS_DATA("atomic", 661 ibd_acache_s::ac_ref)) 662 663 /* 664 * id_ac_mutex 665 * 666 * This mutex is actually supposed to protect id_ah_op as well, 667 * but this path of the code isn't clean (see update of id_ah_op 668 * in ibd_async_acache(), immediately after the call to 669 * ibd_async_mcache()). For now, we'll skip this check by 670 * declaring that id_ah_op is protected by some internal scheme 671 * that warlock isn't aware of. 672 */ 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 674 ibd_state_t::id_ah_active)) 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 676 ibd_state_t::id_ah_free)) 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 678 ibd_state_t::id_ah_addr)) 679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 680 ibd_state_t::id_ah_op)) 681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 682 ibd_state_t::id_ah_error)) 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 684 ibd_state_t::id_ac_hot_ace)) 685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 686 687 /* 688 * id_mc_mutex 689 */ 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 691 ibd_state_t::id_mc_full)) 692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 693 ibd_state_t::id_mc_non)) 694 695 /* 696 * id_trap_lock 697 */ 698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 699 ibd_state_t::id_trap_cv)) 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 701 ibd_state_t::id_trap_stop)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 703 ibd_state_t::id_trap_inprog)) 704 705 /* 706 * id_prom_op 707 */ 708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 709 ibd_state_t::id_prom_op)) 710 711 /* 712 * id_sched_lock 713 */ 714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 715 ibd_state_t::id_sched_needed)) 716 717 /* 718 * id_link_mutex 719 */ 720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 721 ibd_state_t::id_link_state)) 722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 724 ibd_state_t::id_link_speed)) 725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 726 727 /* 728 * id_tx_list.dl_mutex 729 */ 730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 731 ibd_state_t::id_tx_list.dl_head)) 732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 733 ibd_state_t::id_tx_list.dl_pending_sends)) 734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 735 ibd_state_t::id_tx_list.dl_cnt)) 736 737 /* 738 * id_rx_list.dl_mutex 739 */ 740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 741 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 743 ibd_state_t::id_rx_list.dl_cnt)) 744 745 /* 746 * rc_timeout_lock 747 */ 748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, 749 ibd_state_t::rc_timeout_start)) 750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, 751 ibd_state_t::rc_timeout)) 752 753 754 /* 755 * Items protected by atomic updates 756 */ 757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 758 ibd_state_s::id_brd_rcv 759 ibd_state_s::id_brd_xmt 760 ibd_state_s::id_multi_rcv 761 ibd_state_s::id_multi_xmt 762 ibd_state_s::id_num_intrs 763 ibd_state_s::id_rcv_bytes 764 ibd_state_s::id_rcv_pkt 765 ibd_state_s::id_rx_post_queue_index 766 ibd_state_s::id_tx_short 767 ibd_state_s::id_xmt_bytes 768 ibd_state_s::id_xmt_pkt 769 ibd_state_s::rc_rcv_trans_byte 770 ibd_state_s::rc_rcv_trans_pkt 771 ibd_state_s::rc_rcv_copy_byte 772 ibd_state_s::rc_rcv_copy_pkt 773 ibd_state_s::rc_xmt_bytes 774 ibd_state_s::rc_xmt_small_pkt 775 ibd_state_s::rc_xmt_fragmented_pkt 776 ibd_state_s::rc_xmt_map_fail_pkt 777 ibd_state_s::rc_xmt_map_succ_pkt 778 ibd_rc_chan_s::rcq_invoking)) 779 780 /* 781 * Non-mutex protection schemes for data elements. Almost all of 782 * these are non-shared items. 783 */ 784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 785 callb_cpr 786 ib_gid_s 787 ib_header_info 788 ibd_acache_rq 789 ibd_acache_s::ac_mce 790 ibd_acache_s::ac_chan 791 ibd_mcache::mc_fullreap 792 ibd_mcache::mc_jstate 793 ibd_mcache::mc_req 794 ibd_rwqe_s 795 ibd_swqe_s 796 ibd_wqe_s 797 ibt_wr_ds_s::ds_va 798 ibt_wr_lso_s 799 ipoib_mac::ipoib_qpn 800 mac_capab_lso_s 801 msgb::b_next 802 msgb::b_cont 803 msgb::b_rptr 804 msgb::b_wptr 805 ibd_state_s::id_bgroup_created 806 ibd_state_s::id_mac_state 807 ibd_state_s::id_mtu 808 ibd_state_s::id_ud_num_rwqe 809 ibd_state_s::id_ud_num_swqe 810 ibd_state_s::id_qpnum 811 ibd_state_s::id_rcq_hdl 812 ibd_state_s::id_rx_buf_sz 813 ibd_state_s::id_rx_bufs 814 ibd_state_s::id_rx_mr_hdl 815 ibd_state_s::id_rx_wqes 816 ibd_state_s::id_rxwcs 817 ibd_state_s::id_rxwcs_size 818 ibd_state_s::id_rx_nqueues 819 ibd_state_s::id_rx_queues 820 ibd_state_s::id_scope 821 ibd_state_s::id_scq_hdl 822 ibd_state_s::id_tx_buf_sz 823 ibd_state_s::id_tx_bufs 824 ibd_state_s::id_tx_mr_hdl 825 ibd_state_s::id_tx_rel_list.dl_cnt 826 ibd_state_s::id_tx_wqes 827 ibd_state_s::id_txwcs 828 ibd_state_s::id_txwcs_size 829 ibd_state_s::rc_listen_hdl 830 ibd_state_s::rc_listen_hdl_OFED_interop 831 ibd_state_s::rc_srq_size 832 ibd_state_s::rc_srq_rwqes 833 ibd_state_s::rc_srq_rx_bufs 834 ibd_state_s::rc_srq_rx_mr_hdl 835 ibd_state_s::rc_tx_largebuf_desc_base 836 ibd_state_s::rc_tx_mr_bufs 837 ibd_state_s::rc_tx_mr_hdl 838 ipha_s 839 icmph_s 840 ibt_path_info_s::pi_sid 841 ibd_rc_chan_s::ace 842 ibd_rc_chan_s::chan_hdl 843 ibd_rc_chan_s::state 844 ibd_rc_chan_s::chan_state 845 ibd_rc_chan_s::is_tx_chan 846 ibd_rc_chan_s::rcq_hdl 847 ibd_rc_chan_s::rcq_size 848 ibd_rc_chan_s::scq_hdl 849 ibd_rc_chan_s::scq_size 850 ibd_rc_chan_s::rx_bufs 851 ibd_rc_chan_s::rx_mr_hdl 852 ibd_rc_chan_s::rx_rwqes 853 ibd_rc_chan_s::tx_wqes 854 ibd_rc_chan_s::tx_mr_bufs 855 ibd_rc_chan_s::tx_mr_hdl 856 ibd_rc_chan_s::tx_rel_list.dl_cnt 857 ibd_rc_chan_s::is_used 858 ibd_rc_tx_largebuf_s::lb_buf 859 ibd_rc_msg_hello_s 860 ibt_cm_return_args_s)) 861 862 /* 863 * ibd_rc_chan_s::next is protected by two mutexes: 864 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 865 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
866 */ 867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes", 868 ibd_rc_chan_s::next)) 869 870 /* 871 * ibd_state_s.rc_tx_large_bufs_lock 872 */ 873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 874 ibd_state_s::rc_tx_largebuf_free_head)) 875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 876 ibd_state_s::rc_tx_largebuf_nfree)) 877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 878 ibd_rc_tx_largebuf_s::lb_next)) 879 880 /* 881 * ibd_acache_s.tx_too_big_mutex 882 */ 883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex, 884 ibd_acache_s::tx_too_big_ongoing)) 885 886 /* 887 * tx_wqe_list.dl_mutex 888 */ 889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 890 ibd_rc_chan_s::tx_wqe_list.dl_head)) 891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 892 ibd_rc_chan_s::tx_wqe_list.dl_pending_sends)) 893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 894 ibd_rc_chan_s::tx_wqe_list.dl_cnt)) 895 896 /* 897 * ibd_state_s.rc_ace_recycle_lock 898 */ 899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock, 900 ibd_state_s::rc_ace_recycle)) 901 902 /* 903 * rc_srq_rwqe_list.dl_mutex 904 */ 905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 906 ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding)) 907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 908 ibd_state_t::rc_srq_rwqe_list.dl_cnt)) 909 910 /* 911 * Non-mutex protection schemes for data elements. They are counters 912 * for problem diagnosis. Don't need be protected. 913 */ 914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 915 ibd_state_s::rc_rcv_alloc_fail 916 ibd_state_s::rc_rcq_err 917 ibd_state_s::rc_ace_not_found 918 ibd_state_s::rc_xmt_drop_too_long_pkt 919 ibd_state_s::rc_xmt_icmp_too_long_pkt 920 ibd_state_s::rc_xmt_reenter_too_long_pkt 921 ibd_state_s::rc_swqe_short 922 ibd_state_s::rc_swqe_mac_update 923 ibd_state_s::rc_xmt_buf_short 924 ibd_state_s::rc_xmt_buf_mac_update 925 ibd_state_s::rc_scq_no_swqe 926 ibd_state_s::rc_scq_no_largebuf 927 ibd_state_s::rc_conn_succ 928 ibd_state_s::rc_conn_fail 929 ibd_state_s::rc_null_conn 930 ibd_state_s::rc_no_estab_conn 931 ibd_state_s::rc_act_close 932 ibd_state_s::rc_pas_close 933 ibd_state_s::rc_delay_ace_recycle 934 ibd_state_s::rc_act_close_simultaneous 935 ibd_state_s::rc_act_close_not_clean 936 ibd_state_s::rc_pas_close_rcq_invoking 937 ibd_state_s::rc_reset_cnt 938 ibd_state_s::rc_timeout_act 939 ibd_state_s::rc_timeout_pas 940 ibd_state_s::rc_stop_connect)) 941 942 #ifdef DEBUG 943 /* 944 * Non-mutex protection schemes for data elements. They are counters 945 * for problem diagnosis. Don't need be protected. 
946 */ 947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 948 ibd_state_s::rc_rwqe_short 949 ibd_rc_stat_s::rc_rcv_trans_byte 950 ibd_rc_stat_s::rc_rcv_trans_pkt 951 ibd_rc_stat_s::rc_rcv_copy_byte 952 ibd_rc_stat_s::rc_rcv_copy_pkt 953 ibd_rc_stat_s::rc_rcv_alloc_fail 954 ibd_rc_stat_s::rc_rcq_err 955 ibd_rc_stat_s::rc_rwqe_short 956 ibd_rc_stat_s::rc_xmt_bytes 957 ibd_rc_stat_s::rc_xmt_small_pkt 958 ibd_rc_stat_s::rc_xmt_fragmented_pkt 959 ibd_rc_stat_s::rc_xmt_map_fail_pkt 960 ibd_rc_stat_s::rc_xmt_map_succ_pkt 961 ibd_rc_stat_s::rc_ace_not_found 962 ibd_rc_stat_s::rc_scq_no_swqe 963 ibd_rc_stat_s::rc_scq_no_largebuf 964 ibd_rc_stat_s::rc_swqe_short 965 ibd_rc_stat_s::rc_swqe_mac_update 966 ibd_rc_stat_s::rc_xmt_buf_short 967 ibd_rc_stat_s::rc_xmt_buf_mac_update 968 ibd_rc_stat_s::rc_conn_succ 969 ibd_rc_stat_s::rc_conn_fail 970 ibd_rc_stat_s::rc_null_conn 971 ibd_rc_stat_s::rc_no_estab_conn 972 ibd_rc_stat_s::rc_act_close 973 ibd_rc_stat_s::rc_pas_close 974 ibd_rc_stat_s::rc_delay_ace_recycle 975 ibd_rc_stat_s::rc_act_close_simultaneous 976 ibd_rc_stat_s::rc_reset_cnt 977 ibd_rc_stat_s::rc_timeout_act 978 ibd_rc_stat_s::rc_timeout_pas)) 979 #endif 980 981 int 982 _init() 983 { 984 int status; 985 986 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 987 PAGESIZE), 0); 988 if (status != 0) { 989 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 990 return (status); 991 } 992 993 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); 994 995 mac_init_ops(&ibd_dev_ops, "ibp"); 996 status = mod_install(&ibd_modlinkage); 997 if (status != 0) { 998 DPRINT(10, "_init:failed in mod_install()"); 999 ddi_soft_state_fini(&ibd_list); 1000 mac_fini_ops(&ibd_dev_ops); 1001 return (status); 1002 } 1003 1004 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 1005 mutex_enter(&ibd_gstate.ig_mutex); 1006 ibd_gstate.ig_ibt_hdl = NULL; 1007 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 1008 ibd_gstate.ig_service_list = NULL; 1009 mutex_exit(&ibd_gstate.ig_mutex); 1010 1011 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, 1012 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { 1013 return (EIO); 1014 } 1015 1016 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); 1017 1018 #ifdef IBD_LOGGING 1019 ibd_log_init(); 1020 #endif 1021 return (0); 1022 } 1023 1024 int 1025 _info(struct modinfo *modinfop) 1026 { 1027 return (mod_info(&ibd_modlinkage, modinfop)); 1028 } 1029 1030 int 1031 _fini() 1032 { 1033 int status; 1034 1035 status = mod_remove(&ibd_modlinkage); 1036 if (status != 0) 1037 return (status); 1038 1039 ibt_unregister_part_attr_cb(); 1040 1041 mac_fini_ops(&ibd_dev_ops); 1042 mutex_destroy(&ibd_objlist_lock); 1043 ddi_soft_state_fini(&ibd_list); 1044 mutex_destroy(&ibd_gstate.ig_mutex); 1045 #ifdef IBD_LOGGING 1046 ibd_log_fini(); 1047 #endif 1048 return (0); 1049 } 1050 1051 /* 1052 * Convert the GID part of the mac address from network byte order 1053 * to host order. 1054 */ 1055 static void 1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 1057 { 1058 ib_sn_prefix_t nbopref; 1059 ib_guid_t nboguid; 1060 1061 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 1062 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 1063 dgid->gid_prefix = b2h64(nbopref); 1064 dgid->gid_guid = b2h64(nboguid); 1065 } 1066 1067 /* 1068 * Create the IPoIB address in network byte order from host order inputs. 
1069 */ 1070 static void 1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 1072 ib_guid_t guid) 1073 { 1074 ib_sn_prefix_t nbopref; 1075 ib_guid_t nboguid; 1076 1077 mac->ipoib_qpn = htonl(qpn); 1078 nbopref = h2b64(prefix); 1079 nboguid = h2b64(guid); 1080 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 1081 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 1082 } 1083 1084 /* 1085 * Send to the appropriate all-routers group when the IBA multicast group 1086 * does not exist, based on whether the target group is v4 or v6. 1087 */ 1088 static boolean_t 1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 1090 ipoib_mac_t *rmac) 1091 { 1092 boolean_t retval = B_TRUE; 1093 uint32_t adjscope = state->id_scope << 16; 1094 uint32_t topword; 1095 1096 /* 1097 * Copy the first 4 bytes in without assuming any alignment of 1098 * input mac address; this will have IPoIB signature, flags and 1099 * scope bits. 1100 */ 1101 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 1102 topword = ntohl(topword); 1103 1104 /* 1105 * Generate proper address for IPv4/v6, adding in the Pkey properly. 1106 */ 1107 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 1108 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 1109 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 1110 ((uint32_t)(state->id_pkey << 16))), 1111 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 1112 else 1113 /* 1114 * Does not have proper bits in the mgid address. 1115 */ 1116 retval = B_FALSE; 1117 1118 return (retval); 1119 } 1120 1121 /* 1122 * Membership states for different mcg's are tracked by two lists: 1123 * the "non" list is used for promiscuous mode, when all mcg traffic 1124 * needs to be inspected. This type of membership is never used for 1125 * transmission, so there can not be an AH in the active list 1126 * corresponding to a member in this list. This list does not need 1127 * any protection, since all operations are performed by the async 1128 * thread. 1129 * 1130 * "Full" and "SendOnly" membership is tracked using a single list, 1131 * the "full" list. This is because this single list can then be 1132 * searched during transmit to a multicast group (if an AH for the 1133 * mcg is not found in the active list), since at least one type 1134 * of membership must be present before initiating the transmit. 1135 * This list is also emptied during driver detach, since sendonly 1136 * membership acquired during transmit is dropped at detach time 1137 * along with ipv4 broadcast full membership. Insert/deletes to 1138 * this list are done only by the async thread, but it is also 1139 * searched in program context (see multicast disable case), thus 1140 * the id_mc_mutex protects the list. The driver detach path also 1141 * deconstructs the "full" list, but it ensures that the async 1142 * thread will not be accessing the list (by blocking out mcg 1143 * trap handling and making sure no more Tx reaping will happen). 1144 * 1145 * Currently, an IBA attach is done in the SendOnly case too, 1146 * although this is not required. 
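 *
 * As a hedged illustration (simplified from ibd_async_mcache() further
 * below), the transmit-side consultation of the "full" list looks roughly
 * like:
 *
 *	ib_gid_t mgid;
 *	ibd_mce_t *mce;
 *
 *	ibd_n2h_gid(mac, &mgid);
 *	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) == NULL)
 *		mce = ibd_join_group(state, mgid,
 *		    IB_MC_JSTATE_SEND_ONLY_NON);
 *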
1147 */ 1148 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1149 list_insert_head(&state->id_mc_full, mce) 1150 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1151 list_insert_head(&state->id_mc_non, mce) 1152 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1153 ibd_mcache_find(mgid, &state->id_mc_full) 1154 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1155 ibd_mcache_find(mgid, &state->id_mc_non) 1156 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1157 list_remove(&state->id_mc_full, mce) 1158 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1159 list_remove(&state->id_mc_non, mce) 1160 1161 static void * 1162 list_get_head(list_t *list) 1163 { 1164 list_node_t *lhead = list_head(list); 1165 1166 if (lhead != NULL) 1167 list_remove(list, lhead); 1168 return (lhead); 1169 } 1170 1171 /* 1172 * This is always guaranteed to be able to queue the work. 1173 */ 1174 void 1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1176 { 1177 /* Initialize request */ 1178 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1179 ptr->rq_op = op; 1180 1181 /* 1182 * Queue provided slot onto request pool. 1183 */ 1184 mutex_enter(&state->id_acache_req_lock); 1185 list_insert_tail(&state->id_req_list, ptr); 1186 1187 /* Go, fetch, async thread */ 1188 cv_signal(&state->id_acache_req_cv); 1189 mutex_exit(&state->id_acache_req_lock); 1190 } 1191 1192 /* 1193 * Main body of the per interface async thread. 1194 */ 1195 static void 1196 ibd_async_work(ibd_state_t *state) 1197 { 1198 ibd_req_t *ptr; 1199 callb_cpr_t cprinfo; 1200 1201 mutex_enter(&state->id_acache_req_lock); 1202 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1203 callb_generic_cpr, "ibd_async_work"); 1204 1205 for (;;) { 1206 ptr = list_get_head(&state->id_req_list); 1207 if (ptr != NULL) { 1208 mutex_exit(&state->id_acache_req_lock); 1209 1210 /* 1211 * If we are in late hca initialization mode, do not 1212 * process any other async request other than TRAP. TRAP 1213 * is used for indicating creation of a broadcast group; 1214 * in which case, we need to join/create the group. 1215 */ 1216 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 1217 (ptr->rq_op != IBD_ASYNC_TRAP)) { 1218 goto free_req_and_continue; 1219 } 1220 1221 /* 1222 * Once we have done the operation, there is no 1223 * guarantee the request slot is going to be valid, 1224 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1225 * TRAP). 1226 * 1227 * Perform the request. 1228 */ 1229 switch (ptr->rq_op) { 1230 case IBD_ASYNC_GETAH: 1231 ibd_async_acache(state, &ptr->rq_mac); 1232 break; 1233 case IBD_ASYNC_JOIN: 1234 case IBD_ASYNC_LEAVE: 1235 ibd_async_multicast(state, 1236 ptr->rq_gid, ptr->rq_op); 1237 break; 1238 case IBD_ASYNC_PROMON: 1239 ibd_async_setprom(state); 1240 break; 1241 case IBD_ASYNC_PROMOFF: 1242 ibd_async_unsetprom(state); 1243 break; 1244 case IBD_ASYNC_REAP: 1245 ibd_async_reap_group(state, 1246 ptr->rq_ptr, ptr->rq_gid, 1247 IB_MC_JSTATE_FULL); 1248 /* 1249 * the req buf contains in mce 1250 * structure, so we do not need 1251 * to free it here. 
1252 */ 1253 ptr = NULL; 1254 break; 1255 case IBD_ASYNC_TRAP: 1256 ibd_async_trap(state, ptr); 1257 break; 1258 case IBD_ASYNC_SCHED: 1259 ibd_async_txsched(state); 1260 break; 1261 case IBD_ASYNC_LINK: 1262 ibd_async_link(state, ptr); 1263 break; 1264 case IBD_ASYNC_EXIT: 1265 mutex_enter(&state->id_acache_req_lock); 1266 #ifndef __lock_lint 1267 CALLB_CPR_EXIT(&cprinfo); 1268 #else 1269 mutex_exit(&state->id_acache_req_lock); 1270 #endif 1271 return; 1272 case IBD_ASYNC_RC_TOO_BIG: 1273 ibd_async_rc_process_too_big(state, 1274 ptr); 1275 break; 1276 case IBD_ASYNC_RC_CLOSE_ACT_CHAN: 1277 ibd_async_rc_close_act_chan(state, ptr); 1278 break; 1279 case IBD_ASYNC_RC_RECYCLE_ACE: 1280 ibd_async_rc_recycle_ace(state, ptr); 1281 break; 1282 case IBD_ASYNC_RC_CLOSE_PAS_CHAN: 1283 (void) ibd_rc_pas_close(ptr->rq_ptr, 1284 B_TRUE, B_TRUE); 1285 break; 1286 } 1287 free_req_and_continue: 1288 if (ptr != NULL) 1289 kmem_cache_free(state->id_req_kmc, ptr); 1290 1291 mutex_enter(&state->id_acache_req_lock); 1292 } else { 1293 #ifndef __lock_lint 1294 /* 1295 * Nothing to do: wait till new request arrives. 1296 */ 1297 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1298 cv_wait(&state->id_acache_req_cv, 1299 &state->id_acache_req_lock); 1300 CALLB_CPR_SAFE_END(&cprinfo, 1301 &state->id_acache_req_lock); 1302 #endif 1303 } 1304 } 1305 1306 /*NOTREACHED*/ 1307 _NOTE(NOT_REACHED) 1308 } 1309 1310 /* 1311 * Return when it is safe to queue requests to the async daemon; primarily 1312 * for subnet trap and async event handling. Disallow requests before the 1313 * daemon is created, and when interface deinitilization starts. 1314 */ 1315 static boolean_t 1316 ibd_async_safe(ibd_state_t *state) 1317 { 1318 mutex_enter(&state->id_trap_lock); 1319 if (state->id_trap_stop) { 1320 mutex_exit(&state->id_trap_lock); 1321 return (B_FALSE); 1322 } 1323 state->id_trap_inprog++; 1324 mutex_exit(&state->id_trap_lock); 1325 return (B_TRUE); 1326 } 1327 1328 /* 1329 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1330 * trap or event handling to complete to kill the async thread and deconstruct 1331 * the mcg/ace list. 1332 */ 1333 static void 1334 ibd_async_done(ibd_state_t *state) 1335 { 1336 mutex_enter(&state->id_trap_lock); 1337 if (--state->id_trap_inprog == 0) 1338 cv_signal(&state->id_trap_cv); 1339 mutex_exit(&state->id_trap_lock); 1340 } 1341 1342 /* 1343 * Hash functions: 1344 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1345 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1346 * These operate on mac addresses input into ibd_send, but there is no 1347 * guarantee on the alignment of the ipoib_mac_t structure. 1348 */ 1349 /*ARGSUSED*/ 1350 static uint_t 1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1352 { 1353 ulong_t ptraddr = (ulong_t)key; 1354 uint_t hval; 1355 1356 /* 1357 * If the input address is 4 byte aligned, we can just dereference 1358 * it. This is most common, since IP will send in a 4 byte aligned 1359 * IP header, which implies the 24 byte IPoIB psuedo header will be 1360 * 4 byte aligned too. 
1361 */ 1362 if ((ptraddr & 3) == 0) 1363 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1364 1365 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1366 return (hval); 1367 } 1368 1369 static int 1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1371 { 1372 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1373 return (0); 1374 else 1375 return (1); 1376 } 1377 1378 /* 1379 * Initialize all the per interface caches and lists; AH cache, 1380 * MCG list etc. 1381 */ 1382 static int 1383 ibd_acache_init(ibd_state_t *state) 1384 { 1385 ibd_ace_t *ce; 1386 int i; 1387 1388 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1389 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1390 mutex_enter(&state->id_ac_mutex); 1391 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1392 offsetof(ibd_ace_t, ac_list)); 1393 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1394 offsetof(ibd_ace_t, ac_list)); 1395 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1396 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, 1397 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1398 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1399 offsetof(ibd_mce_t, mc_list)); 1400 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1401 offsetof(ibd_mce_t, mc_list)); 1402 state->id_ac_hot_ace = NULL; 1403 1404 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1405 state->id_num_ah, KM_SLEEP); 1406 for (i = 0; i < state->id_num_ah; i++, ce++) { 1407 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1408 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1409 mutex_exit(&state->id_ac_mutex); 1410 ibd_acache_fini(state); 1411 return (DDI_FAILURE); 1412 } else { 1413 CLEAR_REFCYCLE(ce); 1414 ce->ac_mce = NULL; 1415 mutex_init(&ce->tx_too_big_mutex, NULL, 1416 MUTEX_DRIVER, NULL); 1417 IBD_ACACHE_INSERT_FREE(state, ce); 1418 } 1419 } 1420 mutex_exit(&state->id_ac_mutex); 1421 return (DDI_SUCCESS); 1422 } 1423 1424 static void 1425 ibd_acache_fini(ibd_state_t *state) 1426 { 1427 ibd_ace_t *ptr; 1428 1429 mutex_enter(&state->id_ac_mutex); 1430 1431 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1432 ASSERT(GET_REF(ptr) == 0); 1433 mutex_destroy(&ptr->tx_too_big_mutex); 1434 (void) ibt_free_ud_dest(ptr->ac_dest); 1435 } 1436 1437 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1438 ASSERT(GET_REF(ptr) == 0); 1439 mutex_destroy(&ptr->tx_too_big_mutex); 1440 (void) ibt_free_ud_dest(ptr->ac_dest); 1441 } 1442 1443 list_destroy(&state->id_ah_free); 1444 list_destroy(&state->id_ah_active); 1445 list_destroy(&state->id_mc_full); 1446 list_destroy(&state->id_mc_non); 1447 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); 1448 mutex_exit(&state->id_ac_mutex); 1449 mutex_destroy(&state->id_ac_mutex); 1450 mutex_destroy(&state->id_mc_mutex); 1451 } 1452 1453 /* 1454 * Search AH active hash list for a cached path to input destination. 1455 * If we are "just looking", hold == F. When we are in the Tx path, 1456 * we set hold == T to grab a reference on the AH so that it can not 1457 * be recycled to a new destination while the Tx request is posted. 1458 */ 1459 ibd_ace_t * 1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1461 { 1462 ibd_ace_t *ptr; 1463 1464 ASSERT(mutex_owned(&state->id_ac_mutex)); 1465 1466 /* 1467 * Do hash search. 
1468 */ 1469 if (mod_hash_find(state->id_ah_active_hash, 1470 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1471 if (hold) 1472 INC_REF(ptr, num); 1473 return (ptr); 1474 } 1475 return (NULL); 1476 } 1477 1478 /* 1479 * This is called by the tx side; if an initialized AH is found in 1480 * the active list, it is locked down and can be used; if no entry 1481 * is found, an async request is queued to do path resolution. 1482 */ 1483 static ibd_ace_t * 1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1485 { 1486 ibd_ace_t *ptr; 1487 ibd_req_t *req; 1488 1489 /* 1490 * Only attempt to print when we can; in the mdt pattr case, the 1491 * address is not aligned properly. 1492 */ 1493 if (((ulong_t)mac & 3) == 0) { 1494 DPRINT(4, 1495 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1496 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1497 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1498 htonl(mac->ipoib_gidsuff[1])); 1499 } 1500 1501 mutex_enter(&state->id_ac_mutex); 1502 1503 if (((ptr = state->id_ac_hot_ace) != NULL) && 1504 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1505 INC_REF(ptr, numwqe); 1506 mutex_exit(&state->id_ac_mutex); 1507 return (ptr); 1508 } 1509 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1510 state->id_ac_hot_ace = ptr; 1511 mutex_exit(&state->id_ac_mutex); 1512 return (ptr); 1513 } 1514 1515 /* 1516 * Implementation of a single outstanding async request; if 1517 * the operation is not started yet, queue a request and move 1518 * to ongoing state. Remember in id_ah_addr for which address 1519 * we are queueing the request, in case we need to flag an error; 1520 * Any further requests, for the same or different address, until 1521 * the operation completes, is sent back to GLDv3 to be retried. 1522 * The async thread will update id_ah_op with an error indication 1523 * or will set it to indicate the next look up can start; either 1524 * way, it will mac_tx_update() so that all blocked requests come 1525 * back here. 1526 */ 1527 *err = EAGAIN; 1528 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1529 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1530 if (req != NULL) { 1531 /* 1532 * We did not even find the entry; queue a request 1533 * for it. 1534 */ 1535 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1536 state->id_ah_op = IBD_OP_ONGOING; 1537 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1538 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1539 } 1540 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1541 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1542 /* 1543 * Check the status of the pathrecord lookup request 1544 * we had queued before. 1545 */ 1546 if (state->id_ah_op == IBD_OP_ERRORED) { 1547 *err = EFAULT; 1548 state->id_ah_error++; 1549 } else { 1550 /* 1551 * IBD_OP_ROUTERED case: We need to send to the 1552 * all-router MCG. If we can find the AH for 1553 * the mcg, the Tx will be attempted. If we 1554 * do not find the AH, we return NORESOURCES 1555 * to retry. 1556 */ 1557 ipoib_mac_t routermac; 1558 1559 (void) ibd_get_allroutergroup(state, mac, &routermac); 1560 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1561 numwqe); 1562 } 1563 state->id_ah_op = IBD_OP_NOTSTARTED; 1564 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1565 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1566 /* 1567 * This case can happen when we get a higher band 1568 * packet. 
The easiest way is to reset the state machine 1569 * to accommodate the higher priority packet. 1570 */ 1571 state->id_ah_op = IBD_OP_NOTSTARTED; 1572 } 1573 mutex_exit(&state->id_ac_mutex); 1574 1575 return (ptr); 1576 } 1577 1578 /* 1579 * Grab a not-currently-in-use AH/PathRecord from the active 1580 * list to recycle to a new destination. Only the async thread 1581 * executes this code. 1582 */ 1583 static ibd_ace_t * 1584 ibd_acache_get_unref(ibd_state_t *state) 1585 { 1586 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1587 boolean_t try_rc_chan_recycle = B_FALSE; 1588 1589 ASSERT(mutex_owned(&state->id_ac_mutex)); 1590 1591 /* 1592 * Do plain linear search. 1593 */ 1594 while (ptr != NULL) { 1595 /* 1596 * Note that it is possible that the "cycle" bit 1597 * is set on the AH w/o any reference count. The 1598 * mcg must have been deleted, and the tx cleanup 1599 * just decremented the reference count to 0, but 1600 * hasn't gotten around to grabbing the id_ac_mutex 1601 * to move the AH into the free list. 1602 */ 1603 if (GET_REF(ptr) == 0) { 1604 if (ptr->ac_chan != NULL) { 1605 ASSERT(state->id_enable_rc == B_TRUE); 1606 if (!try_rc_chan_recycle) { 1607 try_rc_chan_recycle = B_TRUE; 1608 ibd_rc_signal_ace_recycle(state, ptr); 1609 } 1610 } else { 1611 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1612 break; 1613 } 1614 } 1615 ptr = list_prev(&state->id_ah_active, ptr); 1616 } 1617 return (ptr); 1618 } 1619 1620 /* 1621 * Invoked to clean up AH from active list in case of multicast 1622 * disable and to handle sendonly memberships during mcg traps. 1623 * And for port up processing for multicast and unicast AHs. 1624 * Normally, the AH is taken off the active list, and put into 1625 * the free list to be recycled for a new destination. In case 1626 * Tx requests on the AH have not completed yet, the AH is marked 1627 * for reaping (which will put the AH on the free list) once the Tx's 1628 * complete; in this case, depending on the "force" input, we take 1629 * out the AH from the active list right now, or leave it also for 1630 * the reap operation. Returns TRUE if the AH is taken off the active 1631 * list (and either put into the free list right now, or arranged for 1632 * later), FALSE otherwise. 1633 */ 1634 boolean_t 1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1636 { 1637 ibd_ace_t *acactive; 1638 boolean_t ret = B_TRUE; 1639 1640 ASSERT(mutex_owned(&state->id_ac_mutex)); 1641 1642 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1643 1644 /* 1645 * Note that the AH might already have the cycle bit set 1646 * on it; this might happen if sequences of multicast 1647 * enables and disables are coming so fast, that posted 1648 * Tx's to the mcg have not completed yet, and the cycle 1649 * bit is set successively by each multicast disable. 1650 */ 1651 if (SET_CYCLE_IF_REF(acactive)) { 1652 if (!force) { 1653 /* 1654 * The ace is kept on the active list, further 1655 * Tx's can still grab a reference on it; the 1656 * ace is reaped when all pending Tx's 1657 * referencing the AH complete. 1658 */ 1659 ret = B_FALSE; 1660 } else { 1661 /* 1662 * In the mcg trap case, we always pull the 1663 * AH from the active list. And also the port 1664 * up multi/unicast case. 
1665 */ 1666 ASSERT(acactive->ac_chan == NULL); 1667 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1668 acactive->ac_mce = NULL; 1669 } 1670 } else { 1671 /* 1672 * Determined the ref count is 0, thus reclaim 1673 * immediately after pulling out the ace from 1674 * the active list. 1675 */ 1676 ASSERT(acactive->ac_chan == NULL); 1677 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1678 acactive->ac_mce = NULL; 1679 IBD_ACACHE_INSERT_FREE(state, acactive); 1680 } 1681 1682 } 1683 return (ret); 1684 } 1685 1686 /* 1687 * Helper function for async path record lookup. If we are trying to 1688 * Tx to a MCG, check our membership, possibly trying to join the 1689 * group if required. If that fails, try to send the packet to the 1690 * all router group (indicated by the redirect output), pointing 1691 * the input mac address to the router mcg address. 1692 */ 1693 static ibd_mce_t * 1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1695 { 1696 ib_gid_t mgid; 1697 ibd_mce_t *mce; 1698 ipoib_mac_t routermac; 1699 1700 *redirect = B_FALSE; 1701 ibd_n2h_gid(mac, &mgid); 1702 1703 /* 1704 * Check the FullMember+SendOnlyNonMember list. 1705 * Since we are the only one who manipulates the 1706 * id_mc_full list, no locks are needed. 1707 */ 1708 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1709 if (mce != NULL) { 1710 DPRINT(4, "ibd_async_mcache : already joined to group"); 1711 return (mce); 1712 } 1713 1714 /* 1715 * Not found; try to join(SendOnlyNonMember) and attach. 1716 */ 1717 DPRINT(4, "ibd_async_mcache : not joined to group"); 1718 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1719 NULL) { 1720 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1721 return (mce); 1722 } 1723 1724 /* 1725 * MCGroup not present; try to join the all-router group. If 1726 * any of the following steps succeed, we will be redirecting 1727 * to the all router group. 1728 */ 1729 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1730 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1731 return (NULL); 1732 *redirect = B_TRUE; 1733 ibd_n2h_gid(&routermac, &mgid); 1734 bcopy(&routermac, mac, IPOIB_ADDRL); 1735 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1736 mgid.gid_prefix, mgid.gid_guid); 1737 1738 /* 1739 * Are we already joined to the router group? 1740 */ 1741 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1742 DPRINT(4, "ibd_async_mcache : using already joined router" 1743 "group\n"); 1744 return (mce); 1745 } 1746 1747 /* 1748 * Can we join(SendOnlyNonMember) the router group? 1749 */ 1750 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1751 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1752 NULL) { 1753 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1754 return (mce); 1755 } 1756 1757 return (NULL); 1758 } 1759 1760 /* 1761 * Async path record lookup code. 1762 */ 1763 static void 1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1765 { 1766 ibd_ace_t *ce; 1767 ibd_mce_t *mce = NULL; 1768 ibt_path_attr_t path_attr; 1769 ibt_path_info_t path_info; 1770 ib_gid_t destgid; 1771 char ret = IBD_OP_NOTSTARTED; 1772 1773 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1774 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1775 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1776 htonl(mac->ipoib_gidsuff[1])); 1777 1778 /* 1779 * Check whether we are trying to transmit to a MCG. 1780 * In that case, we need to make sure we are a member of 1781 * the MCG. 
1782 */ 1783 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1784 boolean_t redirected; 1785 1786 /* 1787 * If we can not find or join the group or even 1788 * redirect, error out. 1789 */ 1790 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1791 NULL) { 1792 state->id_ah_op = IBD_OP_ERRORED; 1793 return; 1794 } 1795 1796 /* 1797 * If we got redirected, we need to determine whether 1798 * the AH for the new mcg is in the cache already, and 1799 * not pull it in then; otherwise proceed to get the 1800 * path for the new mcg. There is no guarantee that 1801 * if the AH is currently in the cache, it will still be 1802 * there when we look in ibd_acache_lookup(), but that's 1803 * okay, we will come back here. 1804 */ 1805 if (redirected) { 1806 ret = IBD_OP_ROUTERED; 1807 DPRINT(4, "ibd_async_acache : redirected to " 1808 "%08X:%08X:%08X:%08X:%08X", 1809 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1810 htonl(mac->ipoib_gidpref[1]), 1811 htonl(mac->ipoib_gidsuff[0]), 1812 htonl(mac->ipoib_gidsuff[1])); 1813 1814 mutex_enter(&state->id_ac_mutex); 1815 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1816 state->id_ah_op = IBD_OP_ROUTERED; 1817 mutex_exit(&state->id_ac_mutex); 1818 DPRINT(4, "ibd_async_acache : router AH found"); 1819 return; 1820 } 1821 mutex_exit(&state->id_ac_mutex); 1822 } 1823 } 1824 1825 /* 1826 * Get an AH from the free list. 1827 */ 1828 mutex_enter(&state->id_ac_mutex); 1829 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1830 /* 1831 * No free ones; try to grab an unreferenced active 1832 * one. Maybe we need to make the active list LRU, 1833 * but that will create more work for Tx callbacks. 1834 * Is there a way of not having to pull out the 1835 * entry from the active list, but just indicate it 1836 * is being recycled? Yes, but that creates one more 1837 * check in the fast lookup path. 1838 */ 1839 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1840 /* 1841 * Pretty serious shortage now. 1842 */ 1843 state->id_ah_op = IBD_OP_NOTSTARTED; 1844 mutex_exit(&state->id_ac_mutex); 1845 DPRINT(10, "ibd_async_acache : failed to find AH " 1846 "slot\n"); 1847 return; 1848 } 1849 /* 1850 * We could check whether ac_mce points to a SendOnly 1851 * member and drop that membership now. Or do it lazily 1852 * at detach time. 1853 */ 1854 ce->ac_mce = NULL; 1855 } 1856 mutex_exit(&state->id_ac_mutex); 1857 ASSERT(ce->ac_mce == NULL); 1858 1859 /* 1860 * Update the entry. 1861 */ 1862 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1863 1864 bzero(&path_info, sizeof (path_info)); 1865 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1866 path_attr.pa_sgid = state->id_sgid; 1867 path_attr.pa_num_dgids = 1; 1868 ibd_n2h_gid(&ce->ac_mac, &destgid); 1869 path_attr.pa_dgids = &destgid; 1870 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1871 path_attr.pa_pkey = state->id_pkey; 1872 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1, 1873 &path_info, NULL) != IBT_SUCCESS) { 1874 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1875 goto error; 1876 } 1877 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1878 ntohl(ce->ac_mac.ipoib_qpn), 1879 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1880 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1881 goto error; 1882 } 1883 1884 /* 1885 * mce is set whenever an AH is being associated with a 1886 * MCG; this will come in handy when we leave the MCG. The 1887 * lock protects Tx fastpath from scanning the active list. 
1888 */ 1889 if (mce != NULL) 1890 ce->ac_mce = mce; 1891 1892 /* 1893 * initiate a RC mode connection for unicast address 1894 */ 1895 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1896 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1897 ASSERT(ce->ac_chan == NULL); 1898 DPRINT(10, "ibd_async_acache: call " 1899 "ibd_rc_try_connect(ace=%p)", ce); 1900 ibd_rc_try_connect(state, ce, &path_info); 1901 if (ce->ac_chan == NULL) { 1902 DPRINT(10, "ibd_async_acache: fail to setup RC" 1903 " channel"); 1904 state->rc_conn_fail++; 1905 goto error; 1906 } 1907 } 1908 1909 mutex_enter(&state->id_ac_mutex); 1910 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1911 state->id_ah_op = ret; 1912 mutex_exit(&state->id_ac_mutex); 1913 return; 1914 error: 1915 /* 1916 * We might want to drop SendOnly membership here if we 1917 * joined above. The lock protects Tx callbacks inserting 1918 * into the free list. 1919 */ 1920 mutex_enter(&state->id_ac_mutex); 1921 state->id_ah_op = IBD_OP_ERRORED; 1922 IBD_ACACHE_INSERT_FREE(state, ce); 1923 mutex_exit(&state->id_ac_mutex); 1924 } 1925 1926 /* 1927 * While restoring port's presence on the subnet on a port up, it is possible 1928 * that the port goes down again. 1929 */ 1930 static void 1931 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1932 { 1933 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1934 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1935 LINK_STATE_UP; 1936 ibd_mce_t *mce, *pmce; 1937 ibd_ace_t *ace, *pace; 1938 1939 DPRINT(10, "ibd_async_link(): %d", opcode); 1940 1941 /* 1942 * On a link up, revalidate the link speed/width. No point doing 1943 * this on a link down, since we will be unable to do SA operations, 1944 * defaulting to the lowest speed. Also notice that we update our 1945 * notion of speed before calling mac_link_update(), which will do 1946 * necessary higher level notifications for speed changes. 1947 */ 1948 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1949 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1950 state->id_link_speed = ibd_get_portspeed(state); 1951 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1952 } 1953 1954 /* 1955 * Do all the work required to establish our presence on 1956 * the subnet. 1957 */ 1958 if (opcode == IBD_LINK_UP_ABSENT) { 1959 /* 1960 * If in promiscuous mode ... 1961 */ 1962 if (state->id_prom_op == IBD_OP_COMPLETED) { 1963 /* 1964 * Drop all nonmembership. 1965 */ 1966 ibd_async_unsetprom(state); 1967 1968 /* 1969 * Then, try to regain nonmembership to all mcg's. 1970 */ 1971 ibd_async_setprom(state); 1972 1973 } 1974 1975 /* 1976 * Drop all sendonly membership (which also gets rid of the 1977 * AHs); try to reacquire all full membership. 1978 */ 1979 mce = list_head(&state->id_mc_full); 1980 while ((pmce = mce) != NULL) { 1981 mce = list_next(&state->id_mc_full, mce); 1982 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1983 ibd_leave_group(state, 1984 pmce->mc_info.mc_adds_vect.av_dgid, 1985 IB_MC_JSTATE_SEND_ONLY_NON); 1986 else 1987 ibd_reacquire_group(state, pmce); 1988 } 1989 1990 /* 1991 * Recycle all active AHs to free list (and if there are 1992 * pending posts, make sure they will go into the free list 1993 * once the Tx's complete). Grab the lock to prevent 1994 * concurrent Tx's as well as Tx cleanups. 
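* Note that RC-mode aces are handled differently in the loop below: their connected channel has to be torn down first, so they are signalled for an active close (via ibd_rc_signal_act_close()) rather than being recycled in place.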
1995 */ 1996 mutex_enter(&state->id_ac_mutex); 1997 ace = list_head(&state->id_ah_active); 1998 while ((pace = ace) != NULL) { 1999 boolean_t cycled; 2000 2001 ace = list_next(&state->id_ah_active, ace); 2002 mce = pace->ac_mce; 2003 if (pace->ac_chan != NULL) { 2004 ASSERT(mce == NULL); 2005 ASSERT(state->id_enable_rc == B_TRUE); 2006 if (pace->ac_chan->chan_state == 2007 IBD_RC_STATE_ACT_ESTAB) { 2008 INC_REF(pace, 1); 2009 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 2010 pace->ac_chan->chan_state = 2011 IBD_RC_STATE_ACT_CLOSING; 2012 ibd_rc_signal_act_close(state, pace); 2013 } else { 2014 state->rc_act_close_simultaneous++; 2015 DPRINT(40, "ibd_async_link: other " 2016 "thread is closing it, ace=%p, " 2017 "ac_chan=%p, chan_state=%d", 2018 pace, pace->ac_chan, 2019 pace->ac_chan->chan_state); 2020 } 2021 } else { 2022 cycled = ibd_acache_recycle(state, 2023 &pace->ac_mac, B_TRUE); 2024 } 2025 /* 2026 * If this is for an mcg, it must be for a fullmember, 2027 * since we got rid of send-only members above when 2028 * processing the mce list. 2029 */ 2030 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2031 IB_MC_JSTATE_FULL))); 2032 2033 /* 2034 * Check if the fullmember mce needs to be torn down, 2035 * ie whether the DLPI disable has already been done. 2036 * If so, do some of the work of tx_cleanup, namely 2037 * causing leave (which will fail), detach and 2038 * mce-freeing. tx_cleanup will put the AH into free 2039 * list. The reason to duplicate some of this 2040 * tx_cleanup work is because we want to delete the 2041 * AH right now instead of waiting for tx_cleanup, to 2042 * force subsequent Tx's to reacquire an AH. 2043 */ 2044 if ((mce != NULL) && (mce->mc_fullreap)) 2045 ibd_async_reap_group(state, mce, 2046 mce->mc_info.mc_adds_vect.av_dgid, 2047 mce->mc_jstate); 2048 } 2049 mutex_exit(&state->id_ac_mutex); 2050 } 2051 2052 /* 2053 * mac handle is guaranteed to exist since driver does ibt_close_hca() 2054 * (which stops further events from being delivered) before 2055 * mac_unregister(). At this point, it is guaranteed that mac_register 2056 * has already been done. 2057 */ 2058 mutex_enter(&state->id_link_mutex); 2059 state->id_link_state = lstate; 2060 mac_link_update(state->id_mh, lstate); 2061 mutex_exit(&state->id_link_mutex); 2062 2063 ibd_async_done(state); 2064 } 2065 2066 /* 2067 * Check the pkey table to see if we can find the pkey we're looking for. 2068 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 2069 * failure. 2070 */ 2071 static int 2072 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 2073 uint16_t *pkix) 2074 { 2075 uint16_t ndx; 2076 2077 ASSERT(pkix != NULL); 2078 2079 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 2080 if (pkey_tbl[ndx] == pkey) { 2081 *pkix = ndx; 2082 return (0); 2083 } 2084 } 2085 return (-1); 2086 } 2087 2088 /* 2089 * Late HCA Initialization: 2090 * If plumb had succeeded without the availability of an active port or the 2091 * pkey, and either of their availability is now being indicated via PORT_UP 2092 * or PORT_CHANGE respectively, try a start of the interface. 2093 * 2094 * Normal Operation: 2095 * When the link is notified up, we need to do a few things, based 2096 * on the port's current p_init_type_reply claiming a reinit has been 2097 * done or not. The reinit steps are: 2098 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2099 * the old Pkey and GID0 are correct. 2100 * 2. Register for mcg traps (already done by ibmf). 2101 * 3. 
If PreservePresenceReply indicates the SM has restored port's presence 2102 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2103 * 4. Give up all sendonly memberships. 2104 * 5. Acquire all full memberships. 2105 * 6. In promiscuous mode, acquire all non memberships. 2106 * 7. Recycle all AHs to free list. 2107 */ 2108 static void 2109 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2110 { 2111 ibt_hca_portinfo_t *port_infop = NULL; 2112 ibt_status_t ibt_status; 2113 uint_t psize, port_infosz; 2114 ibd_link_op_t opcode; 2115 ibd_req_t *req; 2116 link_state_t new_link_state = LINK_STATE_UP; 2117 uint8_t itreply; 2118 uint16_t pkix; 2119 int ret; 2120 2121 /* 2122 * Let's not race with a plumb or an unplumb; if we detect a 2123 * pkey relocation event later on here, we may have to restart. 2124 */ 2125 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2126 2127 mutex_enter(&state->id_link_mutex); 2128 2129 /* 2130 * If the link state is unknown, a plumb has not yet been attempted 2131 * on the interface. Nothing to do. 2132 */ 2133 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2134 mutex_exit(&state->id_link_mutex); 2135 goto link_mod_return; 2136 } 2137 2138 /* 2139 * If the link state is down because of a plumb failure, and we are not 2140 * in late HCA init, and we were not successfully plumbed, nothing to do. 2141 */ 2142 if ((state->id_link_state == LINK_STATE_DOWN) && 2143 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) && 2144 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) { 2145 mutex_exit(&state->id_link_mutex); 2146 goto link_mod_return; 2147 } 2148 2149 /* 2150 * If this routine was called in response to a port down event, 2151 * we just need to decide whether the new state should be reported. 2152 */ 2153 if (code == IBT_ERROR_PORT_DOWN) { 2154 new_link_state = LINK_STATE_DOWN; 2155 goto update_link_state; 2156 } 2157 2158 /* 2159 * If it's not a port down event we've received, try to get the port 2160 * attributes first. If we fail here, the port is as good as down. 2161 * Otherwise, if the link went down by the time the handler gets 2162 * here, give up - we cannot even validate the pkey/gid since those 2163 * are not valid and this is as bad as a port down anyway. 2164 */ 2165 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2166 &port_infop, &psize, &port_infosz); 2167 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2168 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2169 new_link_state = LINK_STATE_DOWN; 2170 goto update_link_state; 2171 } 2172 2173 /* 2174 * If, in the previous attempt, the pkey was not found either due to the 2175 * port state being down or due to its absence in the pkey table, 2176 * look for it now and try to start the interface. 2177 */ 2178 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) { 2179 mutex_exit(&state->id_link_mutex); 2180 if ((ret = ibd_start(state)) != 0) { 2181 DPRINT(10, "ibd_linkmod: cannot start from late HCA " 2182 "init, ret=%d", ret); 2183 } 2184 ibt_free_portinfo(port_infop, port_infosz); 2185 goto link_mod_return; 2186 } 2187 2188 /* 2189 * Check the SM InitTypeReply flags. If both NoLoadReply and 2190 * PreserveContentReply are 0, we don't know anything about the 2191 * data loaded into the port attributes, so we need to verify 2192 * if gid0 and pkey are still valid.
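* The checks below distinguish: the subnet prefix changed or the pkey vanished (report link down), the pkey is still at the same index (report link up), or the pkey moved to a different index (in which case the interface is restarted).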
2193 */ 2194 itreply = port_infop->p_init_type_reply; 2195 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2196 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2197 /* 2198 * Check to see if the subnet part of GID0 has changed. If 2199 * not, check the simple case first to see if the pkey 2200 * index is the same as before; finally check to see if the 2201 * pkey has been relocated to a different index in the table. 2202 */ 2203 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2204 if (bcmp(port_infop->p_sgid_tbl, 2205 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2206 2207 new_link_state = LINK_STATE_DOWN; 2208 2209 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2210 state->id_pkey) { 2211 2212 new_link_state = LINK_STATE_UP; 2213 2214 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2215 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2216 2217 ibt_free_portinfo(port_infop, port_infosz); 2218 mutex_exit(&state->id_link_mutex); 2219 2220 /* 2221 * Currently a restart is required if our pkey has moved 2222 * in the pkey table. If we get the ibt_recycle_ud() to 2223 * work as documented (expected), we may be able to 2224 * avoid a complete restart. Note that we've already 2225 * marked both the start and stop 'in-progress' flags, 2226 * so it is ok to go ahead and do this restart. 2227 */ 2228 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2229 if ((ret = ibd_start(state)) != 0) { 2230 DPRINT(10, "ibd_restart: cannot restart, " 2231 "ret=%d", ret); 2232 } 2233 2234 goto link_mod_return; 2235 } else { 2236 new_link_state = LINK_STATE_DOWN; 2237 } 2238 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2239 } 2240 2241 update_link_state: 2242 if (port_infop) { 2243 ibt_free_portinfo(port_infop, port_infosz); 2244 } 2245 2246 /* 2247 * If we're reporting a link up, check InitTypeReply to see if 2248 * the SM has ensured that the port's presence in mcg, traps, 2249 * etc. is intact. 2250 */ 2251 if (new_link_state == LINK_STATE_DOWN) { 2252 opcode = IBD_LINK_DOWN; 2253 } else { 2254 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2255 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2256 opcode = IBD_LINK_UP; 2257 } else { 2258 opcode = IBD_LINK_UP_ABSENT; 2259 } 2260 } 2261 2262 /* 2263 * If the old state is the same as the new state, and the SM indicated 2264 * no change in the port parameters, nothing to do. 2265 */ 2266 if ((state->id_link_state == new_link_state) && (opcode != 2267 IBD_LINK_UP_ABSENT)) { 2268 mutex_exit(&state->id_link_mutex); 2269 goto link_mod_return; 2270 } 2271 2272 /* 2273 * Ok, so there was a link state change; see if it's safe to ask 2274 * the async thread to do the work 2275 */ 2276 if (!ibd_async_safe(state)) { 2277 state->id_link_state = new_link_state; 2278 mutex_exit(&state->id_link_mutex); 2279 goto link_mod_return; 2280 } 2281 2282 mutex_exit(&state->id_link_mutex); 2283 2284 /* 2285 * Queue up a request for ibd_async_link() to handle this link 2286 * state change event 2287 */ 2288 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2289 req->rq_ptr = (void *)opcode; 2290 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2291 2292 link_mod_return: 2293 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2294 } 2295 2296 /* 2297 * For the port up/down events, IBTL guarantees there will not be concurrent 2298 * invocations of the handler. 
IBTL might coalesce link transition events, 2299 * and not invoke the handler for _each_ up/down transition, but it will 2300 * invoke the handler with last known state 2301 */ 2302 static void 2303 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2304 ibt_async_code_t code, ibt_async_event_t *event) 2305 { 2306 ibd_state_t *state = (ibd_state_t *)clnt_private; 2307 2308 switch (code) { 2309 case IBT_ERROR_CATASTROPHIC_CHAN: 2310 ibd_print_warn(state, "catastrophic channel error"); 2311 break; 2312 case IBT_ERROR_CQ: 2313 ibd_print_warn(state, "completion queue error"); 2314 break; 2315 case IBT_PORT_CHANGE_EVENT: 2316 /* 2317 * Events will be delivered to all instances that have 2318 * done ibt_open_hca() but not yet done ibt_close_hca(). 2319 * Only need to do work for our port; IBTF will deliver 2320 * events for other ports on the hca we have ibt_open_hca'ed 2321 * too. Note that id_port is initialized in ibd_attach() 2322 * before we do an ibt_open_hca() in ibd_attach(). 2323 */ 2324 ASSERT(state->id_hca_hdl == hca_hdl); 2325 if (state->id_port != event->ev_port) 2326 break; 2327 2328 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2329 IBT_PORT_CHANGE_PKEY) { 2330 ibd_link_mod(state, code); 2331 } 2332 break; 2333 case IBT_ERROR_PORT_DOWN: 2334 case IBT_CLNT_REREG_EVENT: 2335 case IBT_EVENT_PORT_UP: 2336 /* 2337 * Events will be delivered to all instances that have 2338 * done ibt_open_hca() but not yet done ibt_close_hca(). 2339 * Only need to do work for our port; IBTF will deliver 2340 * events for other ports on the hca we have ibt_open_hca'ed 2341 * too. Note that id_port is initialized in ibd_attach() 2342 * before we do an ibt_open_hca() in ibd_attach(). 2343 */ 2344 ASSERT(state->id_hca_hdl == hca_hdl); 2345 if (state->id_port != event->ev_port) 2346 break; 2347 2348 ibd_link_mod(state, code); 2349 break; 2350 2351 case IBT_HCA_ATTACH_EVENT: 2352 case IBT_HCA_DETACH_EVENT: 2353 /* 2354 * When a new card is plugged to the system, attach_event is 2355 * invoked. Additionally, a cfgadm needs to be run to make the 2356 * card known to the system, and an ifconfig needs to be run to 2357 * plumb up any ibd interfaces on the card. In the case of card 2358 * unplug, a cfgadm is run that will trigger any RCM scripts to 2359 * unplumb the ibd interfaces on the card; when the card is 2360 * actually unplugged, the detach_event is invoked; 2361 * additionally, if any ibd instances are still active on the 2362 * card (eg there were no associated RCM scripts), driver's 2363 * detach routine is invoked. 2364 */ 2365 break; 2366 default: 2367 break; 2368 } 2369 } 2370 2371 static int 2372 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2373 { 2374 mac_register_t *macp; 2375 int ret; 2376 2377 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2378 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2379 return (DDI_FAILURE); 2380 } 2381 2382 /* 2383 * Note that when we register with mac during attach, we don't 2384 * have the id_macaddr yet, so we'll simply be registering a 2385 * zero macaddr that we'll overwrite later during plumb (in 2386 * ibd_m_start()). Similar is the case with id_mtu - we'll 2387 * update the mac layer with the correct mtu during plumb. 
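* (The address eventually published is the 20-byte IPoIB hardware address, ipoib_mac_t: the 4-byte QPN field followed by the 16-byte port GID.)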
2388 */ 2389 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2390 macp->m_driver = state; 2391 macp->m_dip = dip; 2392 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2393 macp->m_callbacks = &ibd_m_callbacks; 2394 macp->m_min_sdu = 0; 2395 macp->m_multicast_sdu = IBD_DEF_MAX_SDU; 2396 if (state->id_type == IBD_PORT_DRIVER) { 2397 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 2398 } else if (state->id_enable_rc) { 2399 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2400 } else { 2401 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2402 } 2403 macp->m_priv_props = ibd_priv_props; 2404 2405 /* 2406 * Register ourselves with the GLDv3 interface 2407 */ 2408 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2409 mac_free(macp); 2410 DPRINT(10, 2411 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2412 return (DDI_FAILURE); 2413 } 2414 2415 mac_free(macp); 2416 return (DDI_SUCCESS); 2417 } 2418 2419 static int 2420 ibd_record_capab(ibd_state_t *state) 2421 { 2422 ibt_hca_attr_t hca_attrs; 2423 ibt_status_t ibt_status; 2424 2425 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2426 2427 /* 2428 * Query the HCA and fetch its attributes 2429 */ 2430 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2431 ASSERT(ibt_status == IBT_SUCCESS); 2432 2433 /* 2434 * 1. Set the Hardware Checksum capability. Currently we only consider 2435 * full checksum offload. 2436 */ 2437 if (state->id_enable_rc) { 2438 state->id_hwcksum_capab = 0; 2439 } else { 2440 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2441 == IBT_HCA_CKSUM_FULL) { 2442 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2443 } 2444 } 2445 2446 /* 2447 * 2. Set LSO policy, capability and maximum length 2448 */ 2449 if (state->id_enable_rc) { 2450 state->id_lso_capable = B_FALSE; 2451 state->id_lso_maxlen = 0; 2452 } else { 2453 if (hca_attrs.hca_max_lso_size > 0) { 2454 state->id_lso_capable = B_TRUE; 2455 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2456 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2457 else 2458 state->id_lso_maxlen = 2459 hca_attrs.hca_max_lso_size; 2460 } else { 2461 state->id_lso_capable = B_FALSE; 2462 state->id_lso_maxlen = 0; 2463 } 2464 } 2465 2466 /* 2467 * 3. Set Reserved L_Key capability 2468 */ 2469 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2470 state->id_hca_res_lkey_capab = 1; 2471 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2472 state->rc_enable_iov_map = B_TRUE; 2473 } else { 2474 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2475 state->rc_enable_iov_map = B_FALSE; 2476 } 2477 2478 /* 2479 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2480 * size information is provided by the hca 2481 */ 2482 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2483 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2484 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2485 } else { 2486 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2487 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2488 } 2489 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2490 state->id_max_sqseg = IBD_MAX_SQSEG; 2491 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2492 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2493 state->id_max_sqseg, IBD_MAX_SQSEG); 2494 } 2495 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2496 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2497 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2498 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2499 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2500 } 2501 2502 /* 2503 * Translating the virtual address regions into physical regions 2504 * for using the Reserved LKey feature results in a wr sgl that 2505 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2506 * we'll fix a high-water mark (65%) for when we should stop. 2507 */ 2508 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2509 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2510 2511 /* 2512 * 5. Set number of recv and send wqes after checking hca maximum 2513 * channel size. Store the max channel size in the state so that it 2514 * can be referred to when the swqe/rwqe change is requested via 2515 * dladm. 2516 */ 2517 2518 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; 2519 2520 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) 2521 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; 2522 2523 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - 2524 IBD_RWQE_MIN; 2525 2526 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) 2527 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; 2528 2529 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2530 2531 return (DDI_SUCCESS); 2532 } 2533 2534 static int 2535 ibd_part_busy(ibd_state_t *state) 2536 { 2537 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2538 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n"); 2539 return (DDI_FAILURE); 2540 } 2541 2542 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2543 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n"); 2544 return (DDI_FAILURE); 2545 } 2546 2547 /* 2548 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is 2549 * connecting to a remote IPoIB port. We can't remove this port. 
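* A DDI_FAILURE return from any of these checks lets the caller back out of the removal rather than tear down state that is still in use.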
2550 */ 2551 if (state->id_ah_op == IBD_OP_ONGOING) { 2552 DPRINT(10, "ibd_part_busy: failed: connecting\n"); 2553 return (DDI_FAILURE); 2554 } 2555 2556 return (DDI_SUCCESS); 2557 } 2558 2559 2560 static void 2561 ibd_part_unattach(ibd_state_t *state) 2562 { 2563 uint32_t progress = state->id_mac_state; 2564 ibt_status_t ret; 2565 2566 /* make sure rx resources are freed */ 2567 ibd_free_rx_rsrcs(state); 2568 2569 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2570 ASSERT(state->id_enable_rc); 2571 ibd_rc_fini_srq_list(state); 2572 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2573 } 2574 2575 if (progress & IBD_DRV_MAC_REGISTERED) { 2576 (void) mac_unregister(state->id_mh); 2577 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2578 } 2579 2580 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 2581 /* 2582 * No new async requests will be posted since the device 2583 * link state has been marked as unknown; completion handlers 2584 * have been turned off, so the Tx handler will not cause any 2585 * more IBD_ASYNC_REAP requests. 2586 * 2587 * Queue a request for the async thread to exit, which will 2588 * be serviced after any pending ones. This can take a while, 2589 * especially if the SM is unreachable, since IBMF will slowly 2590 * timeout each SM request issued by the async thread. Reap 2591 * the thread before continuing on; we do not want it to be 2592 * lingering in modunloaded code. 2593 */ 2594 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 2595 thread_join(state->id_async_thrid); 2596 2597 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 2598 } 2599 2600 if (progress & IBD_DRV_REQ_LIST_INITED) { 2601 list_destroy(&state->id_req_list); 2602 mutex_destroy(&state->id_acache_req_lock); 2603 cv_destroy(&state->id_acache_req_cv); 2604 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED; 2605 } 2606 2607 if (progress & IBD_DRV_PD_ALLOCD) { 2608 if ((ret = ibt_free_pd(state->id_hca_hdl, 2609 state->id_pd_hdl)) != IBT_SUCCESS) { 2610 ibd_print_warn(state, "failed to free " 2611 "protection domain, ret=%d", ret); 2612 } 2613 state->id_pd_hdl = NULL; 2614 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2615 } 2616 2617 if (progress & IBD_DRV_HCA_OPENED) { 2618 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2619 IBT_SUCCESS) { 2620 ibd_print_warn(state, "failed to close " 2621 "HCA device, ret=%d", ret); 2622 } 2623 state->id_hca_hdl = NULL; 2624 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2625 } 2626 2627 mutex_enter(&ibd_gstate.ig_mutex); 2628 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2629 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2630 IBT_SUCCESS) { 2631 ibd_print_warn(state, 2632 "ibt_detach() failed, ret=%d", ret); 2633 } 2634 state->id_ibt_hdl = NULL; 2635 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2636 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2637 } 2638 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2639 (ibd_gstate.ig_ibt_hdl != NULL)) { 2640 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2641 IBT_SUCCESS) { 2642 ibd_print_warn(state, "ibt_detach(): global " 2643 "failed, ret=%d", ret); 2644 } 2645 ibd_gstate.ig_ibt_hdl = NULL; 2646 } 2647 mutex_exit(&ibd_gstate.ig_mutex); 2648 2649 if (progress & IBD_DRV_TXINTR_ADDED) { 2650 ddi_remove_softintr(state->id_tx); 2651 state->id_tx = NULL; 2652 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2653 } 2654 2655 if (progress & IBD_DRV_RXINTR_ADDED) { 2656 ddi_remove_softintr(state->id_rx); 2657 state->id_rx = NULL; 2658 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2659 } 2660 2661 #ifdef DEBUG 2662 if (progress &
IBD_DRV_RC_PRIVATE_STATE) { 2663 kstat_delete(state->rc_ksp); 2664 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2665 } 2666 #endif 2667 2668 if (progress & IBD_DRV_STATE_INITIALIZED) { 2669 ibd_state_fini(state); 2670 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2671 } 2672 } 2673 2674 int 2675 ibd_part_attach(ibd_state_t *state, dev_info_t *dip) 2676 { 2677 ibt_status_t ret; 2678 int rv; 2679 kthread_t *kht; 2680 2681 /* 2682 * Initialize mutexes and condition variables 2683 */ 2684 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2685 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()"); 2686 return (DDI_FAILURE); 2687 } 2688 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2689 2690 /* 2691 * Allocate rx,tx softintr 2692 */ 2693 if (ibd_rx_softintr == 1) { 2694 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2695 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2696 DPRINT(10, "ibd_part_attach: failed in " 2697 "ddi_add_softintr(id_rx), ret=%d", rv); 2698 return (DDI_FAILURE); 2699 } 2700 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2701 } 2702 if (ibd_tx_softintr == 1) { 2703 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2704 NULL, NULL, ibd_tx_recycle, 2705 (caddr_t)state)) != DDI_SUCCESS) { 2706 DPRINT(10, "ibd_part_attach: failed in " 2707 "ddi_add_softintr(id_tx), ret=%d", rv); 2708 return (DDI_FAILURE); 2709 } 2710 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2711 } 2712 2713 /* 2714 * Attach to IBTL 2715 */ 2716 mutex_enter(&ibd_gstate.ig_mutex); 2717 if (ibd_gstate.ig_ibt_hdl == NULL) { 2718 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2719 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2720 DPRINT(10, "ibd_part_attach: global: failed in " 2721 "ibt_attach(), ret=%d", ret); 2722 mutex_exit(&ibd_gstate.ig_mutex); 2723 return (DDI_FAILURE); 2724 } 2725 } 2726 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2727 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2728 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d", 2729 ret); 2730 mutex_exit(&ibd_gstate.ig_mutex); 2731 return (DDI_FAILURE); 2732 } 2733 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2734 mutex_exit(&ibd_gstate.ig_mutex); 2735 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2736 2737 /* 2738 * Open the HCA 2739 */ 2740 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 2741 &state->id_hca_hdl)) != IBT_SUCCESS) { 2742 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d", 2743 ret); 2744 return (DDI_FAILURE); 2745 } 2746 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2747 2748 #ifdef DEBUG 2749 /* Initialize Driver Counters for Reliable Connected Mode */ 2750 if (state->id_enable_rc) { 2751 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2752 DPRINT(10, "ibd_part_attach: failed in " 2753 "ibd_rc_init_stats"); 2754 return (DDI_FAILURE); 2755 } 2756 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2757 } 2758 #endif 2759 2760 /* 2761 * Record capabilities 2762 */ 2763 (void) ibd_record_capab(state); 2764 2765 /* 2766 * Allocate a protection domain on the HCA 2767 */ 2768 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2769 &state->id_pd_hdl)) != IBT_SUCCESS) { 2770 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d", 2771 ret); 2772 return (DDI_FAILURE); 2773 } 2774 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2775 2776 2777 /* 2778 * We need to initialise the req_list that is required for the 2779 * operation of the async_thread. 
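* Requests are ibd_req_t entries queued via ibd_queue_work_slot() and drained by ibd_async_work() in the thread created just below.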
2780 */ 2781 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 2782 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 2783 list_create(&state->id_req_list, sizeof (ibd_req_t), 2784 offsetof(ibd_req_t, rq_list)); 2785 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; 2786 2787 /* 2788 * Create the async thread; thread_create never fails. 2789 */ 2790 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 2791 TS_RUN, minclsyspri); 2792 state->id_async_thrid = kht->t_did; 2793 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 2794 2795 return (DDI_SUCCESS); 2796 } 2797 2798 /* 2799 * Attach device to the IO framework. 2800 */ 2801 static int 2802 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2803 { 2804 int ret; 2805 2806 switch (cmd) { 2807 case DDI_ATTACH: 2808 ret = ibd_port_attach(dip); 2809 break; 2810 default: 2811 ret = DDI_FAILURE; 2812 break; 2813 } 2814 return (ret); 2815 } 2816 2817 /* 2818 * Detach device from the IO framework. 2819 */ 2820 static int 2821 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2822 { 2823 ibd_state_t *state; 2824 int instance; 2825 2826 /* 2827 * IBD doesn't support suspend/resume 2828 */ 2829 if (cmd != DDI_DETACH) 2830 return (DDI_FAILURE); 2831 2832 /* 2833 * Get the instance softstate 2834 */ 2835 instance = ddi_get_instance(dip); 2836 state = ddi_get_soft_state(ibd_list, instance); 2837 2838 /* 2839 * Release all resources we're holding still. Note that if we'd 2840 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2841 * so far, we should find all the flags we need in id_mac_state. 2842 */ 2843 return (ibd_port_unattach(state, dip)); 2844 } 2845 2846 /* 2847 * Pre ibt_attach() driver initialization 2848 */ 2849 static int 2850 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2851 { 2852 char buf[64]; 2853 2854 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2855 state->id_link_state = LINK_STATE_UNKNOWN; 2856 2857 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2858 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2859 state->id_trap_stop = B_TRUE; 2860 state->id_trap_inprog = 0; 2861 2862 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2863 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2864 state->id_dip = dip; 2865 2866 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2867 2868 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2869 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2870 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2871 state->id_tx_busy = 0; 2872 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2873 2874 state->id_rx_list.dl_bufs_outstanding = 0; 2875 state->id_rx_list.dl_cnt = 0; 2876 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2877 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2878 (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip), 2879 state->id_pkey, state->id_plinkid); 2880 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2881 0, NULL, NULL, NULL, NULL, NULL, 0); 2882 2883 /* For Reliable Connected Mode */ 2884 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2885 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2886 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2887 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2888 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2889 
MUTEX_DRIVER, NULL); 2890 mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL); 2891 2892 /* 2893 * Make RC the default link mode. If this fails during connection 2894 * setup, the link mode is automatically transitioned to UD. 2895 * Also set the RC MTU. 2896 */ 2897 state->id_enable_rc = IBD_DEF_LINK_MODE; 2898 state->rc_mtu = IBD_DEF_RC_MAX_MTU; 2899 state->id_mtu = IBD_DEF_MAX_MTU; 2900 2901 /* Initialize all tunables to default */ 2902 state->id_lso_policy = IBD_DEF_LSO_POLICY; 2903 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS; 2904 state->id_num_ah = IBD_DEF_NUM_AH; 2905 state->id_hash_size = IBD_DEF_HASH_SIZE; 2906 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP; 2907 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS; 2908 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT; 2909 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC; 2910 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT; 2911 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC; 2912 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT; 2913 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC; 2914 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT; 2915 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC; 2916 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH; 2917 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH; 2918 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH; 2919 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE; 2920 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE; 2921 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE; 2922 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE; 2923 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ; 2924 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ; 2925 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH; 2926 2927 return (DDI_SUCCESS); 2928 } 2929 2930 /* 2931 * Post ibt_detach() driver deconstruction 2932 */ 2933 static void 2934 ibd_state_fini(ibd_state_t *state) 2935 { 2936 kmem_cache_destroy(state->id_req_kmc); 2937 2938 mutex_destroy(&state->id_rx_list.dl_mutex); 2939 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2940 2941 mutex_destroy(&state->id_txpost_lock); 2942 mutex_destroy(&state->id_tx_list.dl_mutex); 2943 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2944 mutex_destroy(&state->id_lso_lock); 2945 2946 mutex_destroy(&state->id_sched_lock); 2947 mutex_destroy(&state->id_scq_poll_lock); 2948 mutex_destroy(&state->id_rcq_poll_lock); 2949 2950 cv_destroy(&state->id_trap_cv); 2951 mutex_destroy(&state->id_trap_lock); 2952 mutex_destroy(&state->id_link_mutex); 2953 2954 /* For Reliable Connected Mode */ 2955 mutex_destroy(&state->rc_timeout_lock); 2956 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2957 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2958 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2959 mutex_destroy(&state->rc_tx_large_bufs_lock); 2960 mutex_destroy(&state->rc_rx_lock); 2961 } 2962 2963 /* 2964 * Fetch link speed from SA for snmp ifspeed reporting. 2965 */ 2966 static uint64_t 2967 ibd_get_portspeed(ibd_state_t *state) 2968 { 2969 int ret; 2970 ibt_path_info_t path; 2971 ibt_path_attr_t path_attr; 2972 uint8_t num_paths; 2973 uint64_t ifspeed; 2974 2975 /* 2976 * Due to the serdes 8b/10b encoding, 2.5 Gbps on the wire 2977 * translates to a 2 Gbps data rate. Thus, 1X single data rate is 2978 * 2000000000. Start with that as default.
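* The rate returned by the SA path query below scales this base value; e.g., a 4X DDR link (IBT_SRATE_20) is reported as 8 * 2 Gbps = 16 Gbps.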
2979 */ 2980 ifspeed = 2000000000; 2981 2982 bzero(&path_attr, sizeof (path_attr)); 2983 2984 /* 2985 * Get the port speed from Loopback path information. 2986 */ 2987 path_attr.pa_dgids = &state->id_sgid; 2988 path_attr.pa_num_dgids = 1; 2989 path_attr.pa_sgid = state->id_sgid; 2990 2991 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2992 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2993 goto earlydone; 2994 2995 if (num_paths < 1) 2996 goto earlydone; 2997 2998 /* 2999 * In case SA does not return an expected value, report the default 3000 * speed as 1X. 3001 */ 3002 ret = 1; 3003 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 3004 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 3005 ret = 1; 3006 break; 3007 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 3008 ret = 4; 3009 break; 3010 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 3011 ret = 12; 3012 break; 3013 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 3014 ret = 2; 3015 break; 3016 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 3017 ret = 8; 3018 break; 3019 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 3020 ret = 16; 3021 break; 3022 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 3023 ret = 24; 3024 break; 3025 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 3026 ret = 32; 3027 break; 3028 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 3029 ret = 48; 3030 break; 3031 } 3032 3033 ifspeed *= ret; 3034 3035 earlydone: 3036 return (ifspeed); 3037 } 3038 3039 /* 3040 * Search input mcg list (id_mc_full or id_mc_non) for an entry 3041 * representing the input mcg mgid. 3042 */ 3043 static ibd_mce_t * 3044 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 3045 { 3046 ibd_mce_t *ptr = list_head(mlist); 3047 3048 /* 3049 * Do plain linear search. 3050 */ 3051 while (ptr != NULL) { 3052 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 3053 sizeof (ib_gid_t)) == 0) 3054 return (ptr); 3055 ptr = list_next(mlist, ptr); 3056 } 3057 return (NULL); 3058 } 3059 3060 /* 3061 * Execute IBA JOIN. 3062 */ 3063 static ibt_status_t 3064 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 3065 { 3066 ibt_mcg_attr_t mcg_attr; 3067 3068 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3069 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 3070 mcg_attr.mc_mgid = mgid; 3071 mcg_attr.mc_join_state = mce->mc_jstate; 3072 mcg_attr.mc_scope = state->id_scope; 3073 mcg_attr.mc_pkey = state->id_pkey; 3074 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 3075 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 3076 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 3077 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 3078 NULL, NULL)); 3079 } 3080 3081 /* 3082 * This code JOINs the port in the proper way (depending on the join 3083 * state) so that IBA fabric will forward mcg packets to/from the port. 3084 * It also attaches the QPN to the mcg so it can receive those mcg 3085 * packets. This code makes sure not to attach the mcg to the QP if 3086 * that has been previously done due to the mcg being joined with a 3087 * different join state, even though this is not required by SWG_0216, 3088 * refid 3610. 
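* For example, a NON join while we are already a FULL member skips the ibt_attach_mcg() below, since the QP is already attached to that mcg.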
3089 */ 3090 static ibd_mce_t * 3091 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3092 { 3093 ibt_status_t ibt_status; 3094 ibd_mce_t *mce, *tmce, *omce = NULL; 3095 boolean_t do_attach = B_TRUE; 3096 3097 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 3098 jstate, mgid.gid_prefix, mgid.gid_guid); 3099 3100 /* 3101 * For enable_multicast Full member joins, we need to do some 3102 * extra work. If there is already an mce on the list that 3103 * indicates full membership, that means the membership has 3104 * not yet been dropped (since the disable_multicast was issued) 3105 * because there are pending Tx's to the mcg; in that case, just 3106 * mark the mce not to be reaped when the Tx completion queues 3107 * an async reap operation. 3108 * 3109 * If there is already an mce on the list indicating sendonly 3110 * membership, try to promote to full membership. Be careful 3111 * not to deallocate the old mce, since there might be an AH 3112 * pointing to it; instead, update the old mce with new data 3113 * that tracks the full membership. 3114 */ 3115 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 3116 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 3117 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 3118 ASSERT(omce->mc_fullreap); 3119 omce->mc_fullreap = B_FALSE; 3120 return (omce); 3121 } else { 3122 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3123 } 3124 } 3125 3126 /* 3127 * Allocate the ibd_mce_t to track this JOIN. 3128 */ 3129 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 3130 mce->mc_fullreap = B_FALSE; 3131 mce->mc_jstate = jstate; 3132 3133 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 3134 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 3135 ibt_status); 3136 kmem_free(mce, sizeof (ibd_mce_t)); 3137 return (NULL); 3138 } 3139 3140 /* 3141 * Is an IBA attach required? Not if the interface is already joined 3142 * to the mcg in a different appropriate join state. 3143 */ 3144 if (jstate == IB_MC_JSTATE_NON) { 3145 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3146 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3147 do_attach = B_FALSE; 3148 } else if (jstate == IB_MC_JSTATE_FULL) { 3149 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3150 do_attach = B_FALSE; 3151 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3152 do_attach = B_FALSE; 3153 } 3154 3155 if (do_attach) { 3156 /* 3157 * Do the IBA attach. 3158 */ 3159 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 3160 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 3161 &mce->mc_info)) != IBT_SUCCESS) { 3162 DPRINT(10, "ibd_join_group : failed qp attachment " 3163 "%d\n", ibt_status); 3164 /* 3165 * NOTE that we should probably preserve the join info 3166 * in the list and later try to leave again at detach 3167 * time. 3168 */ 3169 (void) ibt_leave_mcg(state->id_sgid, mgid, 3170 state->id_sgid, jstate); 3171 kmem_free(mce, sizeof (ibd_mce_t)); 3172 return (NULL); 3173 } 3174 } 3175 3176 /* 3177 * Insert the ibd_mce_t in the proper list. 3178 */ 3179 if (jstate == IB_MC_JSTATE_NON) { 3180 IBD_MCACHE_INSERT_NON(state, mce); 3181 } else { 3182 /* 3183 * Set up the mc_req fields used for reaping the 3184 * mcg in case of delayed tx completion (see 3185 * ibd_tx_cleanup()). Also done for sendonly join in 3186 * case we are promoted to fullmembership later and 3187 * keep using the same mce. 
3188 */ 3189 mce->mc_req.rq_gid = mgid; 3190 mce->mc_req.rq_ptr = mce; 3191 /* 3192 * Check whether this is the case of trying to join 3193 * full member, and we were already joined send only. 3194 * We try to drop our SendOnly membership, but it is 3195 * possible that the mcg does not exist anymore (and 3196 * the subnet trap never reached us), so the leave 3197 * operation might fail. 3198 */ 3199 if (omce != NULL) { 3200 (void) ibt_leave_mcg(state->id_sgid, mgid, 3201 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3202 omce->mc_jstate = IB_MC_JSTATE_FULL; 3203 bcopy(&mce->mc_info, &omce->mc_info, 3204 sizeof (ibt_mcg_info_t)); 3205 kmem_free(mce, sizeof (ibd_mce_t)); 3206 return (omce); 3207 } 3208 mutex_enter(&state->id_mc_mutex); 3209 IBD_MCACHE_INSERT_FULL(state, mce); 3210 mutex_exit(&state->id_mc_mutex); 3211 } 3212 3213 return (mce); 3214 } 3215 3216 /* 3217 * Called during port up event handling to attempt to reacquire full 3218 * membership to an mcg. Stripped down version of ibd_join_group(). 3219 * Note that it is possible that the mcg might have gone away, and 3220 * gets recreated at this point. 3221 */ 3222 static void 3223 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3224 { 3225 ib_gid_t mgid; 3226 3227 /* 3228 * If the mc_fullreap flag is set, or this join fails, a subsequent 3229 * reap/leave is going to try to leave the group. We could prevent 3230 * that by adding a boolean flag into ibd_mce_t, if required. 3231 */ 3232 if (mce->mc_fullreap) 3233 return; 3234 3235 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3236 3237 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3238 mgid.gid_guid); 3239 3240 /* While reacquiring, leave and then join the MCG */ 3241 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 3242 mce->mc_jstate); 3243 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3244 ibd_print_warn(state, "Failure on port up to rejoin " 3245 "multicast gid %016llx:%016llx", 3246 (u_longlong_t)mgid.gid_prefix, 3247 (u_longlong_t)mgid.gid_guid); 3248 } 3249 3250 /* 3251 * This code handles delayed Tx completion cleanups for mcg's to which 3252 * disable_multicast has been issued, regular mcg related cleanups during 3253 * disable_multicast, disable_promiscuous and mcg traps, as well as 3254 * cleanups during driver detach time. Depending on the join state, 3255 * it deletes the mce from the appropriate list and issues the IBA 3256 * leave/detach; except in the disable_multicast case when the mce 3257 * is left on the active list for a subsequent Tx completion cleanup. 3258 */ 3259 static void 3260 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3261 uint8_t jstate) 3262 { 3263 ibd_mce_t *tmce; 3264 boolean_t do_detach = B_TRUE; 3265 3266 /* 3267 * Before detaching, we must check whether the other list 3268 * contains the mcg; if we detach blindly, the consumer 3269 * who set up the other list will also stop receiving 3270 * traffic. 3271 */ 3272 if (jstate == IB_MC_JSTATE_FULL) { 3273 /* 3274 * The following check is only relevant while coming 3275 * from the Tx completion path in the reap case. 
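* (ibd_leave_group() sets mc_fullreap before calling here with IB_MC_JSTATE_FULL; a cleared flag means a later re-join has cancelled the pending reap.)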
3276 */ 3277 if (!mce->mc_fullreap) 3278 return; 3279 mutex_enter(&state->id_mc_mutex); 3280 IBD_MCACHE_PULLOUT_FULL(state, mce); 3281 mutex_exit(&state->id_mc_mutex); 3282 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3283 do_detach = B_FALSE; 3284 } else if (jstate == IB_MC_JSTATE_NON) { 3285 IBD_MCACHE_PULLOUT_NON(state, mce); 3286 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3287 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3288 do_detach = B_FALSE; 3289 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3290 mutex_enter(&state->id_mc_mutex); 3291 IBD_MCACHE_PULLOUT_FULL(state, mce); 3292 mutex_exit(&state->id_mc_mutex); 3293 do_detach = B_FALSE; 3294 } 3295 3296 /* 3297 * If we are reacting to a mcg trap and leaving our sendonly or 3298 * non membership, the mcg is possibly already gone, so attempting 3299 * to leave might fail. On the other hand, we must try to leave 3300 * anyway, since this might be a trap from long ago, and we could 3301 * have potentially sendonly joined to a recent incarnation of 3302 * the mcg and are about to lose track of this information. 3303 */ 3304 if (do_detach) { 3305 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3306 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3307 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3308 } 3309 3310 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3311 kmem_free(mce, sizeof (ibd_mce_t)); 3312 } 3313 3314 /* 3315 * Async code executed due to multicast and promiscuous disable requests 3316 * and mcg trap handling; also executed during driver detach. Mostly, a 3317 * leave and detach is done; except for the fullmember case when Tx 3318 * requests are pending, whence arrangements are made for subsequent 3319 * cleanup on Tx completion. 3320 */ 3321 static void 3322 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3323 { 3324 ipoib_mac_t mcmac; 3325 boolean_t recycled; 3326 ibd_mce_t *mce; 3327 3328 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3329 jstate, mgid.gid_prefix, mgid.gid_guid); 3330 3331 if (jstate == IB_MC_JSTATE_NON) { 3332 recycled = B_TRUE; 3333 mce = IBD_MCACHE_FIND_NON(state, mgid); 3334 /* 3335 * In case we are handling a mcg trap, we might not find 3336 * the mcg in the non list. 3337 */ 3338 if (mce == NULL) { 3339 return; 3340 } 3341 } else { 3342 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3343 3344 /* 3345 * In case we are handling a mcg trap, make sure the trap 3346 * is not arriving late; if we have an mce that indicates 3347 * that we are already a fullmember, that would be a clear 3348 * indication that the trap arrived late (i.e., is for a 3349 * previous incarnation of the mcg). 3350 */ 3351 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3352 if ((mce == NULL) || (mce->mc_jstate == 3353 IB_MC_JSTATE_FULL)) { 3354 return; 3355 } 3356 } else { 3357 ASSERT(jstate == IB_MC_JSTATE_FULL); 3358 3359 /* 3360 * If join group failed, mce will be NULL here. 3361 * This is because in a GLDv3 driver, set multicast 3362 * will always return success. 3363 */ 3364 if (mce == NULL) { 3365 return; 3366 } 3367 3368 mce->mc_fullreap = B_TRUE; 3369 } 3370 3371 /* 3372 * If no pending Tx's remain that reference the AH 3373 * for the mcg, recycle it from active to free list.
3374 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3375 * so the last completing Tx will cause an async reap 3376 * operation to be invoked, at which time we will drop our 3377 * membership to the mcg so that the pending Tx's complete 3378 * successfully. Refer to comments on "AH and MCE active 3379 * list manipulation" at top of this file. The lock protects 3380 * against Tx fast path and Tx cleanup code. 3381 */ 3382 mutex_enter(&state->id_ac_mutex); 3383 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3384 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3385 IB_MC_JSTATE_SEND_ONLY_NON)); 3386 mutex_exit(&state->id_ac_mutex); 3387 } 3388 3389 if (recycled) { 3390 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3391 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3392 ibd_async_reap_group(state, mce, mgid, jstate); 3393 } 3394 } 3395 3396 /* 3397 * Find the broadcast address as defined by IPoIB; implicitly 3398 * determines the IBA scope, mtu, tclass etc of the link the 3399 * interface is going to be a member of. 3400 */ 3401 static ibt_status_t 3402 ibd_find_bgroup(ibd_state_t *state) 3403 { 3404 ibt_mcg_attr_t mcg_attr; 3405 uint_t numg; 3406 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3407 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3408 IB_MC_SCOPE_GLOBAL }; 3409 int i, mcgmtu; 3410 boolean_t found = B_FALSE; 3411 int ret; 3412 ibt_mcg_info_t mcg_info; 3413 3414 state->id_bgroup_created = B_FALSE; 3415 state->id_bgroup_present = B_FALSE; 3416 3417 query_bcast_grp: 3418 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3419 mcg_attr.mc_pkey = state->id_pkey; 3420 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3421 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3422 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3423 3424 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3425 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3426 3427 /* 3428 * Look for the IPoIB broadcast group. 3429 */ 3430 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3431 state->id_mgid.gid_prefix = 3432 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3433 ((uint64_t)state->id_scope << 48) | 3434 ((uint32_t)(state->id_pkey << 16))); 3435 mcg_attr.mc_mgid = state->id_mgid; 3436 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3437 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3438 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3439 found = B_TRUE; 3440 break; 3441 } 3442 } 3443 3444 if (!found) { 3445 if (state->id_create_broadcast_group) { 3446 /* 3447 * If we created the broadcast group, but failed to 3448 * find it, we can't do anything except leave the 3449 * one we created and return failure. 3450 */ 3451 if (state->id_bgroup_created) { 3452 ibd_print_warn(state, "IPoIB broadcast group " 3453 "absent. 
Unable to query after create."); 3454 goto find_bgroup_fail; 3455 } 3456 3457 /* 3458 * Create the ipoib broadcast group if it didn't exist 3459 */ 3460 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3461 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3462 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3463 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3464 mcg_attr.mc_pkey = state->id_pkey; 3465 mcg_attr.mc_flow = 0; 3466 mcg_attr.mc_sl = 0; 3467 mcg_attr.mc_tclass = 0; 3468 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3469 state->id_mgid.gid_prefix = 3470 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3471 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3472 ((uint32_t)(state->id_pkey << 16))); 3473 mcg_attr.mc_mgid = state->id_mgid; 3474 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3475 3476 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3477 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3478 ibd_print_warn(state, "IPoIB broadcast group " 3479 "absent, create failed: ret = %d\n", ret); 3480 state->id_bgroup_created = B_FALSE; 3481 return (IBT_FAILURE); 3482 } 3483 state->id_bgroup_created = B_TRUE; 3484 goto query_bcast_grp; 3485 } else { 3486 ibd_print_warn(state, "IPoIB broadcast group absent"); 3487 return (IBT_FAILURE); 3488 } 3489 } 3490 3491 /* 3492 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3493 */ 3494 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3495 if (state->id_mtu < mcgmtu) { 3496 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3497 "greater than port's maximum MTU %d", mcgmtu, 3498 state->id_mtu); 3499 ibt_free_mcg_info(state->id_mcinfo, 1); 3500 goto find_bgroup_fail; 3501 } 3502 state->id_mtu = mcgmtu; 3503 state->id_bgroup_present = B_TRUE; 3504 3505 return (IBT_SUCCESS); 3506 3507 find_bgroup_fail: 3508 if (state->id_bgroup_created) { 3509 (void) ibt_leave_mcg(state->id_sgid, 3510 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3511 IB_MC_JSTATE_FULL); 3512 } 3513 3514 return (IBT_FAILURE); 3515 } 3516 3517 static int 3518 ibd_alloc_tx_copybufs(ibd_state_t *state) 3519 { 3520 ibt_mr_attr_t mem_attr; 3521 3522 /* 3523 * Allocate one big chunk for all regular tx copy bufs 3524 */ 3525 state->id_tx_buf_sz = state->id_mtu; 3526 if (state->id_lso_policy && state->id_lso_capable && 3527 (state->id_ud_tx_copy_thresh > state->id_mtu)) { 3528 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; 3529 } 3530 3531 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * 3532 state->id_tx_buf_sz, KM_SLEEP); 3533 3534 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * 3535 sizeof (ibd_swqe_t), KM_SLEEP); 3536 3537 /* 3538 * Do one memory registration on the entire txbuf area 3539 */ 3540 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3541 mem_attr.mr_len = state->id_ud_num_swqe * state->