/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

#include <sys/priv_names.h>
#include <sys/dls.h>
#include <sys/dld_ioc.h>
#include <sys/policy.h>
#include <sys/ibpart.h>
#include <sys/file.h>

/*
 * The write-up below includes details on the following:
 * 1. The dladm administrative model.
 * 2. Late HCA initialization feature.
 * 3. Brussels support and its implications for the current architecture.
 *
 * 1. The dladm administrative model.
 * ------------------------------------------
 * With the dladm model, ibnex will create one ibd instance per port. These
 * instances will be created independently of the port state.
 *
 * The ibd driver is two-faceted: one side of it works as the port driver and
 * the other as the partition object driver.
 *
 * The port instance is a child of the HCA, and will have an entry in the devfs.
 * A DDI attach only happens for the port driver, and its attach is
 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
 * handled in ibd_port_unattach().
 *
 * The partition object is only a registrant to the mac layer via mac_register()
 * and does not have an entry in the device tree. There is no DDI softstate
 * managed by the DDI framework for the partition objects. However, the state is
 * managed inside the ibd driver, and every partition object hangs off the
 * "ibd_objlist_head".
 *
 * The partition object first comes into existence when a user runs the
 * 'create-part' subcommand of dladm. This is like invoking the attach entry
 * point of the partition object. The partition object goes away with the
 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 * point of the partition object.
 *
 * The create-part and delete-part subcommands result in dld ioctls that end up
 * calling ibd_create_partition() and ibd_delete_partition(), respectively.
 * These ioctls are registered with the dld layer in _init() via a call to
 * dld_ioc_register().
 *
 * The port instance by itself cannot be plumbed. Only the partition
 * objects can be plumbed, and they alone participate in I/O, not the
 * port driver.
 *
 * There are some info ioctls supported in ibd which are used by dladm(1M) to
 * display useful information. The info entry point for ibd is
 * ibd_get_partition_info().
 *
 * 2. Late HCA initialization feature.
 * ------------------------------------
 * As mentioned in section 1, the user creates the partition objects via
 * dladm(1M). It is possible that:
 * a) The physical port itself is down and the SM cannot be reached.
 * b) The PKEY specified by the user has not been created in the SM yet.
 * c) An IPoIB broadcast group for the specified PKEY is not present.
 *
 * In all of the above cases, complete initialization of the partition object is
 * not possible. However, the new model allows the creation of partition
 * objects even in such cases but will defer the initialization for later.
 * When such a partition object is plumbed, the link state will be displayed as
 * "down".
 * The driver, at this point, is listening to events that herald the
 * availability of resources -
 * i)   LINK_UP when the link becomes available
 * ii)  PORT_CHANGE when the PKEY has been created
 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
 *      created
 * via ibd_async_handler() for events i) and ii), and via
 * ibd_snet_notices_handler() for iii).
 * The driver handles these events (as and when they arrive) and completes the
 * initialization of the partition object and transitions it to a usable state.
 *
 * 3. Brussels support and its implications for the current architecture.
 * ---------------------------------------------------------------------
 * The Brussels support introduces two new interfaces to the ibd driver -
 * ibd_m_getprop() and ibd_m_setprop().
 * These interfaces allow setting and retrieval of certain properties.
 * Some of them are public properties while most others are private properties
 * meant to be used by developers. Tuning the latter kind can cause
 * performance issues and should not be done without understanding the
 * implications. All properties are specific to an instance of either the
 * partition object or the port driver.
 *
 * The public properties are: mtu and linkmode.
 * mtu is a read-only property.
 * linkmode can take two values - UD and CM.
 *
 * Changing the linkmode requires some bookkeeping in the driver. The
 * capabilities need to be re-reported to the mac layer. This is done by
 * calling mac_capab_update(). The maxsdu is updated by calling
 * mac_maxsdu_update(). A sketch of this bookkeeping appears after the
 * constants below.
 * The private properties retain their values across the change of linkmode.
 * NOTE:
 * - The port driver does not support any property apart from mtu.
 * - All other properties are only meant for the partition object.
 * - The properties cannot be set when an instance is plumbed. The
 *   instance has to be unplumbed to effect any setting.
 */

/*
 * Driver wide tunables
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers are expensive. These are enabled
 *     by default.
 *
 * ibd_log_sz
 *     This specifies the size of the ibd log buffer in bytes. The buffer is
 *     allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;

#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#ifdef IBD_LOGGING
#define IBD_LOG_SZ			ibd_log_sz
#endif

/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define IBD_RX_POST_CNT			8

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define IBD_LOG_RX_POST			4

/* Minimum number of receive work requests driver needs to always have */
#define IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * LSO parameters
 */
#define IBD_LSO_MAXLEN			65536
#define IBD_LSO_BUFSZ			8192

/*
 * Async operation states
 */
#define IBD_OP_NOTSTARTED		0
#define IBD_OP_ONGOING			1
#define IBD_OP_COMPLETED		2
#define IBD_OP_ERRORED			3
#define IBD_OP_ROUTERED			4

/*
 * Start/stop in-progress flags; note that restart must always remain
 * the OR of start and stop flag values.
 */
#define IBD_DRV_START_IN_PROGRESS	0x10000000
#define IBD_DRV_STOP_IN_PROGRESS	0x20000000
#define IBD_DRV_RESTART_IN_PROGRESS	0x30000000
#define IBD_DRV_DELETE_IN_PROGRESS	IBD_DRV_RESTART_IN_PROGRESS

/*
 * Miscellaneous constants
 */
#define IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
#define IBD_DEF_MAX_SDU			2044
#define IBD_DEF_MAX_MTU			(IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
#define IBD_DEF_RC_MAX_SDU		65520
#define IBD_DEF_RC_MAX_MTU		(IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
#define IBD_DEFAULT_QKEY		0xB1B
#ifdef IBD_LOGGING
#define IBD_DMAX_LINE			100
#endif
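
/*
 * Illustrative sketch, not part of the driver build (guarded by the
 * hypothetical IBD_EXAMPLES define): the bookkeeping described in the
 * Brussels write-up above when linkmode changes between UD and CM.
 * The id_mh and id_enable_rc field names are assumptions based on
 * ibd.h; the SDU constants are the defaults defined above.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_linkmode_update(ibd_state_t *state)
{
    /* Re-report capabilities to the mac layer for the new linkmode */
    mac_capab_update(state->id_mh);

    /* Advertise the max SDU that matches the new linkmode */
    (void) mac_maxsdu_update(state->id_mh, state->id_enable_rc ?
        IBD_DEF_RC_MAX_SDU : IBD_DEF_MAX_SDU);
}
#endif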

/*
 * Enumerations for link states
 */
typedef enum {
    IBD_LINK_DOWN,
    IBD_LINK_UP,
    IBD_LINK_UP_ABSENT
} ibd_link_op_t;

/*
 * Driver State Pointer
 */
void *ibd_list;

/*
 * Driver Global Data
 */
ibd_global_state_t ibd_gstate;

/*
 * Partition object list
 */
ibd_state_t *ibd_objlist_head = NULL;
kmutex_t ibd_objlist_lock;
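
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * partition objects have no devfs entry, so they are located by
 * walking the list above under ibd_objlist_lock. The id_next and
 * id_plinkid field names are assumptions based on ibd.h.
 */
#ifdef IBD_EXAMPLES
static ibd_state_t *
ibd_example_find_part(datalink_id_t linkid)
{
    ibd_state_t *state;

    mutex_enter(&ibd_objlist_lock);
    for (state = ibd_objlist_head; state != NULL; state = state->id_next) {
        if (state->id_plinkid == linkid)
            break;
    }
    mutex_exit(&ibd_objlist_lock);
    return (state);
}
#endif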

int ibd_rc_conn_timeout = 60 * 10;	/* 10 minutes */

/*
 * Logging
 */
#ifdef IBD_LOGGING
kmutex_t ibd_lbuf_lock;
uint8_t *ibd_lbuf;
uint32_t ibd_lbuf_ndx;
#endif

/*
 * Required system entry points
 */
static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/*
 * Required driver entry points for GLDv3
 */
static int ibd_m_stat(void *, uint_t, uint64_t *);
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
static int ibd_m_unicst(void *, const uint8_t *);
static mblk_t *ibd_m_tx(void *, mblk_t *);
static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);

static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);
static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
    const void *);
static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);

/*
 * Private driver entry points for GLDv3
 */

/*
 * Initialization
 */
static int ibd_state_init(ibd_state_t *, dev_info_t *);
static int ibd_init_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static int ibd_acache_init(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_init(void);
#endif

/*
 * Termination/cleanup
 */
static void ibd_state_fini(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
static void ibd_acache_fini(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_fini(void);
#endif

/*
 * Allocation/acquire/map routines
 */
static int ibd_alloc_tx_copybufs(ibd_state_t *);
static int ibd_alloc_rx_copybufs(ibd_state_t *);
static int ibd_alloc_tx_lsobufs(ibd_state_t *);
static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
    uint32_t *);

/*
 * Free/release/unmap routines
 */
static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_free_tx_copybufs(ibd_state_t *);
static void ibd_free_rx_copybufs(ibd_state_t *);
static void ibd_free_rx_rsrcs(ibd_state_t *);
static void ibd_free_tx_lsobufs(ibd_state_t *);
static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);

/*
 * Handlers/callback routines
 */
static uint_t ibd_intr(caddr_t);
static uint_t ibd_tx_recycle(caddr_t);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_freemsg_cb(char *);
static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
    ibt_subnet_event_code_t, ibt_subnet_event_t *);

/*
 * Send/receive routines
 */
static boolean_t ibd_send(ibd_state_t *, mblk_t *);
static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);

/*
 * Threads
 */
static void ibd_async_work(ibd_state_t *);

/*
 * Async tasks
 */
static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
static void ibd_async_txsched(ibd_state_t *);
static void ibd_async_link(ibd_state_t *, ibd_req_t *);

/*
 * Async task helpers
 */
static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
static boolean_t ibd_get_allroutergroup(ibd_state_t *,
    ipoib_mac_t *, ipoib_mac_t *);
static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
static ibt_status_t ibd_find_bgroup(ibd_state_t *);
static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
static uint64_t ibd_get_portspeed(ibd_state_t *);
static boolean_t ibd_async_safe(ibd_state_t *);
static void ibd_async_done(ibd_state_t *);
static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);

/*
 * Helpers for attach/start routines
 */
static int ibd_register_mac(ibd_state_t *, dev_info_t *);
static int ibd_record_capab(ibd_state_t *);
static int ibd_get_port_details(ibd_state_t *);
static int ibd_alloc_cqs(ibd_state_t *);
static int ibd_setup_ud_channel(ibd_state_t *);
static int ibd_start(ibd_state_t *);
static int ibd_undo_start(ibd_state_t *, link_state_t);
static void ibd_set_mac_progress(ibd_state_t *, uint_t);
static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
static void ibd_part_unattach(ibd_state_t *state);
static int ibd_port_attach(dev_info_t *);
static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
static int ibd_get_port_state(ibd_state_t *, link_state_t *);
static int ibd_part_busy(ibd_state_t *);

/*
 * Miscellaneous helpers
 */
static int ibd_sched_poll(ibd_state_t *, int, int);
static void ibd_resume_transmission(ibd_state_t *);
static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
static void *list_get_head(list_t *);
static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
static uint_t ibd_hash_by_id(void *, mod_hash_key_t);

ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);

#ifdef IBD_LOGGING
static void ibd_log(const char *, ...);
#endif

DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);

/* Module Driver Info */
static struct modldrv ibd_modldrv = {
    &mod_driverops,			/* This one is a driver */
    "InfiniBand GLDv3 Driver",		/* short description */
    &ibd_dev_ops			/* driver specific ops */
};

/* Module Linkage */
static struct modlinkage ibd_modlinkage = {
    MODREV_1, (void *)&ibd_modldrv, NULL
};

/*
 * Module (static) info passed to IBTL during ibt_attach
 */
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
    IBTI_V_CURR,
    IBT_NETWORK,
    ibd_async_handler,
    NULL,
    "IBPART"
};

static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
    IBTI_V_CURR,
    IBT_NETWORK,
    ibdpd_async_handler,
    NULL,
    "IPIB"
};

/*
 * GLDv3 entry points
 */
#define IBD_M_CALLBACK_FLAGS	\
	(MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)

static mac_callbacks_t ibd_m_callbacks = {
    IBD_M_CALLBACK_FLAGS,
    ibd_m_stat,
    ibd_m_start,
    ibd_m_stop,
    ibd_m_promisc,
    ibd_m_multicst,
    ibd_m_unicst,
    ibd_m_tx,
    NULL,
    NULL,
    ibd_m_getcapab,
    NULL,
    NULL,
    ibd_m_setprop,
    ibd_m_getprop,
    ibd_m_propinfo
};

/* Private properties */
char *ibd_priv_props[] = {
    "_ibd_broadcast_group",
    "_ibd_coalesce_completions",
    "_ibd_create_broadcast_group",
    "_ibd_hash_size",
    "_ibd_lso_enable",
    "_ibd_num_ah",
    "_ibd_num_lso_bufs",
    "_ibd_rc_enable_srq",
    "_ibd_rc_num_rwqe",
    "_ibd_rc_num_srq",
    "_ibd_rc_num_swqe",
    "_ibd_rc_rx_comp_count",
    "_ibd_rc_rx_comp_usec",
    "_ibd_rc_rx_copy_thresh",
    "_ibd_rc_rx_rwqe_thresh",
    "_ibd_rc_tx_comp_count",
    "_ibd_rc_tx_comp_usec",
    "_ibd_rc_tx_copy_thresh",
    "_ibd_ud_num_rwqe",
    "_ibd_ud_num_swqe",
    "_ibd_ud_rx_comp_count",
    "_ibd_ud_rx_comp_usec",
    "_ibd_ud_tx_comp_count",
    "_ibd_ud_tx_comp_usec",
    "_ibd_ud_tx_copy_thresh",
    NULL
};

static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);

static dld_ioc_info_t ibd_dld_ioctl_list[] = {
    {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
        ibd_create_partition, secpolicy_dl_config},
    {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
        ibd_delete_partition, secpolicy_dl_config},
    {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
        ibd_get_partition_info, NULL}
};

/*
 * Fill/clear <scope> and <p_key> in multicast/broadcast address
 */
#define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
{							\
	*(uint32_t *)((char *)(maddr) + 4) |=		\
	    htonl((uint32_t)(scope) << 16);		\
	*(uint32_t *)((char *)(maddr) + 8) |=		\
	    htonl((uint32_t)(pkey) << 16);		\
}

#define IBD_CLEAR_SCOPE_PKEY(maddr)			\
{							\
	*(uint32_t *)((char *)(maddr) + 4) &=		\
	    htonl(~((uint32_t)0xF << 16));		\
	*(uint32_t *)((char *)(maddr) + 8) &=		\
	    htonl(~((uint32_t)0xFFFF << 16));		\
}
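
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * re-stamping the scope and pkey fields of a 20-byte IPoIB multicast
 * address with the macros above, the way the driver does when it
 * derives its broadcast MGID for a given partition.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_scope_pkey(ipoib_mac_t *maddr, uint8_t scope, uint16_t pkey)
{
    /* Drop whatever scope/pkey bits the template carried ... */
    IBD_CLEAR_SCOPE_PKEY(maddr);
    /* ... and stamp in this interface's 4-bit scope and 16-bit pkey */
    IBD_FILL_SCOPE_PKEY(maddr, scope, pkey);
}
#endif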

/*
 * Rudimentary debugging support
 */
#ifdef DEBUG
int ibd_debuglevel = 100;
void
debug_print(int l, char *fmt, ...)
{
    va_list ap;

    if (l < ibd_debuglevel)
        return;
    va_start(ap, fmt);
    vcmn_err(CE_CONT, fmt, ap);
    va_end(ap);
}
#endif

/*
 * Common routine to print warning messages; adds in hca guid, port number
 * and pkey to be able to identify the IBA interface.
 */
void
ibd_print_warn(ibd_state_t *state, char *fmt, ...)
{
    ib_guid_t hca_guid;
    char ibd_print_buf[256];
    int len;
    va_list ap;

    hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
        0, "hca-guid", 0);
    len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
        "%s%d: HCA GUID %016llx port %d PKEY %02x ",
        ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
        (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
    va_start(ap, fmt);
    (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
        fmt, ap);
    cmn_err(CE_NOTE, "!%s", ibd_print_buf);
    va_end(ap);
}

/*
 * Warlock directives
 */

/*
 * id_lso_lock
 *
 * state->id_lso->bkt_nfree may be accessed without a lock to
 * determine the threshold at which we have to ask the nw layer
 * to resume transmission (see ibd_resume_transmission()).
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
    ibd_state_t::id_lso))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
_NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))

/*
 * id_scq_poll_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
    ibd_state_t::id_scq_poll_busy))

/*
 * id_txpost_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    ibd_state_t::id_tx_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    ibd_state_t::id_tx_busy))

/*
 * id_acache_req_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    ibd_state_t::id_acache_req_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    ibd_state_t::id_req_list))
_NOTE(SCHEME_PROTECTS_DATA("atomic",
    ibd_acache_s::ac_ref))

/*
 * id_ac_mutex
 *
 * This mutex is actually supposed to protect id_ah_op as well,
 * but this path of the code isn't clean (see update of id_ah_op
 * in ibd_async_acache(), immediately after the call to
 * ibd_async_mcache()). For now, we'll skip this check by
 * declaring that id_ah_op is protected by some internal scheme
 * that warlock isn't aware of.
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_active))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_free))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_addr))
_NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
    ibd_state_t::id_ah_op))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_error))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ac_hot_ace))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))

/*
 * id_mc_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    ibd_state_t::id_mc_full))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    ibd_state_t::id_mc_non))

/*
 * id_trap_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_stop))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_inprog))

/*
 * id_prom_op
 */
_NOTE(SCHEME_PROTECTS_DATA("only by async thread",
    ibd_state_t::id_prom_op))

/*
 * id_sched_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
    ibd_state_t::id_sched_needed))

/*
 * id_link_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
    ibd_state_t::id_link_state))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
    ibd_state_t::id_link_speed))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))

/*
 * id_tx_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_cnt))

/*
 * id_rx_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::id_rx_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::id_rx_list.dl_cnt))

/*
 * rc_timeout_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
    ibd_state_t::rc_timeout_start))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
    ibd_state_t::rc_timeout))


/*
 * Items protected by atomic updates
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic update only",
    ibd_state_s::id_brd_rcv
    ibd_state_s::id_brd_xmt
    ibd_state_s::id_multi_rcv
    ibd_state_s::id_multi_xmt
    ibd_state_s::id_num_intrs
    ibd_state_s::id_rcv_bytes
    ibd_state_s::id_rcv_pkt
    ibd_state_s::id_rx_post_queue_index
    ibd_state_s::id_tx_short
    ibd_state_s::id_xmt_bytes
    ibd_state_s::id_xmt_pkt
    ibd_state_s::rc_rcv_trans_byte
    ibd_state_s::rc_rcv_trans_pkt
    ibd_state_s::rc_rcv_copy_byte
    ibd_state_s::rc_rcv_copy_pkt
    ibd_state_s::rc_xmt_bytes
    ibd_state_s::rc_xmt_small_pkt
    ibd_state_s::rc_xmt_fragmented_pkt
    ibd_state_s::rc_xmt_map_fail_pkt
    ibd_state_s::rc_xmt_map_succ_pkt
    ibd_rc_chan_s::rcq_invoking))

/*
 * Non-mutex protection schemes for data elements. Almost all of
 * these are non-shared items.
 */
_NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
    callb_cpr
    ib_gid_s
    ib_header_info
    ibd_acache_rq
    ibd_acache_s::ac_mce
    ibd_acache_s::ac_chan
    ibd_mcache::mc_fullreap
    ibd_mcache::mc_jstate
    ibd_mcache::mc_req
    ibd_rwqe_s
    ibd_swqe_s
    ibd_wqe_s
    ibt_wr_ds_s::ds_va
    ibt_wr_lso_s
    ipoib_mac::ipoib_qpn
    mac_capab_lso_s
    msgb::b_next
    msgb::b_cont
    msgb::b_rptr
    msgb::b_wptr
    ibd_state_s::id_bgroup_created
    ibd_state_s::id_mac_state
    ibd_state_s::id_mtu
    ibd_state_s::id_ud_num_rwqe
    ibd_state_s::id_ud_num_swqe
    ibd_state_s::id_qpnum
    ibd_state_s::id_rcq_hdl
    ibd_state_s::id_rx_buf_sz
    ibd_state_s::id_rx_bufs
    ibd_state_s::id_rx_mr_hdl
    ibd_state_s::id_rx_wqes
    ibd_state_s::id_rxwcs
    ibd_state_s::id_rxwcs_size
    ibd_state_s::id_rx_nqueues
    ibd_state_s::id_rx_queues
    ibd_state_s::id_scope
    ibd_state_s::id_scq_hdl
    ibd_state_s::id_tx_buf_sz
    ibd_state_s::id_tx_bufs
    ibd_state_s::id_tx_mr_hdl
    ibd_state_s::id_tx_rel_list.dl_cnt
    ibd_state_s::id_tx_wqes
    ibd_state_s::id_txwcs
    ibd_state_s::id_txwcs_size
    ibd_state_s::rc_listen_hdl
    ibd_state_s::rc_listen_hdl_OFED_interop
    ibd_state_s::rc_srq_size
    ibd_state_s::rc_srq_rwqes
    ibd_state_s::rc_srq_rx_bufs
    ibd_state_s::rc_srq_rx_mr_hdl
    ibd_state_s::rc_tx_largebuf_desc_base
    ibd_state_s::rc_tx_mr_bufs
    ibd_state_s::rc_tx_mr_hdl
    ipha_s
    icmph_s
    ibt_path_info_s::pi_sid
    ibd_rc_chan_s::ace
    ibd_rc_chan_s::chan_hdl
    ibd_rc_chan_s::state
    ibd_rc_chan_s::chan_state
    ibd_rc_chan_s::is_tx_chan
    ibd_rc_chan_s::rcq_hdl
    ibd_rc_chan_s::rcq_size
    ibd_rc_chan_s::scq_hdl
    ibd_rc_chan_s::scq_size
    ibd_rc_chan_s::rx_bufs
    ibd_rc_chan_s::rx_mr_hdl
    ibd_rc_chan_s::rx_rwqes
    ibd_rc_chan_s::tx_wqes
    ibd_rc_chan_s::tx_mr_bufs
    ibd_rc_chan_s::tx_mr_hdl
    ibd_rc_chan_s::tx_rel_list.dl_cnt
    ibd_rc_chan_s::is_used
    ibd_rc_tx_largebuf_s::lb_buf
    ibd_rc_msg_hello_s
    ibt_cm_return_args_s))

/*
 * ibd_rc_chan_s::next is protected by two mutexes:
 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
 */
_NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
    ibd_rc_chan_s::next))

/*
 * ibd_state_s.rc_tx_large_bufs_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_free_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_nfree))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_rc_tx_largebuf_s::lb_next))

/*
 * ibd_acache_s.tx_too_big_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
    ibd_acache_s::tx_too_big_ongoing))

/*
 * tx_wqe_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_cnt))

/*
 * ibd_state_s.rc_ace_recycle_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
    ibd_state_s::rc_ace_recycle))

/*
 * rc_srq_rwqe_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_cnt))

/*
 * Non-mutex protection schemes for data elements. These are counters
 * for problem diagnosis and don't need to be protected.
 */
_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    ibd_state_s::rc_rcv_alloc_fail
    ibd_state_s::rc_rcq_err
    ibd_state_s::rc_ace_not_found
    ibd_state_s::rc_xmt_drop_too_long_pkt
    ibd_state_s::rc_xmt_icmp_too_long_pkt
    ibd_state_s::rc_xmt_reenter_too_long_pkt
    ibd_state_s::rc_swqe_short
    ibd_state_s::rc_swqe_mac_update
    ibd_state_s::rc_xmt_buf_short
    ibd_state_s::rc_xmt_buf_mac_update
    ibd_state_s::rc_scq_no_swqe
    ibd_state_s::rc_scq_no_largebuf
    ibd_state_s::rc_conn_succ
    ibd_state_s::rc_conn_fail
    ibd_state_s::rc_null_conn
    ibd_state_s::rc_no_estab_conn
    ibd_state_s::rc_act_close
    ibd_state_s::rc_pas_close
    ibd_state_s::rc_delay_ace_recycle
    ibd_state_s::rc_act_close_simultaneous
    ibd_state_s::rc_act_close_not_clean
    ibd_state_s::rc_pas_close_rcq_invoking
    ibd_state_s::rc_reset_cnt
    ibd_state_s::rc_timeout_act
    ibd_state_s::rc_timeout_pas
    ibd_state_s::rc_stop_connect))

#ifdef DEBUG
/*
 * Non-mutex protection schemes for data elements. These are counters
 * for problem diagnosis and don't need to be protected.
 */
_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    ibd_state_s::rc_rwqe_short
    ibd_rc_stat_s::rc_rcv_trans_byte
    ibd_rc_stat_s::rc_rcv_trans_pkt
    ibd_rc_stat_s::rc_rcv_copy_byte
    ibd_rc_stat_s::rc_rcv_copy_pkt
    ibd_rc_stat_s::rc_rcv_alloc_fail
    ibd_rc_stat_s::rc_rcq_err
    ibd_rc_stat_s::rc_rwqe_short
    ibd_rc_stat_s::rc_xmt_bytes
    ibd_rc_stat_s::rc_xmt_small_pkt
    ibd_rc_stat_s::rc_xmt_fragmented_pkt
    ibd_rc_stat_s::rc_xmt_map_fail_pkt
    ibd_rc_stat_s::rc_xmt_map_succ_pkt
    ibd_rc_stat_s::rc_ace_not_found
    ibd_rc_stat_s::rc_scq_no_swqe
    ibd_rc_stat_s::rc_scq_no_largebuf
    ibd_rc_stat_s::rc_swqe_short
    ibd_rc_stat_s::rc_swqe_mac_update
    ibd_rc_stat_s::rc_xmt_buf_short
    ibd_rc_stat_s::rc_xmt_buf_mac_update
    ibd_rc_stat_s::rc_conn_succ
    ibd_rc_stat_s::rc_conn_fail
    ibd_rc_stat_s::rc_null_conn
    ibd_rc_stat_s::rc_no_estab_conn
    ibd_rc_stat_s::rc_act_close
    ibd_rc_stat_s::rc_pas_close
    ibd_rc_stat_s::rc_delay_ace_recycle
    ibd_rc_stat_s::rc_act_close_simultaneous
    ibd_rc_stat_s::rc_reset_cnt
    ibd_rc_stat_s::rc_timeout_act
    ibd_rc_stat_s::rc_timeout_pas))
#endif

int
_init()
{
    int status;

    status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
        PAGESIZE), 0);
    if (status != 0) {
        DPRINT(10, "_init:failed in ddi_soft_state_init()");
        return (status);
    }

    mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);

    mac_init_ops(&ibd_dev_ops, "ibp");
    status = mod_install(&ibd_modlinkage);
    if (status != 0) {
        DPRINT(10, "_init:failed in mod_install()");
        ddi_soft_state_fini(&ibd_list);
        mac_fini_ops(&ibd_dev_ops);
        return (status);
    }

    mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_enter(&ibd_gstate.ig_mutex);
    ibd_gstate.ig_ibt_hdl = NULL;
    ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
    ibd_gstate.ig_service_list = NULL;
    mutex_exit(&ibd_gstate.ig_mutex);

    if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
        DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
        return (EIO);
    }

    ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);

#ifdef IBD_LOGGING
    ibd_log_init();
#endif
    return (0);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&ibd_modlinkage, modinfop));
}

int
_fini()
{
    int status;

    status = mod_remove(&ibd_modlinkage);
    if (status != 0)
        return (status);

    ibt_unregister_part_attr_cb();

    mac_fini_ops(&ibd_dev_ops);
    mutex_destroy(&ibd_objlist_lock);
    ddi_soft_state_fini(&ibd_list);
    mutex_destroy(&ibd_gstate.ig_mutex);
#ifdef IBD_LOGGING
    ibd_log_fini();
#endif
    return (0);
}

/*
 * Convert the GID part of the mac address from network byte order
 * to host order.
 */
static void
ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
{
    ib_sn_prefix_t nbopref;
    ib_guid_t nboguid;

    bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
    bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
    dgid->gid_prefix = b2h64(nbopref);
    dgid->gid_guid = b2h64(nboguid);
}

/*
 * Create the IPoIB address in network byte order from host order inputs.
 */
static void
ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
    ib_guid_t guid)
{
    ib_sn_prefix_t nbopref;
    ib_guid_t nboguid;

    mac->ipoib_qpn = htonl(qpn);
    nbopref = h2b64(prefix);
    nboguid = h2b64(guid);
    bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
    bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
}
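
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * round-tripping a GID through the two converters above. The literal
 * QPN, SN prefix and GUID values are arbitrary.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_mac_gid_roundtrip(void)
{
    ipoib_mac_t mac;
    ib_gid_t gid;

    /* Compose the 20-byte IPoIB address: QPN + SN prefix + GUID */
    ibd_h2n_mac(&mac, 0x1234, 0xfe80000000000000ULL,
        0x0002c90300001234ULL);

    /* Recover the destination GID in host byte order */
    ibd_n2h_gid(&mac, &gid);
    ASSERT(gid.gid_prefix == 0xfe80000000000000ULL &&
        gid.gid_guid == 0x0002c90300001234ULL);
}
#endif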

/*
 * Send to the appropriate all-routers group when the IBA multicast group
 * does not exist, based on whether the target group is v4 or v6.
 */
static boolean_t
ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
    ipoib_mac_t *rmac)
{
    boolean_t retval = B_TRUE;
    uint32_t adjscope = state->id_scope << 16;
    uint32_t topword;

    /*
     * Copy the first 4 bytes in without assuming any alignment of
     * input mac address; this will have IPoIB signature, flags and
     * scope bits.
     */
    bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
    topword = ntohl(topword);

    /*
     * Generate proper address for IPv4/v6, adding in the Pkey properly.
     */
    if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
        (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
        ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
            ((uint32_t)(state->id_pkey << 16))),
            (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
    else
        /*
         * Does not have proper bits in the mgid address.
         */
        retval = B_FALSE;

    return (retval);
}

/*
 * Membership states for different mcg's are tracked by two lists:
 * the "non" list is used for promiscuous mode, when all mcg traffic
 * needs to be inspected. This type of membership is never used for
 * transmission, so there can not be an AH in the active list
 * corresponding to a member in this list. This list does not need
 * any protection, since all operations are performed by the async
 * thread.
 *
 * "Full" and "SendOnly" membership is tracked using a single list,
 * the "full" list. This is because this single list can then be
 * searched during transmit to a multicast group (if an AH for the
 * mcg is not found in the active list), since at least one type
 * of membership must be present before initiating the transmit.
 * This list is also emptied during driver detach, since sendonly
 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Insert/deletes to
 * this list are done only by the async thread, but it is also
 * searched in program context (see multicast disable case), thus
 * the id_mc_mutex protects the list. The driver detach path also
 * deconstructs the "full" list, but it ensures that the async
 * thread will not be accessing the list (by blocking out mcg
 * trap handling and making sure no more Tx reaping will happen).
 *
 * Currently, an IBA attach is done in the SendOnly case too,
 * although this is not required.
 */
#define IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
#define IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
#define IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
#define IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
#define IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
#define IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)

static void *
list_get_head(list_t *list)
{
    list_node_t *lhead = list_head(list);

    if (lhead != NULL)
        list_remove(list, lhead);
    return (lhead);
}

/*
 * This is always guaranteed to be able to queue the work.
 */
void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
    /* Initialize request */
    DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
    ptr->rq_op = op;

    /*
     * Queue provided slot onto request pool.
     */
    mutex_enter(&state->id_acache_req_lock);
    list_insert_tail(&state->id_req_list, ptr);

    /* Go, fetch, async thread */
    cv_signal(&state->id_acache_req_cv);
    mutex_exit(&state->id_acache_req_lock);
}
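
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * how producers typically hand work to the async thread. Requests come
 * from the id_req_kmc cache used elsewhere in this file; allocation
 * may fail under KM_NOSLEEP and callers must cope with that.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_sched_request(ibd_state_t *state)
{
    ibd_req_t *req;

    req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
    if (req != NULL)
        ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
}
#endif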

/*
 * Main body of the per interface async thread.
 */
static void
ibd_async_work(ibd_state_t *state)
{
    ibd_req_t *ptr;
    callb_cpr_t cprinfo;

    mutex_enter(&state->id_acache_req_lock);
    CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
        callb_generic_cpr, "ibd_async_work");

    for (;;) {
        ptr = list_get_head(&state->id_req_list);
        if (ptr != NULL) {
            mutex_exit(&state->id_acache_req_lock);

            /*
             * If we are in late hca initialization mode, do not
             * process any async request other than TRAP. TRAP
             * is used for indicating creation of a broadcast
             * group; in which case, we need to join/create the
             * group.
             */
            if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
                (ptr->rq_op != IBD_ASYNC_TRAP)) {
                goto free_req_and_continue;
            }

            /*
             * Once we have done the operation, there is no
             * guarantee the request slot is going to be valid,
             * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
             * TRAP).
             *
             * Perform the request.
             */
            switch (ptr->rq_op) {
            case IBD_ASYNC_GETAH:
                ibd_async_acache(state, &ptr->rq_mac);
                break;
            case IBD_ASYNC_JOIN:
            case IBD_ASYNC_LEAVE:
                ibd_async_multicast(state,
                    ptr->rq_gid, ptr->rq_op);
                break;
            case IBD_ASYNC_PROMON:
                ibd_async_setprom(state);
                break;
            case IBD_ASYNC_PROMOFF:
                ibd_async_unsetprom(state);
                break;
            case IBD_ASYNC_REAP:
                ibd_async_reap_group(state,
                    ptr->rq_ptr, ptr->rq_gid,
                    IB_MC_JSTATE_FULL);
                /*
                 * The req buf is contained in the mce
                 * structure, so we do not need
                 * to free it here.
                 */
                ptr = NULL;
                break;
            case IBD_ASYNC_TRAP:
                ibd_async_trap(state, ptr);
                break;
            case IBD_ASYNC_SCHED:
                ibd_async_txsched(state);
                break;
            case IBD_ASYNC_LINK:
                ibd_async_link(state, ptr);
                break;
            case IBD_ASYNC_EXIT:
                mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
                CALLB_CPR_EXIT(&cprinfo);
#else
                mutex_exit(&state->id_acache_req_lock);
#endif
                return;
            case IBD_ASYNC_RC_TOO_BIG:
                ibd_async_rc_process_too_big(state,
                    ptr);
                break;
            case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
                ibd_async_rc_close_act_chan(state, ptr);
                break;
            case IBD_ASYNC_RC_RECYCLE_ACE:
                ibd_async_rc_recycle_ace(state, ptr);
                break;
            case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
                (void) ibd_rc_pas_close(ptr->rq_ptr,
                    B_TRUE, B_TRUE);
                break;
            }
free_req_and_continue:
            if (ptr != NULL)
                kmem_cache_free(state->id_req_kmc, ptr);

            mutex_enter(&state->id_acache_req_lock);
        } else {
#ifndef __lock_lint
            /*
             * Nothing to do: wait till new request arrives.
             */
            CALLB_CPR_SAFE_BEGIN(&cprinfo);
            cv_wait(&state->id_acache_req_cv,
                &state->id_acache_req_lock);
            CALLB_CPR_SAFE_END(&cprinfo,
                &state->id_acache_req_lock);
#endif
        }
    }

    /*NOTREACHED*/
    _NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
    mutex_enter(&state->id_trap_lock);
    if (state->id_trap_stop) {
        mutex_exit(&state->id_trap_lock);
        return (B_FALSE);
    }
    state->id_trap_inprog++;
    mutex_exit(&state->id_trap_lock);
    return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
    mutex_enter(&state->id_trap_lock);
    if (--state->id_trap_inprog == 0)
        cv_signal(&state->id_trap_cv);
    mutex_exit(&state->id_trap_lock);
}
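
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * the bracket every trap/event path wraps its work in, so the unplumb
 * code can wait for in-flight handlers before tearing down the
 * interface.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_trap_bracket(ibd_state_t *state)
{
    if (!ibd_async_safe(state))
        return;		/* interface is being deinitialized */

    /* ... queue or perform the trap/event work here ... */

    ibd_async_done(state);
}
#endif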

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
    ulong_t ptraddr = (ulong_t)key;
    uint_t hval;

    /*
     * If the input address is 4 byte aligned, we can just dereference
     * it. This is most common, since IP will send in a 4 byte aligned
     * IP header, which implies the 24 byte IPoIB pseudo header will be
     * 4 byte aligned too.
     */
    if ((ptraddr & 3) == 0)
        return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);

    bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
    return (hval);
}

static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
    if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
        return (0);
    else
        return (1);
}

/*
 * Initialize all the per interface caches and lists; AH cache,
 * MCG list etc.
 */
static int
ibd_acache_init(ibd_state_t *state)
{
    ibd_ace_t *ce;
    int i;

    mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_enter(&state->id_ac_mutex);
    list_create(&state->id_ah_free, sizeof (ibd_ace_t),
        offsetof(ibd_ace_t, ac_list));
    list_create(&state->id_ah_active, sizeof (ibd_ace_t),
        offsetof(ibd_ace_t, ac_list));
    state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
        state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
        ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
    list_create(&state->id_mc_full, sizeof (ibd_mce_t),
        offsetof(ibd_mce_t, mc_list));
    list_create(&state->id_mc_non, sizeof (ibd_mce_t),
        offsetof(ibd_mce_t, mc_list));
    state->id_ac_hot_ace = NULL;

    state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
        state->id_num_ah, KM_SLEEP);
    for (i = 0; i < state->id_num_ah; i++, ce++) {
        if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
            state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
            mutex_exit(&state->id_ac_mutex);
            ibd_acache_fini(state);
            return (DDI_FAILURE);
        } else {
            CLEAR_REFCYCLE(ce);
            ce->ac_mce = NULL;
            mutex_init(&ce->tx_too_big_mutex, NULL,
                MUTEX_DRIVER, NULL);
            IBD_ACACHE_INSERT_FREE(state, ce);
        }
    }
    mutex_exit(&state->id_ac_mutex);
    return (DDI_SUCCESS);
}

static void
ibd_acache_fini(ibd_state_t *state)
{
    ibd_ace_t *ptr;

    mutex_enter(&state->id_ac_mutex);

    while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
        ASSERT(GET_REF(ptr) == 0);
        mutex_destroy(&ptr->tx_too_big_mutex);
        (void) ibt_free_ud_dest(ptr->ac_dest);
    }

    while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
        ASSERT(GET_REF(ptr) == 0);
        mutex_destroy(&ptr->tx_too_big_mutex);
        (void) ibt_free_ud_dest(ptr->ac_dest);
    }

    list_destroy(&state->id_ah_free);
    list_destroy(&state->id_ah_active);
    list_destroy(&state->id_mc_full);
    list_destroy(&state->id_mc_non);
    kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
    mutex_exit(&state->id_ac_mutex);
    mutex_destroy(&state->id_ac_mutex);
    mutex_destroy(&state->id_mc_mutex);
}

/*
 * Search AH active hash list for a cached path to input destination.
 * If we are "just looking", hold == F. When we are in the Tx path,
 * we set hold == T to grab a reference on the AH so that it can not
 * be recycled to a new destination while the Tx request is posted.
 */
ibd_ace_t *
ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
{
    ibd_ace_t *ptr;

    ASSERT(mutex_owned(&state->id_ac_mutex));

    /*
     * Do hash search.
     */
    if (mod_hash_find(state->id_ah_active_hash,
        (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
        if (hold)
            INC_REF(ptr, num);
        return (ptr);
    }
    return (NULL);
}
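
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * a Tx-path style lookup that pins the AH for one outstanding send;
 * the reference is dropped later by Tx completion processing. Note
 * that ibd_acache_find() requires id_ac_mutex to be held.
 */
#ifdef IBD_EXAMPLES
static ibd_ace_t *
ibd_example_hold_ace(ibd_state_t *state, ipoib_mac_t *mac)
{
    ibd_ace_t *ace;

    mutex_enter(&state->id_ac_mutex);
    ace = ibd_acache_find(state, mac, B_TRUE, 1);
    mutex_exit(&state->id_ac_mutex);
    return (ace);
}
#endif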

/*
 * This is called by the tx side; if an initialized AH is found in
 * the active list, it is locked down and can be used; if no entry
 * is found, an async request is queued to do path resolution.
 */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
    ibd_ace_t *ptr;
    ibd_req_t *req;

    /*
     * Only attempt to print when we can; in the mdt pattr case, the
     * address is not aligned properly.
     */
    if (((ulong_t)mac & 3) == 0) {
        DPRINT(4,
            "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
            htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
            htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
            htonl(mac->ipoib_gidsuff[1]));
    }

    mutex_enter(&state->id_ac_mutex);

    if (((ptr = state->id_ac_hot_ace) != NULL) &&
        (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
        INC_REF(ptr, numwqe);
        mutex_exit(&state->id_ac_mutex);
        return (ptr);
    }
    if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
        state->id_ac_hot_ace = ptr;
        mutex_exit(&state->id_ac_mutex);
        return (ptr);
    }

    /*
     * Implementation of a single outstanding async request; if
     * the operation is not started yet, queue a request and move
     * to ongoing state. Remember in id_ah_addr for which address
     * we are queueing the request, in case we need to flag an error;
     * any further requests, for the same or different address, until
     * the operation completes, are sent back to GLDv3 to be retried.
     * The async thread will update id_ah_op with an error indication
     * or will set it to indicate the next look up can start; either
     * way, it will mac_tx_update() so that all blocked requests come
     * back here.
     */
    *err = EAGAIN;
    if (state->id_ah_op == IBD_OP_NOTSTARTED) {
        req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
        if (req != NULL) {
            /*
             * We did not even find the entry; queue a request
             * for it.
             */
            bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
            state->id_ah_op = IBD_OP_ONGOING;
            ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
            bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
        }
    } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
        (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
        /*
         * Check the status of the pathrecord lookup request
         * we had queued before.
         */
        if (state->id_ah_op == IBD_OP_ERRORED) {
            *err = EFAULT;
            state->id_ah_error++;
        } else {
            /*
             * IBD_OP_ROUTERED case: We need to send to the
             * all-router MCG. If we can find the AH for
             * the mcg, the Tx will be attempted. If we
             * do not find the AH, we return NORESOURCES
             * to retry.
             */
            ipoib_mac_t routermac;

            (void) ibd_get_allroutergroup(state, mac, &routermac);
            ptr = ibd_acache_find(state, &routermac, B_TRUE,
                numwqe);
        }
        state->id_ah_op = IBD_OP_NOTSTARTED;
    } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
        (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
        /*
         * This case can happen when we get a higher band
         * packet. The easiest way is to reset the state machine
         * to accommodate the higher priority packet.
         */
        state->id_ah_op = IBD_OP_NOTSTARTED;
    }
    mutex_exit(&state->id_ac_mutex);

    return (ptr);
}
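
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * how a Tx path consumes the lookup above. A NULL return with EAGAIN
 * means resolution was queued to the async thread and the packet
 * should be handed back to GLDv3 for retry; EFAULT is a hard failure.
 */
#ifdef IBD_EXAMPLES
static boolean_t
ibd_example_tx_resolve(ibd_state_t *state, ipoib_mac_t *dest)
{
    ibd_ace_t *ace;
    int err = 0;

    if ((ace = ibd_acache_lookup(state, dest, &err, 1)) == NULL) {
        DPRINT(5, "ibd_example_tx_resolve: deferred/failed, err=%d",
            err);
        return (B_FALSE);
    }

    /* ... post the send using ace->ac_dest ... */
    return (B_TRUE);
}
#endif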

/*
 * Grab a not-currently-in-use AH/PathRecord from the active
 * list to recycle to a new destination. Only the async thread
 * executes this code.
 */
static ibd_ace_t *
ibd_acache_get_unref(ibd_state_t *state)
{
    ibd_ace_t *ptr = list_tail(&state->id_ah_active);
    boolean_t try_rc_chan_recycle = B_FALSE;

    ASSERT(mutex_owned(&state->id_ac_mutex));

    /*
     * Do plain linear search.
     */
    while (ptr != NULL) {
        /*
         * Note that it is possible that the "cycle" bit
         * is set on the AH w/o any reference count. The
         * mcg must have been deleted, and the tx cleanup
         * just decremented the reference count to 0, but
         * hasn't gotten around to grabbing the id_ac_mutex
         * to move the AH into the free list.
         */
        if (GET_REF(ptr) == 0) {
            if (ptr->ac_chan != NULL) {
                ASSERT(state->id_enable_rc == B_TRUE);
                if (!try_rc_chan_recycle) {
                    try_rc_chan_recycle = B_TRUE;
                    ibd_rc_signal_ace_recycle(state, ptr);
                }
            } else {
                IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
                break;
            }
        }
        ptr = list_prev(&state->id_ah_active, ptr);
    }
    return (ptr);
}

/*
 * Invoked to clean up AH from active list in case of multicast
 * disable and to handle sendonly memberships during mcg traps.
 * And for port up processing for multicast and unicast AHs.
 * Normally, the AH is taken off the active list, and put into
 * the free list to be recycled for a new destination. In case
 * Tx requests on the AH have not completed yet, the AH is marked
 * for reaping (which will put the AH on the free list) once the Tx's
 * complete; in this case, depending on the "force" input, we take
 * out the AH from the active list right now, or leave it also for
 * the reap operation. Returns TRUE if the AH is taken off the active
 * list (and either put into the free list right now, or arranged for
 * later), FALSE otherwise.
 */
boolean_t
ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
{
    ibd_ace_t *acactive;
    boolean_t ret = B_TRUE;

    ASSERT(mutex_owned(&state->id_ac_mutex));

    if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {

        /*
         * Note that the AH might already have the cycle bit set
         * on it; this might happen if sequences of multicast
         * enables and disables are coming so fast, that posted
         * Tx's to the mcg have not completed yet, and the cycle
         * bit is set successively by each multicast disable.
         */
        if (SET_CYCLE_IF_REF(acactive)) {
            if (!force) {
                /*
                 * The ace is kept on the active list, further
                 * Tx's can still grab a reference on it; the
                 * ace is reaped when all pending Tx's
                 * referencing the AH complete.
                 */
                ret = B_FALSE;
            } else {
                /*
                 * In the mcg trap case, we always pull the
                 * AH from the active list. And also the port
                 * up multi/unicast case.
                 */
                ASSERT(acactive->ac_chan == NULL);
                IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
                acactive->ac_mce = NULL;
            }
        } else {
            /*
             * Determined the ref count is 0, thus reclaim
             * immediately after pulling out the ace from
             * the active list.
             */
            ASSERT(acactive->ac_chan == NULL);
            IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
            acactive->ac_mce = NULL;
            IBD_ACACHE_INSERT_FREE(state, acactive);
        }

    }
    return (ret);
}
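
/*
 * Illustrative sketch (hypothetical, under the IBD_EXAMPLES guard):
 * multicast-disable style recycling. With force == B_FALSE, an AH
 * that still has posted Tx's is only marked for reaping; it moves to
 * the free list when the last Tx completion drops its reference.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_recycle_ah(ibd_state_t *state, ipoib_mac_t *mac)
{
    mutex_enter(&state->id_ac_mutex);
    (void) ibd_acache_recycle(state, mac, B_FALSE);
    mutex_exit(&state->id_ac_mutex);
}
#endif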
1661 */ 1662 ASSERT(acactive->ac_chan == NULL); 1663 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1664 acactive->ac_mce = NULL; 1665 } 1666 } else { 1667 /* 1668 * Determined the ref count is 0, thus reclaim 1669 * immediately after pulling out the ace from 1670 * the active list. 1671 */ 1672 ASSERT(acactive->ac_chan == NULL); 1673 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1674 acactive->ac_mce = NULL; 1675 IBD_ACACHE_INSERT_FREE(state, acactive); 1676 } 1677 1678 } 1679 return (ret); 1680 } 1681 1682 /* 1683 * Helper function for async path record lookup. If we are trying to 1684 * Tx to a MCG, check our membership, possibly trying to join the 1685 * group if required. If that fails, try to send the packet to the 1686 * all router group (indicated by the redirect output), pointing 1687 * the input mac address to the router mcg address. 1688 */ 1689 static ibd_mce_t * 1690 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1691 { 1692 ib_gid_t mgid; 1693 ibd_mce_t *mce; 1694 ipoib_mac_t routermac; 1695 1696 *redirect = B_FALSE; 1697 ibd_n2h_gid(mac, &mgid); 1698 1699 /* 1700 * Check the FullMember+SendOnlyNonMember list. 1701 * Since we are the only one who manipulates the 1702 * id_mc_full list, no locks are needed. 1703 */ 1704 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1705 if (mce != NULL) { 1706 DPRINT(4, "ibd_async_mcache : already joined to group"); 1707 return (mce); 1708 } 1709 1710 /* 1711 * Not found; try to join(SendOnlyNonMember) and attach. 1712 */ 1713 DPRINT(4, "ibd_async_mcache : not joined to group"); 1714 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1715 NULL) { 1716 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1717 return (mce); 1718 } 1719 1720 /* 1721 * MCGroup not present; try to join the all-router group. If 1722 * any of the following steps succeed, we will be redirecting 1723 * to the all router group. 1724 */ 1725 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1726 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1727 return (NULL); 1728 *redirect = B_TRUE; 1729 ibd_n2h_gid(&routermac, &mgid); 1730 bcopy(&routermac, mac, IPOIB_ADDRL); 1731 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1732 mgid.gid_prefix, mgid.gid_guid); 1733 1734 /* 1735 * Are we already joined to the router group? 1736 */ 1737 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1738 DPRINT(4, "ibd_async_mcache : using already joined router" 1739 "group\n"); 1740 return (mce); 1741 } 1742 1743 /* 1744 * Can we join(SendOnlyNonMember) the router group? 1745 */ 1746 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1747 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1748 NULL) { 1749 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1750 return (mce); 1751 } 1752 1753 return (NULL); 1754 } 1755 1756 /* 1757 * Async path record lookup code. 1758 */ 1759 static void 1760 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1761 { 1762 ibd_ace_t *ce; 1763 ibd_mce_t *mce = NULL; 1764 ibt_path_attr_t path_attr; 1765 ibt_path_info_t path_info; 1766 ib_gid_t destgid; 1767 char ret = IBD_OP_NOTSTARTED; 1768 1769 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1770 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1771 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1772 htonl(mac->ipoib_gidsuff[1])); 1773 1774 /* 1775 * Check whether we are trying to transmit to a MCG. 1776 * In that case, we need to make sure we are a member of 1777 * the MCG. 
1778 */
1779 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1780 boolean_t redirected;
1781
1782 /*
1783 * If we cannot find or join the group, or even
1784 * redirect, error out.
1785 */
1786 if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1787 NULL) {
1788 state->id_ah_op = IBD_OP_ERRORED;
1789 return;
1790 }
1791
1792 /*
1793 * If we got redirected, we need to determine whether
1794 * the AH for the new mcg is already in the cache and,
1795 * if so, not pull it in again; otherwise proceed to get
1796 * the path for the new mcg. There is no guarantee that
1797 * if the AH is currently in the cache, it will still be
1798 * there when we look in ibd_acache_lookup(), but that's
1799 * okay, we will come back here.
1800 */
1801 if (redirected) {
1802 ret = IBD_OP_ROUTERED;
1803 DPRINT(4, "ibd_async_acache : redirected to "
1804 "%08X:%08X:%08X:%08X:%08X",
1805 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1806 htonl(mac->ipoib_gidpref[1]),
1807 htonl(mac->ipoib_gidsuff[0]),
1808 htonl(mac->ipoib_gidsuff[1]));
1809
1810 mutex_enter(&state->id_ac_mutex);
1811 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1812 state->id_ah_op = IBD_OP_ROUTERED;
1813 mutex_exit(&state->id_ac_mutex);
1814 DPRINT(4, "ibd_async_acache : router AH found");
1815 return;
1816 }
1817 mutex_exit(&state->id_ac_mutex);
1818 }
1819 }
1820
1821 /*
1822 * Get an AH from the free list.
1823 */
1824 mutex_enter(&state->id_ac_mutex);
1825 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1826 /*
1827 * No free ones; try to grab an unreferenced active
1828 * one. Maybe we need to make the active list LRU,
1829 * but that will create more work for Tx callbacks.
1830 * Is there a way of not having to pull out the
1831 * entry from the active list, but just indicate it
1832 * is being recycled? Yes, but that creates one more
1833 * check in the fast lookup path.
1834 */
1835 if ((ce = ibd_acache_get_unref(state)) == NULL) {
1836 /*
1837 * Pretty serious shortage now.
1838 */
1839 state->id_ah_op = IBD_OP_NOTSTARTED;
1840 mutex_exit(&state->id_ac_mutex);
1841 DPRINT(10, "ibd_async_acache : failed to find AH "
1842 "slot\n");
1843 return;
1844 }
1845 /*
1846 * We could check whether ac_mce points to a SendOnly
1847 * member and drop that membership now. Or do it lazily
1848 * at detach time.
1849 */
1850 ce->ac_mce = NULL;
1851 }
1852 mutex_exit(&state->id_ac_mutex);
1853 ASSERT(ce->ac_mce == NULL);
1854
1855 /*
1856 * Update the entry.
1857 */
1858 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1859
1860 bzero(&path_info, sizeof (path_info));
1861 bzero(&path_attr, sizeof (ibt_path_attr_t));
1862 path_attr.pa_sgid = state->id_sgid;
1863 path_attr.pa_num_dgids = 1;
1864 ibd_n2h_gid(&ce->ac_mac, &destgid);
1865 path_attr.pa_dgids = &destgid;
1866 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1867 path_attr.pa_pkey = state->id_pkey;
1868 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1869 &path_info, NULL) != IBT_SUCCESS) {
1870 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1871 goto error;
1872 }
1873 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1874 ntohl(ce->ac_mac.ipoib_qpn),
1875 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1876 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1877 goto error;
1878 }
1879
1880 /*
1881 * mce is set whenever an AH is being associated with a
1882 * MCG; this will come in handy when we leave the MCG. The
1883 * lock protects Tx fastpath from scanning the active list.
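 *
 * A sketch of how this back-pointer is consumed later (the real
 * consumers are the port up walk in ibd_async_link() and the Tx
 * completion reap path):
 *
 *	mce = ace->ac_mce;
 *	if ((mce != NULL) && mce->mc_fullreap)
 *		ibd_async_reap_group(state, mce,
 *		    mce->mc_info.mc_adds_vect.av_dgid, mce->mc_jstate);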
1884 */
1885 if (mce != NULL)
1886 ce->ac_mce = mce;
1887
1888 /*
1889 * initiate an RC mode connection for a unicast address
1890 */
1891 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1892 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1893 ASSERT(ce->ac_chan == NULL);
1894 DPRINT(10, "ibd_async_acache: call "
1895 "ibd_rc_try_connect(ace=%p)", ce);
1896 ibd_rc_try_connect(state, ce, &path_info);
1897 if (ce->ac_chan == NULL) {
1898 DPRINT(10, "ibd_async_acache: failed to set up RC"
1899 " channel");
1900 state->rc_conn_fail++;
1901 goto error;
1902 }
1903 }
1904
1905 mutex_enter(&state->id_ac_mutex);
1906 IBD_ACACHE_INSERT_ACTIVE(state, ce);
1907 state->id_ah_op = ret;
1908 mutex_exit(&state->id_ac_mutex);
1909 return;
1910 error:
1911 /*
1912 * We might want to drop SendOnly membership here if we
1913 * joined above. The lock protects Tx callbacks inserting
1914 * into the free list.
1915 */
1916 mutex_enter(&state->id_ac_mutex);
1917 state->id_ah_op = IBD_OP_ERRORED;
1918 IBD_ACACHE_INSERT_FREE(state, ce);
1919 mutex_exit(&state->id_ac_mutex);
1920 }
1921
1922 /*
1923 * While restoring port's presence on the subnet on a port up, it is possible
1924 * that the port goes down again.
1925 */
1926 static void
1927 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1928 {
1929 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1930 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1931 LINK_STATE_UP;
1932 ibd_mce_t *mce, *pmce;
1933 ibd_ace_t *ace, *pace;
1934
1935 DPRINT(10, "ibd_async_link(): %d", opcode);
1936
1937 /*
1938 * On a link up, revalidate the link speed/width. No point doing
1939 * this on a link down, since we will be unable to do SA operations
1940 * and will default to the lowest speed. Also notice that we update
1941 * our notion of speed before calling mac_link_update(), which will
1942 * do the necessary higher level notifications for speed changes.
1943 */
1944 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1945 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1946 state->id_link_speed = ibd_get_portspeed(state);
1947 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1948 }
1949
1950 /*
1951 * Do all the work required to establish our presence on
1952 * the subnet.
1953 */
1954 if (opcode == IBD_LINK_UP_ABSENT) {
1955 /*
1956 * If in promiscuous mode ...
1957 */
1958 if (state->id_prom_op == IBD_OP_COMPLETED) {
1959 /*
1960 * Drop all nonmembership.
1961 */
1962 ibd_async_unsetprom(state);
1963
1964 /*
1965 * Then, try to regain nonmembership to all mcg's.
1966 */
1967 ibd_async_setprom(state);
1968
1969 }
1970
1971 /*
1972 * Drop all sendonly membership (which also gets rid of the
1973 * AHs); try to reacquire all full membership.
1974 */
1975 mce = list_head(&state->id_mc_full);
1976 while ((pmce = mce) != NULL) {
1977 mce = list_next(&state->id_mc_full, mce);
1978 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1979 ibd_leave_group(state,
1980 pmce->mc_info.mc_adds_vect.av_dgid,
1981 IB_MC_JSTATE_SEND_ONLY_NON);
1982 else
1983 ibd_reacquire_group(state, pmce);
1984 }
1985
1986 /*
1987 * Recycle all active AHs to free list (and if there are
1988 * pending posts, make sure they will go into the free list
1989 * once the Tx's complete). Grab the lock to prevent
1990 * concurrent Tx's as well as Tx cleanups.
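 *
 * Schematically (a condensed view of the walk coded below):
 *
 *	for each ace on id_ah_active:
 *		RC channel established	-> signal async active-side close
 *		otherwise		-> ibd_acache_recycle(..., B_TRUE)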
1991 */
1992 mutex_enter(&state->id_ac_mutex);
1993 ace = list_head(&state->id_ah_active);
1994 while ((pace = ace) != NULL) {
1995 boolean_t cycled = B_TRUE;
1996
1997 ace = list_next(&state->id_ah_active, ace);
1998 mce = pace->ac_mce;
1999 if (pace->ac_chan != NULL) {
2000 ASSERT(mce == NULL);
2001 ASSERT(state->id_enable_rc == B_TRUE);
2002 if (pace->ac_chan->chan_state ==
2003 IBD_RC_STATE_ACT_ESTAB) {
2004 INC_REF(pace, 1);
2005 IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2006 pace->ac_chan->chan_state =
2007 IBD_RC_STATE_ACT_CLOSING;
2008 ibd_rc_signal_act_close(state, pace);
2009 } else {
2010 state->rc_act_close_simultaneous++;
2011 DPRINT(40, "ibd_async_link: other "
2012 "thread is closing it, ace=%p, "
2013 "ac_chan=%p, chan_state=%d",
2014 pace, pace->ac_chan,
2015 pace->ac_chan->chan_state);
2016 }
2017 } else {
2018 cycled = ibd_acache_recycle(state,
2019 &pace->ac_mac, B_TRUE);
2020 }
2021 /*
2022 * If this is for an mcg, it must be for a fullmember,
2023 * since we got rid of send-only members above when
2024 * processing the mce list.
2025 */
2026 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2027 IB_MC_JSTATE_FULL)));
2028
2029 /*
2030 * Check if the fullmember mce needs to be torn down,
2031 * ie whether the DLPI disable has already been done.
2032 * If so, do some of the work of tx_cleanup, namely
2033 * causing leave (which will fail), detach and
2034 * mce-freeing. tx_cleanup will put the AH into free
2035 * list. The reason to duplicate some of this
2036 * tx_cleanup work is because we want to delete the
2037 * AH right now instead of waiting for tx_cleanup, to
2038 * force subsequent Tx's to reacquire an AH.
2039 */
2040 if ((mce != NULL) && (mce->mc_fullreap))
2041 ibd_async_reap_group(state, mce,
2042 mce->mc_info.mc_adds_vect.av_dgid,
2043 mce->mc_jstate);
2044 }
2045 mutex_exit(&state->id_ac_mutex);
2046 }
2047
2048 /*
2049 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2050 * (which stops further events from being delivered) before
2051 * mac_unregister(). At this point, it is guaranteed that mac_register()
2052 * has already been done.
2053 */
2054 mutex_enter(&state->id_link_mutex);
2055 state->id_link_state = lstate;
2056 mac_link_update(state->id_mh, lstate);
2057 mutex_exit(&state->id_link_mutex);
2058
2059 ibd_async_done(state);
2060 }
2061
2062 /*
2063 * Check the pkey table to see if we can find the pkey we're looking for.
2064 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2065 * failure.
2066 */
2067 static int
2068 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2069 uint16_t *pkix)
2070 {
2071 uint16_t ndx;
2072
2073 ASSERT(pkix != NULL);
2074
2075 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2076 if (pkey_tbl[ndx] == pkey) {
2077 *pkix = ndx;
2078 return (0);
2079 }
2080 }
2081 return (-1);
2082 }
2083
2084 /*
2085 * Late HCA Initialization:
2086 * If plumb succeeded without an active port or the pkey being available,
2087 * and their availability is now being indicated via PORT_UP or
2088 * PORT_CHANGE respectively, try to start the interface.
2089 *
2090 * Normal Operation:
2091 * When the link is notified up, we need to do a few things, based
2092 * on whether the port's current p_init_type_reply claims a reinit
2093 * has been done or not. The reinit steps are:
2094 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2095 * the old Pkey and GID0 are correct.
2096 * 2. Register for mcg traps (already done by ibmf).
2097 * 3.
If PreservePresenceReply indicates the SM has restored port's presence
2098 * in subnet, nothing more to do. Else go to next steps (on async daemon).
2099 * 4. Give up all sendonly memberships.
2100 * 5. Acquire all full memberships.
2101 * 6. In promiscuous mode, acquire all non memberships.
2102 * 7. Recycle all AHs to free list.
2103 */
2104 static void
2105 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2106 {
2107 ibt_hca_portinfo_t *port_infop = NULL;
2108 ibt_status_t ibt_status;
2109 uint_t psize, port_infosz;
2110 ibd_link_op_t opcode;
2111 ibd_req_t *req;
2112 link_state_t new_link_state = LINK_STATE_UP;
2113 uint8_t itreply;
2114 uint16_t pkix;
2115 int ret;
2116
2117 /*
2118 * Let's not race with a plumb or an unplumb; if we detect a
2119 * pkey relocation event later on here, we may have to restart.
2120 */
2121 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2122
2123 mutex_enter(&state->id_link_mutex);
2124
2125 /*
2126 * If the link state is unknown, a plumb has not yet been attempted
2127 * on the interface. Nothing to do.
2128 */
2129 if (state->id_link_state == LINK_STATE_UNKNOWN) {
2130 mutex_exit(&state->id_link_mutex);
2131 goto link_mod_return;
2132 }
2133
2134 /*
2135 * If the link state is down, and we are neither in late HCA init
2136 * nor successfully started, nothing to do.
2137 */
2138 if ((state->id_link_state == LINK_STATE_DOWN) &&
2139 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2140 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2141 mutex_exit(&state->id_link_mutex);
2142 goto link_mod_return;
2143 }
2144
2145 /*
2146 * If this routine was called in response to a port down event,
2147 * we just need to see whether this state change should be reported.
2148 */
2149 if (code == IBT_ERROR_PORT_DOWN) {
2150 new_link_state = LINK_STATE_DOWN;
2151 goto update_link_state;
2152 }
2153
2154 /*
2155 * If it's not a port down event we've received, try to get the port
2156 * attributes first. If we fail here, the port is as good as down.
2157 * Otherwise, if the link went down by the time the handler gets
2158 * here, give up - we cannot even validate the pkey/gid since those
2159 * are not valid and this is as bad as a port down anyway.
2160 */
2161 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2162 &port_infop, &psize, &port_infosz);
2163 if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2164 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2165 new_link_state = LINK_STATE_DOWN;
2166 goto update_link_state;
2167 }
2168
2169 /*
2170 * If in the previous attempt, the pkey was not found either due to the
2171 * port state being down, or due to its absence in the pkey table,
2172 * look for it now and try to start the interface.
2173 */
2174 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2175 mutex_exit(&state->id_link_mutex);
2176 if ((ret = ibd_start(state)) != 0) {
2177 DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2178 "init, ret=%d", ret);
2179 }
2180 ibt_free_portinfo(port_infop, port_infosz);
2181 goto link_mod_return;
2182 }
2183
2184 /*
2185 * Check the SM InitTypeReply flags. If both NoLoadReply and
2186 * PreserveContentReply are 0, we don't know anything about the
2187 * data loaded into the port attributes, so we need to verify
2188 * whether gid0 and the pkey are still valid.
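 *
 * Schematically (a condensed summary of the checks coded below):
 *
 *	sgid (gid0) changed		-> report link down
 *	pkey found at the old index	-> report link up
 *	pkey relocated in the table	-> restart the interface
 *	pkey gone from the table	-> report link down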
2189 */ 2190 itreply = port_infop->p_init_type_reply; 2191 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2192 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2193 /* 2194 * Check to see if the subnet part of GID0 has changed. If 2195 * not, check the simple case first to see if the pkey 2196 * index is the same as before; finally check to see if the 2197 * pkey has been relocated to a different index in the table. 2198 */ 2199 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2200 if (bcmp(port_infop->p_sgid_tbl, 2201 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2202 2203 new_link_state = LINK_STATE_DOWN; 2204 2205 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2206 state->id_pkey) { 2207 2208 new_link_state = LINK_STATE_UP; 2209 2210 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2211 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2212 2213 ibt_free_portinfo(port_infop, port_infosz); 2214 mutex_exit(&state->id_link_mutex); 2215 2216 /* 2217 * Currently a restart is required if our pkey has moved 2218 * in the pkey table. If we get the ibt_recycle_ud() to 2219 * work as documented (expected), we may be able to 2220 * avoid a complete restart. Note that we've already 2221 * marked both the start and stop 'in-progress' flags, 2222 * so it is ok to go ahead and do this restart. 2223 */ 2224 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2225 if ((ret = ibd_start(state)) != 0) { 2226 DPRINT(10, "ibd_restart: cannot restart, " 2227 "ret=%d", ret); 2228 } 2229 2230 goto link_mod_return; 2231 } else { 2232 new_link_state = LINK_STATE_DOWN; 2233 } 2234 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2235 } 2236 2237 update_link_state: 2238 if (port_infop) { 2239 ibt_free_portinfo(port_infop, port_infosz); 2240 } 2241 2242 /* 2243 * If we're reporting a link up, check InitTypeReply to see if 2244 * the SM has ensured that the port's presence in mcg, traps, 2245 * etc. is intact. 2246 */ 2247 if (new_link_state == LINK_STATE_DOWN) { 2248 opcode = IBD_LINK_DOWN; 2249 } else { 2250 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2251 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2252 opcode = IBD_LINK_UP; 2253 } else { 2254 opcode = IBD_LINK_UP_ABSENT; 2255 } 2256 } 2257 2258 /* 2259 * If the old state is the same as the new state, and the SM indicated 2260 * no change in the port parameters, nothing to do. 2261 */ 2262 if ((state->id_link_state == new_link_state) && (opcode != 2263 IBD_LINK_UP_ABSENT)) { 2264 mutex_exit(&state->id_link_mutex); 2265 goto link_mod_return; 2266 } 2267 2268 /* 2269 * Ok, so there was a link state change; see if it's safe to ask 2270 * the async thread to do the work 2271 */ 2272 if (!ibd_async_safe(state)) { 2273 state->id_link_state = new_link_state; 2274 mutex_exit(&state->id_link_mutex); 2275 goto link_mod_return; 2276 } 2277 2278 mutex_exit(&state->id_link_mutex); 2279 2280 /* 2281 * Queue up a request for ibd_async_link() to handle this link 2282 * state change event 2283 */ 2284 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2285 req->rq_ptr = (void *)opcode; 2286 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2287 2288 link_mod_return: 2289 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2290 } 2291 2292 /* 2293 * For the port up/down events, IBTL guarantees there will not be concurrent 2294 * invocations of the handler. 
IBTL might coalesce link transition events,
2295 * and not invoke the handler for _each_ up/down transition, but it will
2296 * invoke the handler with the last known state.
2297 */
2298 static void
2299 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2300 ibt_async_code_t code, ibt_async_event_t *event)
2301 {
2302 ibd_state_t *state = (ibd_state_t *)clnt_private;
2303
2304 switch (code) {
2305 case IBT_ERROR_CATASTROPHIC_CHAN:
2306 ibd_print_warn(state, "catastrophic channel error");
2307 break;
2308 case IBT_ERROR_CQ:
2309 ibd_print_warn(state, "completion queue error");
2310 break;
2311 case IBT_PORT_CHANGE_EVENT:
2312 /*
2313 * Events will be delivered to all instances that have
2314 * done ibt_open_hca() but not yet done ibt_close_hca().
2315 * Only need to do work for our port; IBTF will deliver
2316 * events for other ports on the hca we have ibt_open_hca'ed
2317 * too. Note that id_port is initialized in ibd_attach()
2318 * before we do an ibt_open_hca() in ibd_attach().
2319 */
2320 ASSERT(state->id_hca_hdl == hca_hdl);
2321 if (state->id_port != event->ev_port)
2322 break;
2323
2324 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2325 IBT_PORT_CHANGE_PKEY) {
2326 ibd_link_mod(state, code);
2327 }
2328 break;
2329 case IBT_ERROR_PORT_DOWN:
2330 case IBT_CLNT_REREG_EVENT:
2331 case IBT_EVENT_PORT_UP:
2332 /*
2333 * Events will be delivered to all instances that have
2334 * done ibt_open_hca() but not yet done ibt_close_hca().
2335 * Only need to do work for our port; IBTF will deliver
2336 * events for other ports on the hca we have ibt_open_hca'ed
2337 * too. Note that id_port is initialized in ibd_attach()
2338 * before we do an ibt_open_hca() in ibd_attach().
2339 */
2340 ASSERT(state->id_hca_hdl == hca_hdl);
2341 if (state->id_port != event->ev_port)
2342 break;
2343
2344 ibd_link_mod(state, code);
2345 break;
2346
2347 case IBT_HCA_ATTACH_EVENT:
2348 case IBT_HCA_DETACH_EVENT:
2349 /*
2350 * When a new card is plugged into the system, attach_event is
2351 * invoked. Additionally, a cfgadm needs to be run to make the
2352 * card known to the system, and an ifconfig needs to be run to
2353 * plumb up any ibd interfaces on the card. In the case of card
2354 * unplug, a cfgadm is run that will trigger any RCM scripts to
2355 * unplumb the ibd interfaces on the card; when the card is
2356 * actually unplugged, the detach_event is invoked;
2357 * additionally, if any ibd instances are still active on the
2358 * card (eg there were no associated RCM scripts), the driver's
2359 * detach routine is invoked.
2360 */
2361 break;
2362 default:
2363 break;
2364 }
2365 }
2366
2367 static int
2368 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2369 {
2370 mac_register_t *macp;
2371 int ret;
2372
2373 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2374 DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2375 return (DDI_FAILURE);
2376 }
2377
2378 /*
2379 * Note that when we register with mac during attach, we don't
2380 * have the id_macaddr yet, so we'll simply be registering a
2381 * zero macaddr that we'll overwrite later during plumb (in
2382 * ibd_m_start()). Similar is the case with id_mtu - we'll
2383 * update the mac layer with the correct mtu during plumb.
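 *
 * A sketch of that plumb-time fixup (the actual calls live in the
 * start path elsewhere in this driver; 'new_sdu' is a placeholder
 * for the value computed there):
 *
 *	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
 *	(void) mac_maxsdu_update(state->id_mh, new_sdu);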
2384 */ 2385 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2386 macp->m_driver = state; 2387 macp->m_dip = dip; 2388 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2389 macp->m_callbacks = &ibd_m_callbacks; 2390 macp->m_min_sdu = 0; 2391 if (state->id_type == IBD_PORT_DRIVER) { 2392 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 2393 } else if (state->id_enable_rc) { 2394 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2395 } else { 2396 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2397 } 2398 macp->m_priv_props = ibd_priv_props; 2399 2400 /* 2401 * Register ourselves with the GLDv3 interface 2402 */ 2403 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2404 mac_free(macp); 2405 DPRINT(10, 2406 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2407 return (DDI_FAILURE); 2408 } 2409 2410 mac_free(macp); 2411 return (DDI_SUCCESS); 2412 } 2413 2414 static int 2415 ibd_record_capab(ibd_state_t *state) 2416 { 2417 ibt_hca_attr_t hca_attrs; 2418 ibt_status_t ibt_status; 2419 2420 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2421 2422 /* 2423 * Query the HCA and fetch its attributes 2424 */ 2425 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2426 ASSERT(ibt_status == IBT_SUCCESS); 2427 2428 /* 2429 * 1. Set the Hardware Checksum capability. Currently we only consider 2430 * full checksum offload. 2431 */ 2432 if (state->id_enable_rc) { 2433 state->id_hwcksum_capab = 0; 2434 } else { 2435 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2436 == IBT_HCA_CKSUM_FULL) { 2437 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2438 } 2439 } 2440 2441 /* 2442 * 2. Set LSO policy, capability and maximum length 2443 */ 2444 if (state->id_enable_rc) { 2445 state->id_lso_capable = B_FALSE; 2446 state->id_lso_maxlen = 0; 2447 } else { 2448 if (hca_attrs.hca_max_lso_size > 0) { 2449 state->id_lso_capable = B_TRUE; 2450 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2451 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2452 else 2453 state->id_lso_maxlen = 2454 hca_attrs.hca_max_lso_size; 2455 } else { 2456 state->id_lso_capable = B_FALSE; 2457 state->id_lso_maxlen = 0; 2458 } 2459 } 2460 2461 /* 2462 * 3. Set Reserved L_Key capability 2463 */ 2464 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2465 state->id_hca_res_lkey_capab = 1; 2466 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2467 state->rc_enable_iov_map = B_TRUE; 2468 } else { 2469 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2470 state->rc_enable_iov_map = B_FALSE; 2471 } 2472 2473 /* 2474 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2475 * size information is provided by the hca 2476 */ 2477 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2478 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2479 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2480 } else { 2481 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2482 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2483 } 2484 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2485 state->id_max_sqseg = IBD_MAX_SQSEG; 2486 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2487 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2488 state->id_max_sqseg, IBD_MAX_SQSEG); 2489 } 2490 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2491 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2492 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2493 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2494 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2495 } 2496 2497 /* 2498 * Translating the virtual address regions into physical regions 2499 * for using the Reserved LKey feature results in a wr sgl that 2500 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2501 * we'll fix a high-water mark (65%) for when we should stop. 2502 */ 2503 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2504 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2505 2506 /* 2507 * 5. Set number of recv and send wqes after checking hca maximum 2508 * channel size. Store the max channel size in the state so that it 2509 * can be referred to when the swqe/rwqe change is requested via 2510 * dladm. 2511 */ 2512 2513 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; 2514 2515 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) 2516 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; 2517 2518 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - 2519 IBD_RWQE_MIN; 2520 2521 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) 2522 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; 2523 2524 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2525 2526 return (DDI_SUCCESS); 2527 } 2528 2529 static int 2530 ibd_part_busy(ibd_state_t *state) 2531 { 2532 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2533 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n"); 2534 return (DDI_FAILURE); 2535 } 2536 2537 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2538 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n"); 2539 return (DDI_FAILURE); 2540 } 2541 2542 /* 2543 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is 2544 * connecting to a remote IPoIB port. We can't remove this port. 
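 *
 * For illustration only (a sketch; the delete-part ioctl path lives
 * elsewhere in this driver, so the exact call site here is an
 * assumption):
 *
 *	if (ibd_part_busy(state) != DDI_SUCCESS)
 *		return (EBUSY);		-- refuse to delete this object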
2545 */
2546 if (state->id_ah_op == IBD_OP_ONGOING) {
2547 DPRINT(10, "ibd_part_busy: failed: connecting\n");
2548 return (DDI_FAILURE);
2549 }
2550
2551 return (DDI_SUCCESS);
2552 }
2553
2554
2555 static void
2556 ibd_part_unattach(ibd_state_t *state)
2557 {
2558 uint32_t progress = state->id_mac_state;
2559 ibt_status_t ret;
2560
2561 /* make sure rx resources are freed */
2562 ibd_free_rx_rsrcs(state);
2563
2564 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2565 ASSERT(state->id_enable_rc);
2566 ibd_rc_fini_srq_list(state);
2567 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2568 }
2569
2570 if (progress & IBD_DRV_MAC_REGISTERED) {
2571 (void) mac_unregister(state->id_mh);
2572 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2573 }
2574
2575 if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2576 /*
2577 * No new async requests will be posted since the device
2578 * link state has been marked as unknown; completion handlers
2579 * have been turned off, so the Tx handler will not cause any
2580 * more IBD_ASYNC_REAP requests.
2581 *
2582 * Queue a request for the async thread to exit, which will
2583 * be serviced after any pending ones. This can take a while,
2584 * especially if the SM is unreachable, since IBMF will slowly
2585 * timeout each SM request issued by the async thread. Reap
2586 * the thread before continuing on; we do not want it to be
2587 * lingering in modunloaded code.
2588 */
2589 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2590 thread_join(state->id_async_thrid);
2591
2592 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2593 }
2594
2595 if (progress & IBD_DRV_REQ_LIST_INITED) {
2596 list_destroy(&state->id_req_list);
2597 mutex_destroy(&state->id_acache_req_lock);
2598 cv_destroy(&state->id_acache_req_cv);
2599 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2600 }
2601
2602 if (progress & IBD_DRV_PD_ALLOCD) {
2603 if ((ret = ibt_free_pd(state->id_hca_hdl,
2604 state->id_pd_hdl)) != IBT_SUCCESS) {
2605 ibd_print_warn(state, "failed to free "
2606 "protection domain, ret=%d", ret);
2607 }
2608 state->id_pd_hdl = NULL;
2609 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2610 }
2611
2612 if (progress & IBD_DRV_HCA_OPENED) {
2613 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2614 IBT_SUCCESS) {
2615 ibd_print_warn(state, "failed to close "
2616 "HCA device, ret=%d", ret);
2617 }
2618 state->id_hca_hdl = NULL;
2619 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2620 }
2621
2622 mutex_enter(&ibd_gstate.ig_mutex);
2623 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2624 if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2625 IBT_SUCCESS) {
2626 ibd_print_warn(state,
2627 "ibt_detach() failed, ret=%d", ret);
2628 }
2629 state->id_ibt_hdl = NULL;
2630 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2631 ibd_gstate.ig_ibt_hdl_ref_cnt--;
2632 }
2633 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2634 (ibd_gstate.ig_ibt_hdl != NULL)) {
2635 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2636 IBT_SUCCESS) {
2637 ibd_print_warn(state, "ibt_detach(): global "
2638 "failed, ret=%d", ret);
2639 }
2640 ibd_gstate.ig_ibt_hdl = NULL;
2641 }
2642 mutex_exit(&ibd_gstate.ig_mutex);
2643
2644 if (progress & IBD_DRV_TXINTR_ADDED) {
2645 ddi_remove_softintr(state->id_tx);
2646 state->id_tx = NULL;
2647 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2648 }
2649
2650 if (progress & IBD_DRV_RXINTR_ADDED) {
2651 ddi_remove_softintr(state->id_rx);
2652 state->id_rx = NULL;
2653 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2654 }
2655
2656 #ifdef DEBUG
2657 if (progress &
IBD_DRV_RC_PRIVATE_STATE) {
2658 kstat_delete(state->rc_ksp);
2659 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2660 }
2661 #endif
2662
2663 if (progress & IBD_DRV_STATE_INITIALIZED) {
2664 ibd_state_fini(state);
2665 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2666 }
2667 }
2668
2669 int
2670 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2671 {
2672 ibt_status_t ret;
2673 int rv;
2674 kthread_t *kht;
2675
2676 /*
2677 * Initialize mutexes and condition variables
2678 */
2679 if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2680 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2681 return (DDI_FAILURE);
2682 }
2683 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2684
2685 /*
2686 * Allocate the rx and tx softintrs
2687 */
2688 if (ibd_rx_softintr == 1) {
2689 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2690 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2691 DPRINT(10, "ibd_part_attach: failed in "
2692 "ddi_add_softintr(id_rx), ret=%d", rv);
2693 return (DDI_FAILURE);
2694 }
2695 state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2696 }
2697 if (ibd_tx_softintr == 1) {
2698 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2699 NULL, NULL, ibd_tx_recycle,
2700 (caddr_t)state)) != DDI_SUCCESS) {
2701 DPRINT(10, "ibd_part_attach: failed in "
2702 "ddi_add_softintr(id_tx), ret=%d", rv);
2703 return (DDI_FAILURE);
2704 }
2705 state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2706 }
2707
2708 /*
2709 * Attach to IBTL
2710 */
2711 mutex_enter(&ibd_gstate.ig_mutex);
2712 if (ibd_gstate.ig_ibt_hdl == NULL) {
2713 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2714 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2715 DPRINT(10, "ibd_part_attach: global: failed in "
2716 "ibt_attach(), ret=%d", ret);
2717 mutex_exit(&ibd_gstate.ig_mutex);
2718 return (DDI_FAILURE);
2719 }
2720 }
2721 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2722 &state->id_ibt_hdl)) != IBT_SUCCESS) {
2723 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2724 ret);
2725 mutex_exit(&ibd_gstate.ig_mutex);
2726 return (DDI_FAILURE);
2727 }
2728 ibd_gstate.ig_ibt_hdl_ref_cnt++;
2729 mutex_exit(&ibd_gstate.ig_mutex);
2730 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2731
2732 /*
2733 * Open the HCA
2734 */
2735 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2736 &state->id_hca_hdl)) != IBT_SUCCESS) {
2737 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2738 ret);
2739 return (DDI_FAILURE);
2740 }
2741 state->id_mac_state |= IBD_DRV_HCA_OPENED;
2742
2743 #ifdef DEBUG
2744 /* Initialize Driver Counters for Reliable Connected Mode */
2745 if (state->id_enable_rc) {
2746 if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2747 DPRINT(10, "ibd_part_attach: failed in "
2748 "ibd_rc_init_stats");
2749 return (DDI_FAILURE);
2750 }
2751 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2752 }
2753 #endif
2754
2755 /*
2756 * Record capabilities
2757 */
2758 (void) ibd_record_capab(state);
2759
2760 /*
2761 * Allocate a protection domain on the HCA
2762 */
2763 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2764 &state->id_pd_hdl)) != IBT_SUCCESS) {
2765 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2766 ret);
2767 return (DDI_FAILURE);
2768 }
2769 state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2770
2771
2772 /*
2773 * We need to initialize the req_list that is required for the
2774 * operation of the async_thread.
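 *
 * A sketch of the producer side of that list (this mirrors the
 * IBD_ASYNC_LINK queuing done in ibd_link_mod() above; the async
 * thread in ibd_async_work() is the sole consumer):
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
 *	req->rq_ptr = (void *)opcode;
 *	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);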
2775 */ 2776 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 2777 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 2778 list_create(&state->id_req_list, sizeof (ibd_req_t), 2779 offsetof(ibd_req_t, rq_list)); 2780 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; 2781 2782 /* 2783 * Create the async thread; thread_create never fails. 2784 */ 2785 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 2786 TS_RUN, minclsyspri); 2787 state->id_async_thrid = kht->t_did; 2788 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 2789 2790 return (DDI_SUCCESS); 2791 } 2792 2793 /* 2794 * Attach device to the IO framework. 2795 */ 2796 static int 2797 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2798 { 2799 int ret; 2800 2801 switch (cmd) { 2802 case DDI_ATTACH: 2803 ret = ibd_port_attach(dip); 2804 break; 2805 default: 2806 ret = DDI_FAILURE; 2807 break; 2808 } 2809 return (ret); 2810 } 2811 2812 /* 2813 * Detach device from the IO framework. 2814 */ 2815 static int 2816 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2817 { 2818 ibd_state_t *state; 2819 int instance; 2820 2821 /* 2822 * IBD doesn't support suspend/resume 2823 */ 2824 if (cmd != DDI_DETACH) 2825 return (DDI_FAILURE); 2826 2827 /* 2828 * Get the instance softstate 2829 */ 2830 instance = ddi_get_instance(dip); 2831 state = ddi_get_soft_state(ibd_list, instance); 2832 2833 /* 2834 * Release all resources we're holding still. Note that if we'd 2835 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2836 * so far, we should find all the flags we need in id_mac_state. 2837 */ 2838 return (ibd_port_unattach(state, dip)); 2839 } 2840 2841 /* 2842 * Pre ibt_attach() driver initialization 2843 */ 2844 static int 2845 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2846 { 2847 char buf[64]; 2848 2849 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2850 state->id_link_state = LINK_STATE_UNKNOWN; 2851 2852 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2853 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2854 state->id_trap_stop = B_TRUE; 2855 state->id_trap_inprog = 0; 2856 2857 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2858 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2859 state->id_dip = dip; 2860 2861 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2862 2863 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2864 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2865 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2866 state->id_tx_busy = 0; 2867 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2868 2869 state->id_rx_list.dl_bufs_outstanding = 0; 2870 state->id_rx_list.dl_cnt = 0; 2871 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2872 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2873 (void) sprintf(buf, "ibd_req%d_%x", ddi_get_instance(dip), 2874 state->id_pkey); 2875 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2876 0, NULL, NULL, NULL, NULL, NULL, 0); 2877 2878 /* For Reliable Connected Mode */ 2879 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2880 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2881 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2882 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2883 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2884 MUTEX_DRIVER, NULL); 2885 
mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2886
2887 /*
2888 * Make RC the default link mode. If this fails during connection
2889 * setup, the link mode is automatically transitioned to UD.
2890 * Also set the RC MTU.
2891 */
2892 state->id_enable_rc = IBD_DEF_LINK_MODE;
2893 state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2894 state->id_mtu = IBD_DEF_MAX_MTU;
2895
2896 /* Initialize all tunables to their defaults */
2897 state->id_lso_policy = IBD_DEF_LSO_POLICY;
2898 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2899 state->id_num_ah = IBD_DEF_NUM_AH;
2900 state->id_hash_size = IBD_DEF_HASH_SIZE;
2901 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2902 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2903 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2904 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2905 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2906 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2907 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2908 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2909 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2910 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2911 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2912 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2913 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2914 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2915 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2916 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2917 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2918 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2919 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2920 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2921
2922 return (DDI_SUCCESS);
2923 }
2924
2925 /*
2926 * Post ibt_detach() driver deconstruction
2927 */
2928 static void
2929 ibd_state_fini(ibd_state_t *state)
2930 {
2931 kmem_cache_destroy(state->id_req_kmc);
2932
2933 mutex_destroy(&state->id_rx_list.dl_mutex);
2934 mutex_destroy(&state->id_rx_free_list.dl_mutex);
2935
2936 mutex_destroy(&state->id_txpost_lock);
2937 mutex_destroy(&state->id_tx_list.dl_mutex);
2938 mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2939 mutex_destroy(&state->id_lso_lock);
2940
2941 mutex_destroy(&state->id_sched_lock);
2942 mutex_destroy(&state->id_scq_poll_lock);
2943 mutex_destroy(&state->id_rcq_poll_lock);
2944
2945 cv_destroy(&state->id_trap_cv);
2946 mutex_destroy(&state->id_trap_lock);
2947 mutex_destroy(&state->id_link_mutex);
2948
2949 /* For Reliable Connected Mode */
2950 mutex_destroy(&state->rc_timeout_lock);
2951 mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2952 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2953 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2954 mutex_destroy(&state->rc_tx_large_bufs_lock);
2955 mutex_destroy(&state->rc_rx_lock);
2956 }
2957
2958 /*
2959 * Fetch link speed from SA for snmp ifspeed reporting.
2960 */
2961 static uint64_t
2962 ibd_get_portspeed(ibd_state_t *state)
2963 {
2964 int ret;
2965 ibt_path_info_t path;
2966 ibt_path_attr_t path_attr;
2967 uint8_t num_paths;
2968 uint64_t ifspeed;
2969
2970 /*
2971 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2972 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2973 * 2000000000. Start with that as default.
2974 */
2975 ifspeed = 2000000000;
2976
2977 bzero(&path_attr, sizeof (path_attr));
2978
2979 /*
2980 * Get the port speed from Loopback path information.
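 *
 * Worked example (for illustration): a 4X DDR link is reported by
 * the SA as IBT_SRATE_20, which the switch below maps to a factor
 * of 8, giving ifspeed = 2000000000 * 8 = 16000000000 (16 Gbps of
 * data).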
2981 */ 2982 path_attr.pa_dgids = &state->id_sgid; 2983 path_attr.pa_num_dgids = 1; 2984 path_attr.pa_sgid = state->id_sgid; 2985 2986 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2987 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2988 goto earlydone; 2989 2990 if (num_paths < 1) 2991 goto earlydone; 2992 2993 /* 2994 * In case SA does not return an expected value, report the default 2995 * speed as 1X. 2996 */ 2997 ret = 1; 2998 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2999 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 3000 ret = 1; 3001 break; 3002 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 3003 ret = 4; 3004 break; 3005 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 3006 ret = 12; 3007 break; 3008 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 3009 ret = 2; 3010 break; 3011 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 3012 ret = 8; 3013 break; 3014 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 3015 ret = 16; 3016 break; 3017 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 3018 ret = 24; 3019 break; 3020 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 3021 ret = 32; 3022 break; 3023 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 3024 ret = 48; 3025 break; 3026 } 3027 3028 ifspeed *= ret; 3029 3030 earlydone: 3031 return (ifspeed); 3032 } 3033 3034 /* 3035 * Search input mcg list (id_mc_full or id_mc_non) for an entry 3036 * representing the input mcg mgid. 3037 */ 3038 static ibd_mce_t * 3039 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 3040 { 3041 ibd_mce_t *ptr = list_head(mlist); 3042 3043 /* 3044 * Do plain linear search. 3045 */ 3046 while (ptr != NULL) { 3047 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 3048 sizeof (ib_gid_t)) == 0) 3049 return (ptr); 3050 ptr = list_next(mlist, ptr); 3051 } 3052 return (NULL); 3053 } 3054 3055 /* 3056 * Execute IBA JOIN. 3057 */ 3058 static ibt_status_t 3059 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 3060 { 3061 ibt_mcg_attr_t mcg_attr; 3062 3063 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3064 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 3065 mcg_attr.mc_mgid = mgid; 3066 mcg_attr.mc_join_state = mce->mc_jstate; 3067 mcg_attr.mc_scope = state->id_scope; 3068 mcg_attr.mc_pkey = state->id_pkey; 3069 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 3070 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 3071 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 3072 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 3073 NULL, NULL)); 3074 } 3075 3076 /* 3077 * This code JOINs the port in the proper way (depending on the join 3078 * state) so that IBA fabric will forward mcg packets to/from the port. 3079 * It also attaches the QPN to the mcg so it can receive those mcg 3080 * packets. This code makes sure not to attach the mcg to the QP if 3081 * that has been previously done due to the mcg being joined with a 3082 * different join state, even though this is not required by SWG_0216, 3083 * refid 3610. 3084 */ 3085 static ibd_mce_t * 3086 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3087 { 3088 ibt_status_t ibt_status; 3089 ibd_mce_t *mce, *tmce, *omce = NULL; 3090 boolean_t do_attach = B_TRUE; 3091 3092 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 3093 jstate, mgid.gid_prefix, mgid.gid_guid); 3094 3095 /* 3096 * For enable_multicast Full member joins, we need to do some 3097 * extra work. 
If there is already an mce on the list that 3098 * indicates full membership, that means the membership has 3099 * not yet been dropped (since the disable_multicast was issued) 3100 * because there are pending Tx's to the mcg; in that case, just 3101 * mark the mce not to be reaped when the Tx completion queues 3102 * an async reap operation. 3103 * 3104 * If there is already an mce on the list indicating sendonly 3105 * membership, try to promote to full membership. Be careful 3106 * not to deallocate the old mce, since there might be an AH 3107 * pointing to it; instead, update the old mce with new data 3108 * that tracks the full membership. 3109 */ 3110 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 3111 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 3112 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 3113 ASSERT(omce->mc_fullreap); 3114 omce->mc_fullreap = B_FALSE; 3115 return (omce); 3116 } else { 3117 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3118 } 3119 } 3120 3121 /* 3122 * Allocate the ibd_mce_t to track this JOIN. 3123 */ 3124 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 3125 mce->mc_fullreap = B_FALSE; 3126 mce->mc_jstate = jstate; 3127 3128 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 3129 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 3130 ibt_status); 3131 kmem_free(mce, sizeof (ibd_mce_t)); 3132 return (NULL); 3133 } 3134 3135 /* 3136 * Is an IBA attach required? Not if the interface is already joined 3137 * to the mcg in a different appropriate join state. 3138 */ 3139 if (jstate == IB_MC_JSTATE_NON) { 3140 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3141 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3142 do_attach = B_FALSE; 3143 } else if (jstate == IB_MC_JSTATE_FULL) { 3144 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3145 do_attach = B_FALSE; 3146 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3147 do_attach = B_FALSE; 3148 } 3149 3150 if (do_attach) { 3151 /* 3152 * Do the IBA attach. 3153 */ 3154 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 3155 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 3156 &mce->mc_info)) != IBT_SUCCESS) { 3157 DPRINT(10, "ibd_join_group : failed qp attachment " 3158 "%d\n", ibt_status); 3159 /* 3160 * NOTE that we should probably preserve the join info 3161 * in the list and later try to leave again at detach 3162 * time. 3163 */ 3164 (void) ibt_leave_mcg(state->id_sgid, mgid, 3165 state->id_sgid, jstate); 3166 kmem_free(mce, sizeof (ibd_mce_t)); 3167 return (NULL); 3168 } 3169 } 3170 3171 /* 3172 * Insert the ibd_mce_t in the proper list. 3173 */ 3174 if (jstate == IB_MC_JSTATE_NON) { 3175 IBD_MCACHE_INSERT_NON(state, mce); 3176 } else { 3177 /* 3178 * Set up the mc_req fields used for reaping the 3179 * mcg in case of delayed tx completion (see 3180 * ibd_tx_cleanup()). Also done for sendonly join in 3181 * case we are promoted to fullmembership later and 3182 * keep using the same mce. 3183 */ 3184 mce->mc_req.rq_gid = mgid; 3185 mce->mc_req.rq_ptr = mce; 3186 /* 3187 * Check whether this is the case of trying to join 3188 * full member, and we were already joined send only. 3189 * We try to drop our SendOnly membership, but it is 3190 * possible that the mcg does not exist anymore (and 3191 * the subnet trap never reached us), so the leave 3192 * operation might fail. 
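 *
 * For reference, a condensed summary (derived from the code below)
 * of how the join state maps to list placement and IBA attach:
 *
 *	jstate				list		IBA attach?
 *	IB_MC_JSTATE_FULL		id_mc_full	yes, unless already
 *							non-joined
 *	IB_MC_JSTATE_NON		id_mc_non	yes, unless already
 *							full-joined
 *	IB_MC_JSTATE_SEND_ONLY_NON	id_mc_full	no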
3193 */ 3194 if (omce != NULL) { 3195 (void) ibt_leave_mcg(state->id_sgid, mgid, 3196 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3197 omce->mc_jstate = IB_MC_JSTATE_FULL; 3198 bcopy(&mce->mc_info, &omce->mc_info, 3199 sizeof (ibt_mcg_info_t)); 3200 kmem_free(mce, sizeof (ibd_mce_t)); 3201 return (omce); 3202 } 3203 mutex_enter(&state->id_mc_mutex); 3204 IBD_MCACHE_INSERT_FULL(state, mce); 3205 mutex_exit(&state->id_mc_mutex); 3206 } 3207 3208 return (mce); 3209 } 3210 3211 /* 3212 * Called during port up event handling to attempt to reacquire full 3213 * membership to an mcg. Stripped down version of ibd_join_group(). 3214 * Note that it is possible that the mcg might have gone away, and 3215 * gets recreated at this point. 3216 */ 3217 static void 3218 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3219 { 3220 ib_gid_t mgid; 3221 3222 /* 3223 * If the mc_fullreap flag is set, or this join fails, a subsequent 3224 * reap/leave is going to try to leave the group. We could prevent 3225 * that by adding a boolean flag into ibd_mce_t, if required. 3226 */ 3227 if (mce->mc_fullreap) 3228 return; 3229 3230 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3231 3232 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3233 mgid.gid_guid); 3234 3235 /* While reacquiring, leave and then join the MCG */ 3236 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 3237 mce->mc_jstate); 3238 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3239 ibd_print_warn(state, "Failure on port up to rejoin " 3240 "multicast gid %016llx:%016llx", 3241 (u_longlong_t)mgid.gid_prefix, 3242 (u_longlong_t)mgid.gid_guid); 3243 } 3244 3245 /* 3246 * This code handles delayed Tx completion cleanups for mcg's to which 3247 * disable_multicast has been issued, regular mcg related cleanups during 3248 * disable_multicast, disable_promiscuous and mcg traps, as well as 3249 * cleanups during driver detach time. Depending on the join state, 3250 * it deletes the mce from the appropriate list and issues the IBA 3251 * leave/detach; except in the disable_multicast case when the mce 3252 * is left on the active list for a subsequent Tx completion cleanup. 3253 */ 3254 static void 3255 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3256 uint8_t jstate) 3257 { 3258 ibd_mce_t *tmce; 3259 boolean_t do_detach = B_TRUE; 3260 3261 /* 3262 * Before detaching, we must check whether the other list 3263 * contains the mcg; if we detach blindly, the consumer 3264 * who set up the other list will also stop receiving 3265 * traffic. 3266 */ 3267 if (jstate == IB_MC_JSTATE_FULL) { 3268 /* 3269 * The following check is only relevant while coming 3270 * from the Tx completion path in the reap case. 
3271 */
3272 if (!mce->mc_fullreap)
3273 return;
3274 mutex_enter(&state->id_mc_mutex);
3275 IBD_MCACHE_PULLOUT_FULL(state, mce);
3276 mutex_exit(&state->id_mc_mutex);
3277 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3278 do_detach = B_FALSE;
3279 } else if (jstate == IB_MC_JSTATE_NON) {
3280 IBD_MCACHE_PULLOUT_NON(state, mce);
3281 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3282 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3283 do_detach = B_FALSE;
3284 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3285 mutex_enter(&state->id_mc_mutex);
3286 IBD_MCACHE_PULLOUT_FULL(state, mce);
3287 mutex_exit(&state->id_mc_mutex);
3288 do_detach = B_FALSE;
3289 }
3290
3291 /*
3292 * If we are reacting to a mcg trap and leaving our sendonly or
3293 * non membership, the mcg is possibly already gone, so attempting
3294 * to leave might fail. On the other hand, we must try to leave
3295 * anyway, since this might be a trap from long ago, and we could
3296 * have potentially sendonly joined to a recent incarnation of
3297 * the mcg and are about to lose track of this information.
3298 */
3299 if (do_detach) {
3300 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3301 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3302 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3303 }
3304
3305 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3306 kmem_free(mce, sizeof (ibd_mce_t));
3307 }
3308
3309 /*
3310 * Async code executed due to multicast and promiscuous disable requests
3311 * and mcg trap handling; also executed during driver detach. Mostly, a
3312 * leave and detach is done; except for the fullmember case when Tx
3313 * requests are pending, whence arrangements are made for subsequent
3314 * cleanup on Tx completion.
3315 */
3316 static void
3317 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3318 {
3319 ipoib_mac_t mcmac;
3320 boolean_t recycled;
3321 ibd_mce_t *mce;
3322
3323 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3324 jstate, mgid.gid_prefix, mgid.gid_guid);
3325
3326 if (jstate == IB_MC_JSTATE_NON) {
3327 recycled = B_TRUE;
3328 mce = IBD_MCACHE_FIND_NON(state, mgid);
3329 /*
3330 * In case we are handling a mcg trap, we might not find
3331 * the mcg in the non list.
3332 */
3333 if (mce == NULL) {
3334 return;
3335 }
3336 } else {
3337 mce = IBD_MCACHE_FIND_FULL(state, mgid);
3338
3339 /*
3340 * In case we are handling a mcg trap, make sure the trap
3341 * is not arriving late; if we have an mce that indicates
3342 * that we are already a fullmember, that would be a clear
3343 * indication that the trap arrived late (ie, is for a
3344 * previous incarnation of the mcg).
3345 */
3346 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3347 if ((mce == NULL) || (mce->mc_jstate ==
3348 IB_MC_JSTATE_FULL)) {
3349 return;
3350 }
3351 } else {
3352 ASSERT(jstate == IB_MC_JSTATE_FULL);
3353
3354 /*
3355 * If join group failed, mce will be NULL here.
3356 * This is because in a GLDv3 driver, set multicast
3357 * will always return success.
3358 */
3359 if (mce == NULL) {
3360 return;
3361 }
3362
3363 mce->mc_fullreap = B_TRUE;
3364 }
3365
3366 /*
3367 * If no pending Tx's remain that reference the AH
3368 * for the mcg, recycle it from active to free list.
3369 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3370 * so the last completing Tx will cause an async reap 3371 * operation to be invoked, at which time we will drop our 3372 * membership to the mcg so that the pending Tx's complete 3373 * successfully. Refer to comments on "AH and MCE active 3374 * list manipulation" at top of this file. The lock protects 3375 * against Tx fast path and Tx cleanup code. 3376 */ 3377 mutex_enter(&state->id_ac_mutex); 3378 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3379 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3380 IB_MC_JSTATE_SEND_ONLY_NON)); 3381 mutex_exit(&state->id_ac_mutex); 3382 } 3383 3384 if (recycled) { 3385 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3386 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3387 ibd_async_reap_group(state, mce, mgid, jstate); 3388 } 3389 } 3390 3391 /* 3392 * Find the broadcast address as defined by IPoIB; implicitly 3393 * determines the IBA scope, mtu, tclass etc of the link the 3394 * interface is going to be a member of. 3395 */ 3396 static ibt_status_t 3397 ibd_find_bgroup(ibd_state_t *state) 3398 { 3399 ibt_mcg_attr_t mcg_attr; 3400 uint_t numg; 3401 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3402 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3403 IB_MC_SCOPE_GLOBAL }; 3404 int i, mcgmtu; 3405 boolean_t found = B_FALSE; 3406 int ret; 3407 ibt_mcg_info_t mcg_info; 3408 3409 state->id_bgroup_created = B_FALSE; 3410 state->id_bgroup_present = B_FALSE; 3411 3412 query_bcast_grp: 3413 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3414 mcg_attr.mc_pkey = state->id_pkey; 3415 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3416 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3417 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3418 3419 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3420 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3421 3422 /* 3423 * Look for the IPoIB broadcast group. 3424 */ 3425 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3426 state->id_mgid.gid_prefix = 3427 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3428 ((uint64_t)state->id_scope << 48) | 3429 ((uint32_t)(state->id_pkey << 16))); 3430 mcg_attr.mc_mgid = state->id_mgid; 3431 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3432 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3433 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3434 found = B_TRUE; 3435 break; 3436 } 3437 } 3438 3439 if (!found) { 3440 if (state->id_create_broadcast_group) { 3441 /* 3442 * If we created the broadcast group, but failed to 3443 * find it, we can't do anything except leave the 3444 * one we created and return failure. 3445 */ 3446 if (state->id_bgroup_created) { 3447 ibd_print_warn(state, "IPoIB broadcast group " 3448 "absent. 
Unable to query after create."); 3449 goto find_bgroup_fail; 3450 } 3451 3452 /* 3453 * Create the ipoib broadcast group if it didn't exist 3454 */ 3455 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3456 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3457 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3458 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3459 mcg_attr.mc_pkey = state->id_pkey; 3460 mcg_attr.mc_flow = 0; 3461 mcg_attr.mc_sl = 0; 3462 mcg_attr.mc_tclass = 0; 3463 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3464 state->id_mgid.gid_prefix = 3465 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3466 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3467 ((uint32_t)(state->id_pkey << 16))); 3468 mcg_attr.mc_mgid = state->id_mgid; 3469 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3470 3471 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3472 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3473 ibd_print_warn(state, "IPoIB broadcast group " 3474 "absent, create failed: ret = %d\n", ret); 3475 state->id_bgroup_created = B_FALSE; 3476 return (IBT_FAILURE); 3477 } 3478 state->id_bgroup_created = B_TRUE; 3479 goto query_bcast_grp; 3480 } else { 3481 ibd_print_warn(state, "IPoIB broadcast group absent"); 3482 return (IBT_FAILURE); 3483 } 3484 } 3485 3486 /* 3487 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3488 */ 3489 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3490 if (state->id_mtu < mcgmtu) { 3491 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3492 "greater than port's maximum MTU %d", mcgmtu, 3493 state->id_mtu); 3494 ibt_free_mcg_info(state->id_mcinfo, 1); 3495 goto find_bgroup_fail; 3496 } 3497 state->id_mtu = mcgmtu; 3498 state->id_bgroup_present = B_TRUE; 3499 3500 return (IBT_SUCCESS); 3501 3502 find_bgroup_fail: 3503 if (state->id_bgroup_created) { 3504 (void) ibt_leave_mcg(state->id_sgid, 3505 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3506 IB_MC_JSTATE_FULL); 3507 } 3508 3509 return (IBT_FAILURE); 3510 } 3511 3512 static int 3513 ibd_alloc_tx_copybufs(ibd_state_t *state) 3514 { 3515 ibt_mr_attr_t mem_attr; 3516 3517 /* 3518 * Allocate one big chunk for all regular tx copy bufs 3519 */ 3520 state->id_tx_buf_sz = state->id_mtu; 3521 if (state->id_lso_policy && state->id_lso_capable && 3522 (state->id_ud_tx_copy_thresh > state->id_mtu)) { 3523 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; 3524 } 3525 3526 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * 3527 state->id_tx_buf_sz, KM_SLEEP); 3528 3529 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * 3530 sizeof (ibd_swqe_t), KM_SLEEP); 3531 3532 /* 3533 * Do one memory registration on the entire txbuf area 3534 */ 3535 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3536 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz; 3537 mem_attr.mr_as = NULL; 3538 mem_attr.mr_flags = IBT_MR_SLEEP; 3539 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3540 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3541 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3542 kmem_free(state->id_tx_wqes, 3543 state->id_ud_num_swqe * sizeof (ibd_swqe_t)); 3544 kmem_free(state->id_tx_bufs, 3545 state->id_ud_num_swqe * state->id_tx_buf_sz); 3546 state->id_tx_bufs = NULL; 3547 return (DDI_FAILURE); 3548 } 3549 3550 return (DDI_SUCCESS); 3551 } 3552 3553 static int 3554 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3555 { 3556 ibt_mr_attr_t mem_attr; 3557 ibd_lsobuf_t *buflist; 3558 ibd_lsobuf_t *lbufp; 3559 ibd_lsobuf_t *tail; 3560 ibd_lsobkt_t *bktp; 3561 uint8_t 
*membase; 3562 uint8_t *memp; 3563 uint_t memsz; 3564 int i; 3565 3566 /* 3567 * Allocate the lso bucket 3568 */ 3569 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3570 3571 /* 3572 * Allocate the entire lso memory and register it 3573 */ 3574 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; 3575 membase = kmem_zalloc(memsz, KM_SLEEP); 3576 3577 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3578 mem_attr.mr_len = memsz; 3579 mem_attr.mr_as = NULL; 3580 mem_attr.mr_flags = IBT_MR_SLEEP; 3581 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3582 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3583 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3584 kmem_free(membase, memsz); 3585 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3586 return (DDI_FAILURE); 3587 } 3588 3589 mutex_enter(&state->id_lso_lock); 3590 3591 /* 3592 * Now allocate the buflist. Note that the elements in the buflist and 3593 * the buffers in the lso memory have a permanent 1-1 relation, so we 3594 * can always derive the address of a buflist entry from the address of 3595 * an lso buffer. 3596 */ 3597 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), 3598 KM_SLEEP); 3599 3600 /* 3601 * Set up the lso buf chain 3602 */ 3603 memp = membase; 3604 lbufp = buflist; 3605 for (i = 0; i < state->id_num_lso_bufs; i++) { 3606 lbufp->lb_isfree = 1; 3607 lbufp->lb_buf = memp; 3608 lbufp->lb_next = lbufp + 1; 3609 3610 tail = lbufp; 3611 3612 memp += IBD_LSO_BUFSZ; 3613 lbufp++; 3614 } 3615 tail->lb_next = NULL; 3616 3617 /* 3618 * Set up the LSO buffer information in ibd state 3619 */ 3620 bktp->bkt_bufl = buflist; 3621 bktp->bkt_free_head = buflist; 3622 bktp->bkt_mem = membase; 3623 bktp->bkt_nelem = state->id_num_lso_bufs; 3624 bktp->bkt_nfree = bktp->bkt_nelem; 3625 3626 state->id_lso = bktp; 3627 mutex_exit(&state->id_lso_lock); 3628 3629 return (DDI_SUCCESS); 3630 } 3631 3632 /* 3633 * Statically allocate Tx buffer list(s). 
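/*
 * Editor's aside: a hedged, standalone user-space model of the 1-1
 * layout ibd_alloc_tx_lsobufs() sets up above -- not driver code, and
 * all demo_* names are invented. Because buflist entries and LSO
 * buffers are laid out in the same order, a buffer address maps back
 * to its tracking element by plain arithmetic.
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define	DEMO_LSO_BUFSZ	8192	/* stands in for IBD_LSO_BUFSZ */
#define	DEMO_NBUFS	4	/* stands in for id_num_lso_bufs */

typedef struct demo_lsobuf {
	int	lb_isfree;
	uint8_t	*lb_buf;
} demo_lsobuf_t;

int
main(void)
{
	uint8_t *mem = calloc(DEMO_NBUFS, DEMO_LSO_BUFSZ);
	demo_lsobuf_t *bufl = calloc(DEMO_NBUFS, sizeof (demo_lsobuf_t));
	int i;

	assert(mem != NULL && bufl != NULL);

	/* the same fixed 1-1 layout the driver establishes */
	for (i = 0; i < DEMO_NBUFS; i++) {
		bufl[i].lb_isfree = 1;
		bufl[i].lb_buf = mem + (size_t)i * DEMO_LSO_BUFSZ;
	}

	/* recover the element from a buffer VA, as the release path does */
	for (i = 0; i < DEMO_NBUFS; i++) {
		size_t ndx = (size_t)(bufl[i].lb_buf - mem) / DEMO_LSO_BUFSZ;
		assert(bufl + ndx == &bufl[i]);
	}

	free(bufl);
	free(mem);
	return (0);
}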
3634 */ 3635 static int 3636 ibd_init_txlist(ibd_state_t *state) 3637 { 3638 ibd_swqe_t *swqe; 3639 ibt_lkey_t lkey; 3640 int i; 3641 uint_t len; 3642 uint8_t *bufaddr; 3643 3644 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3645 return (DDI_FAILURE); 3646 3647 if (state->id_lso_policy && state->id_lso_capable) { 3648 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3649 state->id_lso_capable = B_FALSE; 3650 } 3651 3652 mutex_enter(&state->id_tx_list.dl_mutex); 3653 state->id_tx_list.dl_head = NULL; 3654 state->id_tx_list.dl_pending_sends = B_FALSE; 3655 state->id_tx_list.dl_cnt = 0; 3656 mutex_exit(&state->id_tx_list.dl_mutex); 3657 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3658 state->id_tx_rel_list.dl_head = NULL; 3659 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3660 state->id_tx_rel_list.dl_cnt = 0; 3661 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3662 3663 /* 3664 * Allocate and setup the swqe list 3665 */ 3666 lkey = state->id_tx_mr_desc.md_lkey; 3667 bufaddr = state->id_tx_bufs; 3668 len = state->id_tx_buf_sz; 3669 swqe = state->id_tx_wqes; 3670 mutex_enter(&state->id_tx_list.dl_mutex); 3671 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { 3672 swqe->swqe_next = NULL; 3673 swqe->swqe_im_mblk = NULL; 3674 3675 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3676 bufaddr; 3677 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3678 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3679 3680 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3681 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3682 swqe->w_swr.wr_trans = IBT_UD_SRV; 3683 3684 /* These are set in send */ 3685 swqe->w_swr.wr_nds = 0; 3686 swqe->w_swr.wr_sgl = NULL; 3687 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3688 3689 /* add to list */ 3690 state->id_tx_list.dl_cnt++; 3691 swqe->swqe_next = state->id_tx_list.dl_head; 3692 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3693 } 3694 mutex_exit(&state->id_tx_list.dl_mutex); 3695 3696 return (DDI_SUCCESS); 3697 } 3698 3699 static int 3700 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3701 uint32_t *nds_p) 3702 { 3703 ibd_lsobkt_t *bktp; 3704 ibd_lsobuf_t *lbufp; 3705 ibd_lsobuf_t *nextp; 3706 ibt_lkey_t lso_lkey; 3707 uint_t frag_sz; 3708 uint_t num_needed; 3709 int i; 3710 3711 ASSERT(sgl_p != NULL); 3712 ASSERT(nds_p != NULL); 3713 ASSERT(req_sz != 0); 3714 3715 /* 3716 * Determine how many bufs we'd need for the size requested 3717 */ 3718 num_needed = req_sz / IBD_LSO_BUFSZ; 3719 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3720 num_needed++; 3721 3722 mutex_enter(&state->id_lso_lock); 3723 3724 /* 3725 * If we don't have enough lso bufs, return failure 3726 */ 3727 ASSERT(state->id_lso != NULL); 3728 bktp = state->id_lso; 3729 if (bktp->bkt_nfree < num_needed) { 3730 mutex_exit(&state->id_lso_lock); 3731 return (-1); 3732 } 3733 3734 /* 3735 * Pick the first 'num_needed' bufs from the free list 3736 */ 3737 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3738 lbufp = bktp->bkt_free_head; 3739 for (i = 0; i < num_needed; i++) { 3740 ASSERT(lbufp->lb_isfree != 0); 3741 ASSERT(lbufp->lb_buf != NULL); 3742 3743 nextp = lbufp->lb_next; 3744 3745 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3746 sgl_p[i].ds_key = lso_lkey; 3747 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3748 3749 lbufp->lb_isfree = 0; 3750 lbufp->lb_next = NULL; 3751 3752 lbufp = nextp; 3753 } 3754 bktp->bkt_free_head = lbufp; 3755 3756 /* 3757 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3758 * to adjust the last sgl entry's length. 
Since we know we need at least 3759 * one, the i-1 use below is ok. 3760 */ 3761 if (frag_sz) { 3762 sgl_p[i-1].ds_len = frag_sz; 3763 } 3764 3765 /* 3766 * Update nfree count and return 3767 */ 3768 bktp->bkt_nfree -= num_needed; 3769 3770 mutex_exit(&state->id_lso_lock); 3771 3772 *nds_p = num_needed; 3773 3774 return (0); 3775 } 3776 3777 static void 3778 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3779 { 3780 ibd_lsobkt_t *bktp; 3781 ibd_lsobuf_t *lbufp; 3782 uint8_t *lso_mem_end; 3783 uint_t ndx; 3784 int i; 3785 3786 mutex_enter(&state->id_lso_lock); 3787 3788 bktp = state->id_lso; 3789 ASSERT(bktp != NULL); 3790 3791 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3792 for (i = 0; i < nds; i++) { 3793 uint8_t *va; 3794 3795 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3796 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3797 3798 /* 3799 * Figure out the buflist element this sgl buffer corresponds 3800 * to and put it back at the head 3801 */ 3802 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3803 lbufp = bktp->bkt_bufl + ndx; 3804 3805 ASSERT(lbufp->lb_isfree == 0); 3806 ASSERT(lbufp->lb_buf == va); 3807 3808 lbufp->lb_isfree = 1; 3809 lbufp->lb_next = bktp->bkt_free_head; 3810 bktp->bkt_free_head = lbufp; 3811 } 3812 bktp->bkt_nfree += nds; 3813 3814 mutex_exit(&state->id_lso_lock); 3815 } 3816 3817 static void 3818 ibd_free_tx_copybufs(ibd_state_t *state) 3819 { 3820 /* 3821 * Unregister txbuf mr 3822 */ 3823 if (ibt_deregister_mr(state->id_hca_hdl, 3824 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3825 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3826 } 3827 state->id_tx_mr_hdl = NULL; 3828 3829 /* 3830 * Free txbuf memory 3831 */ 3832 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * 3833 sizeof (ibd_swqe_t)); 3834 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * 3835 state->id_tx_buf_sz); 3836 state->id_tx_wqes = NULL; 3837 state->id_tx_bufs = NULL; 3838 } 3839 3840 static void 3841 ibd_free_tx_lsobufs(ibd_state_t *state) 3842 { 3843 ibd_lsobkt_t *bktp; 3844 3845 mutex_enter(&state->id_lso_lock); 3846 3847 if ((bktp = state->id_lso) == NULL) { 3848 mutex_exit(&state->id_lso_lock); 3849 return; 3850 } 3851 3852 /* 3853 * First, free the buflist 3854 */ 3855 ASSERT(bktp->bkt_bufl != NULL); 3856 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3857 3858 /* 3859 * Unregister the LSO memory and free it 3860 */ 3861 ASSERT(bktp->bkt_mr_hdl != NULL); 3862 if (ibt_deregister_mr(state->id_hca_hdl, 3863 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3864 DPRINT(10, 3865 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3866 } 3867 ASSERT(bktp->bkt_mem); 3868 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3869 3870 /* 3871 * Finally free the bucket 3872 */ 3873 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3874 state->id_lso = NULL; 3875 3876 mutex_exit(&state->id_lso_lock); 3877 } 3878 3879 /* 3880 * Free the statically allocated Tx buffer list.
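/*
 * Editor's aside: a hedged worked example (standalone user-space
 * sketch, not driver code; demo_* names invented) of the buffer-count
 * and trailing-fragment math used by ibd_acquire_lsobufs() above --
 * ceil(req_sz / IBD_LSO_BUFSZ) buffers, with the last SGL entry
 * trimmed to the remainder.
 */
#include <assert.h>
#include <stdint.h>

#define	DEMO_LSO_BUFSZ	8192	/* stands in for IBD_LSO_BUFSZ */

static void
demo_sgl_sizing(uint32_t req_sz, uint32_t *num_needed, uint32_t *last_len)
{
	uint32_t frag_sz = req_sz % DEMO_LSO_BUFSZ;

	*num_needed = req_sz / DEMO_LSO_BUFSZ + (frag_sz != 0);
	*last_len = (frag_sz != 0) ? frag_sz : DEMO_LSO_BUFSZ;
}

int
main(void)
{
	uint32_t n, last;

	demo_sgl_sizing(20000, &n, &last);	/* 20000 = 2*8192 + 3616 */
	assert(n == 3 && last == 3616);
	demo_sgl_sizing(16384, &n, &last);	/* exact multiple of 8192 */
	assert(n == 2 && last == DEMO_LSO_BUFSZ);
	return (0);
}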
3881 */ 3882 static void 3883 ibd_fini_txlist(ibd_state_t *state) 3884 { 3885 /* 3886 * Free the allocated swqes 3887 */ 3888 mutex_enter(&state->id_tx_list.dl_mutex); 3889 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3890 state->id_tx_list.dl_head = NULL; 3891 state->id_tx_list.dl_pending_sends = B_FALSE; 3892 state->id_tx_list.dl_cnt = 0; 3893 state->id_tx_rel_list.dl_head = NULL; 3894 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3895 state->id_tx_rel_list.dl_cnt = 0; 3896 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3897 mutex_exit(&state->id_tx_list.dl_mutex); 3898 3899 ibd_free_tx_lsobufs(state); 3900 ibd_free_tx_copybufs(state); 3901 } 3902 3903 /* 3904 * post a list of rwqes, NULL terminated. 3905 */ 3906 static void 3907 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3908 { 3909 uint_t i; 3910 uint_t num_posted; 3911 ibt_status_t ibt_status; 3912 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3913 3914 while (rwqe) { 3915 /* Post up to IBD_RX_POST_CNT receive work requests */ 3916 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3917 wrs[i] = rwqe->w_rwr; 3918 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3919 if (rwqe == NULL) { 3920 i++; 3921 break; 3922 } 3923 } 3924 3925 /* 3926 * If posting fails for some reason, we'll never receive 3927 * completion intimation, so we'll need to cleanup. But 3928 * we need to make sure we don't clean up nodes whose 3929 * wrs have been successfully posted. We assume that the 3930 * hca driver returns on the first failure to post and 3931 * therefore the first 'num_posted' entries don't need 3932 * cleanup here. 3933 */ 3934 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3935 3936 num_posted = 0; 3937 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3938 &num_posted); 3939 if (ibt_status != IBT_SUCCESS) { 3940 /* This cannot happen unless the device has an error. */ 3941 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3942 "posting multiple wrs failed: " 3943 "requested=%d, done=%d, ret=%d", 3944 IBD_RX_POST_CNT, num_posted, ibt_status); 3945 atomic_add_32(&state->id_rx_list.dl_cnt, 3946 num_posted - i); 3947 } 3948 } 3949 } 3950 3951 /* 3952 * Grab a list of rwqes from the array of lists, and post the list. 3953 */ 3954 static void 3955 ibd_post_recv_intr(ibd_state_t *state) 3956 { 3957 ibd_rx_queue_t *rxp; 3958 ibd_rwqe_t *list; 3959 3960 /* rotate through the rx_queue array, expecting an adequate number */ 3961 state->id_rx_post_queue_index = 3962 (state->id_rx_post_queue_index + 1) & 3963 (state->id_rx_nqueues - 1); 3964 3965 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3966 mutex_enter(&rxp->rx_post_lock); 3967 list = WQE_TO_RWQE(rxp->rx_head); 3968 rxp->rx_head = NULL; 3969 rxp->rx_cnt = 0; 3970 mutex_exit(&rxp->rx_post_lock); 3971 ibd_post_recv_list(state, list); 3972 } 3973 3974 /* macro explained below */ 3975 #define RX_QUEUE_HASH(rwqe) \ 3976 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3977 3978 /* 3979 * Add a rwqe to one of the Rx lists. If the list is large enough 3980 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3981 * 3982 * Note: one of 2^N lists is chosen via a hash. This is done 3983 * because using one list is contentious. If the first list is busy 3984 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3985 * 3986 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3987 * even distribution of mapping rwqes to the 2^N queues.
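/*
 * Editor's aside: a hedged, standalone model of the queue-selection
 * hash above (not driver code; DEMO_NQUEUES is invented and must be a
 * power of two, like id_rx_nqueues). Dropping the low 8 bits discards
 * the aligned part of the rwqe address before masking. The driver's
 * retry, RX_QUEUE_HASH(rwqe + 16), advances the pointer by 16
 * rwqe-sized strides and is intended to select a different queue.
 */
#include <stdint.h>
#include <stdio.h>

#define	DEMO_NQUEUES	4

static unsigned
demo_rx_queue_hash(const void *rwqe)
{
	return (((uintptr_t)rwqe >> 8) & (DEMO_NQUEUES - 1));
}

int
main(void)
{
	static char pool[DEMO_NQUEUES * 1024];
	int i;

	/* rwqes carved from one chunk spread across the queues */
	for (i = 0; i < DEMO_NQUEUES; i++)
		printf("rwqe %d -> queue %u\n", i,
		    demo_rx_queue_hash(pool + i * 256));
	return (0);
}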
3988 */ 3989 static void 3990 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3991 { 3992 ibd_rx_queue_t *rxp; 3993 3994 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3995 3996 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3997 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3998 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3999 mutex_enter(&rxp->rx_post_lock); 4000 } 4001 rwqe->rwqe_next = rxp->rx_head; 4002 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 4003 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 4004 4005 /* only call ibt_post_recv() every Nth time through here */ 4006 if ((active & (state->id_rx_nqueues - 1)) == 0) { 4007 rxp->rx_head = NULL; 4008 rxp->rx_cnt = 0; 4009 mutex_exit(&rxp->rx_post_lock); 4010 ibd_post_recv_list(state, rwqe); 4011 return; 4012 } 4013 } 4014 rxp->rx_head = RWQE_TO_WQE(rwqe); 4015 mutex_exit(&rxp->rx_post_lock); 4016 } 4017 4018 static int 4019 ibd_alloc_rx_copybufs(ibd_state_t *state) 4020 { 4021 ibt_mr_attr_t mem_attr; 4022 int i; 4023 4024 /* 4025 * Allocate one big chunk for all regular rx copy bufs 4026 */ 4027 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 4028 4029 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * 4030 state->id_rx_buf_sz, KM_SLEEP); 4031 4032 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * 4033 sizeof (ibd_rwqe_t), KM_SLEEP); 4034 4035 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 4036 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 4037 sizeof (ibd_rx_queue_t), KM_SLEEP); 4038 for (i = 0; i < state->id_rx_nqueues; i++) { 4039 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4040 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 4041 } 4042 4043 /* 4044 * Do one memory registration on the entire rxbuf area 4045 */ 4046 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 4047 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; 4048 mem_attr.mr_as = NULL; 4049 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4050 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 4051 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 4052 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 4053 kmem_free(state->id_rx_wqes, 4054 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); 4055 kmem_free(state->id_rx_bufs, 4056 state->id_ud_num_rwqe * state->id_rx_buf_sz); 4057 state->id_rx_bufs = NULL; 4058 state->id_rx_wqes = NULL; 4059 return (DDI_FAILURE); 4060 } 4061 4062 return (DDI_SUCCESS); 4063 } 4064 4065 /* 4066 * Allocate the statically allocated Rx buffer list. 4067 */ 4068 static int 4069 ibd_init_rxlist(ibd_state_t *state) 4070 { 4071 ibd_rwqe_t *rwqe, *next; 4072 ibd_wqe_t *list; 4073 ibt_lkey_t lkey; 4074 int i; 4075 uint_t len; 4076 uint8_t *bufaddr; 4077 4078 mutex_enter(&state->id_rx_free_list.dl_mutex); 4079 if (state->id_rx_free_list.dl_head != NULL) { 4080 /* rx rsrcs were never freed. 
Just repost them */ 4081 len = state->id_rx_buf_sz; 4082 list = state->id_rx_free_list.dl_head; 4083 state->id_rx_free_list.dl_head = NULL; 4084 state->id_rx_free_list.dl_cnt = 0; 4085 mutex_exit(&state->id_rx_free_list.dl_mutex); 4086 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4087 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4088 if ((rwqe->rwqe_im_mblk = desballoc( 4089 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 4090 &rwqe->w_freemsg_cb)) == NULL) { 4091 /* allow freemsg_cb to free the rwqes */ 4092 if (atomic_dec_32_nv(&state->id_running) != 0) { 4093 cmn_err(CE_WARN, "ibd_init_rxlist: " 4094 "id_running was not 1\n"); 4095 } 4096 DPRINT(10, "ibd_init_rxlist : " 4097 "failed in desballoc()"); 4098 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4099 rwqe = next) { 4100 next = WQE_TO_RWQE(rwqe->rwqe_next); 4101 if (rwqe->rwqe_im_mblk) { 4102 atomic_inc_32(&state-> 4103 id_rx_list. 4104 dl_bufs_outstanding); 4105 freemsg(rwqe->rwqe_im_mblk); 4106 } else 4107 ibd_free_rwqe(state, rwqe); 4108 } 4109 atomic_inc_32(&state->id_running); 4110 return (DDI_FAILURE); 4111 } 4112 } 4113 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4114 return (DDI_SUCCESS); 4115 } 4116 mutex_exit(&state->id_rx_free_list.dl_mutex); 4117 4118 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 4119 return (DDI_FAILURE); 4120 4121 /* 4122 * Allocate and setup the rwqe list 4123 */ 4124 len = state->id_rx_buf_sz; 4125 lkey = state->id_rx_mr_desc.md_lkey; 4126 rwqe = state->id_rx_wqes; 4127 bufaddr = state->id_rx_bufs; 4128 list = NULL; 4129 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { 4130 rwqe->w_state = state; 4131 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 4132 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 4133 4134 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 4135 4136 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 4137 &rwqe->w_freemsg_cb)) == NULL) { 4138 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 4139 /* allow freemsg_cb to free the rwqes */ 4140 if (atomic_dec_32_nv(&state->id_running) != 0) { 4141 cmn_err(CE_WARN, "ibd_init_rxlist: " 4142 "id_running was not 1\n"); 4143 } 4144 DPRINT(10, "ibd_init_rxlist : " 4145 "failed in desballoc()"); 4146 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4147 rwqe = next) { 4148 next = WQE_TO_RWQE(rwqe->rwqe_next); 4149 freemsg(rwqe->rwqe_im_mblk); 4150 } 4151 atomic_inc_32(&state->id_running); 4152 4153 /* remove reference to free'd rwqes */ 4154 mutex_enter(&state->id_rx_free_list.dl_mutex); 4155 state->id_rx_free_list.dl_head = NULL; 4156 state->id_rx_free_list.dl_cnt = 0; 4157 mutex_exit(&state->id_rx_free_list.dl_mutex); 4158 4159 ibd_fini_rxlist(state); 4160 return (DDI_FAILURE); 4161 } 4162 4163 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 4164 rwqe->rwqe_copybuf.ic_sgl.ds_va = 4165 (ib_vaddr_t)(uintptr_t)bufaddr; 4166 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 4167 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 4168 rwqe->w_rwr.wr_nds = 1; 4169 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 4170 4171 rwqe->rwqe_next = list; 4172 list = RWQE_TO_WQE(rwqe); 4173 } 4174 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4175 4176 return (DDI_SUCCESS); 4177 } 4178 4179 static void 4180 ibd_free_rx_copybufs(ibd_state_t *state) 4181 { 4182 int i; 4183 4184 /* 4185 * Unregister rxbuf mr 4186 */ 4187 if (ibt_deregister_mr(state->id_hca_hdl, 4188 state->id_rx_mr_hdl) != IBT_SUCCESS) { 4189 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 4190 } 4191 state->id_rx_mr_hdl = NULL; 4192 4193 /* 4194 * Free rxbuf memory 4195 */ 4196 for (i = 0; i < 
state->id_rx_nqueues; i++) { 4197 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4198 mutex_destroy(&rxp->rx_post_lock); 4199 } 4200 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 4201 sizeof (ibd_rx_queue_t)); 4202 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * 4203 sizeof (ibd_rwqe_t)); 4204 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * 4205 state->id_rx_buf_sz); 4206 state->id_rx_queues = NULL; 4207 state->id_rx_wqes = NULL; 4208 state->id_rx_bufs = NULL; 4209 } 4210 4211 static void 4212 ibd_free_rx_rsrcs(ibd_state_t *state) 4213 { 4214 mutex_enter(&state->id_rx_free_list.dl_mutex); 4215 if (state->id_rx_free_list.dl_head == NULL) { 4216 /* already freed */ 4217 mutex_exit(&state->id_rx_free_list.dl_mutex); 4218 return; 4219 } 4220 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); 4221 ibd_free_rx_copybufs(state); 4222 state->id_rx_free_list.dl_cnt = 0; 4223 state->id_rx_free_list.dl_head = NULL; 4224 mutex_exit(&state->id_rx_free_list.dl_mutex); 4225 } 4226 4227 /* 4228 * Free the statically allocated Rx buffer list. 4229 */ 4230 static void 4231 ibd_fini_rxlist(ibd_state_t *state) 4232 { 4233 ibd_rwqe_t *rwqe; 4234 int i; 4235 4236 /* run through the rx_queue's, calling freemsg() */ 4237 for (i = 0; i < state->id_rx_nqueues; i++) { 4238 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4239 mutex_enter(&rxp->rx_post_lock); 4240 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4241 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4242 freemsg(rwqe->rwqe_im_mblk); 4243 rxp->rx_cnt--; 4244 } 4245 rxp->rx_head = NULL; 4246 mutex_exit(&rxp->rx_post_lock); 4247 } 4248 4249 /* cannot free rx resources unless gld returned everything */ 4250 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4251 ibd_free_rx_rsrcs(state); 4252 } 4253 4254 /* 4255 * Free an allocated recv wqe. 4256 */ 4257 /* ARGSUSED */ 4258 static void 4259 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4260 { 4261 /* 4262 * desballoc() failed (no memory). 4263 * 4264 * This rwqe is placed on a free list so that it 4265 * can be reinstated when memory is available. 4266 * 4267 * NOTE: no code currently exists to reinstate 4268 * these "lost" rwqes. 4269 */ 4270 mutex_enter(&state->id_rx_free_list.dl_mutex); 4271 state->id_rx_free_list.dl_cnt++; 4272 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4273 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4274 mutex_exit(&state->id_rx_free_list.dl_mutex); 4275 } 4276 4277 /* 4278 * IBA Rx completion queue handler. Guaranteed to be single 4279 * threaded and nonreentrant for this CQ. 4280 */ 4281 /* ARGSUSED */ 4282 static void 4283 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4284 { 4285 ibd_state_t *state = (ibd_state_t *)arg; 4286 4287 atomic_inc_64(&state->id_num_intrs); 4288 4289 if (ibd_rx_softintr == 1) { 4290 mutex_enter(&state->id_rcq_poll_lock); 4291 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4292 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4293 mutex_exit(&state->id_rcq_poll_lock); 4294 return; 4295 } else { 4296 mutex_exit(&state->id_rcq_poll_lock); 4297 ddi_trigger_softintr(state->id_rx); 4298 } 4299 } else 4300 (void) ibd_intr((caddr_t)state); 4301 } 4302 4303 /* 4304 * CQ handler for Tx completions, when the Tx CQ is in 4305 * interrupt driven mode. 
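/*
 * Editor's aside: a hedged, standalone model (not driver code; demo_*
 * names invented) of the "busy + redo" protocol the two CQ handlers
 * above use, so a completion interrupt arriving mid-poll is remembered
 * rather than lost. In the driver the flags live in id_rcq_poll_busy
 * and id_scq_poll_busy under their poll locks, and the redo bit is
 * presumably consumed by the softintr poll path.
 */
#include <assert.h>

#define	DEMO_CQ_POLLING		0x1	/* models IBD_CQ_POLLING */
#define	DEMO_REDO_CQ_POLLING	0x2	/* models IBD_REDO_CQ_POLLING */

static unsigned demo_busy;
static int demo_softints;

static void
demo_cq_handler(void)
{
	if (demo_busy & DEMO_CQ_POLLING)
		demo_busy |= DEMO_REDO_CQ_POLLING;	/* poller re-polls */
	else
		demo_softints++;	/* ddi_trigger_softintr() here */
}

int
main(void)
{
	demo_cq_handler();			/* idle: kick a softint */
	assert(demo_softints == 1);
	demo_busy |= DEMO_CQ_POLLING;		/* simulate active poller */
	demo_cq_handler();			/* busy: just mark redo */
	assert(demo_softints == 1 && (demo_busy & DEMO_REDO_CQ_POLLING));
	return (0);
}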
4306 */ 4307 /* ARGSUSED */ 4308 static void 4309 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4310 { 4311 ibd_state_t *state = (ibd_state_t *)arg; 4312 4313 atomic_inc_64(&state->id_num_intrs); 4314 4315 if (ibd_tx_softintr == 1) { 4316 mutex_enter(&state->id_scq_poll_lock); 4317 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4318 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4319 mutex_exit(&state->id_scq_poll_lock); 4320 return; 4321 } else { 4322 mutex_exit(&state->id_scq_poll_lock); 4323 ddi_trigger_softintr(state->id_tx); 4324 } 4325 } else 4326 (void) ibd_tx_recycle((caddr_t)state); 4327 } 4328 4329 /* 4330 * Multicast group create/delete trap handler. These will be delivered 4331 * on a kernel thread (handling can thus block) and can be invoked 4332 * concurrently. The handler can be invoked anytime after it is 4333 * registered and before ibt_detach(). 4334 */ 4335 /* ARGSUSED */ 4336 static void 4337 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4338 ibt_subnet_event_t *event) 4339 { 4340 ibd_state_t *state = (ibd_state_t *)arg; 4341 ibd_req_t *req; 4342 4343 /* 4344 * The trap handler will get invoked once for every event for 4345 * every port. The input "gid" is the GID0 of the port the 4346 * trap came in on; we just need to act on traps that came 4347 * to our port, meaning the port on which the ipoib interface 4348 * resides. Since ipoib uses GID0 of the port, we just match 4349 * the gids to check whether we need to handle the trap. 4350 */ 4351 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4352 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4353 return; 4354 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4355 4356 DPRINT(10, "ibd_notices_handler : %d\n", code); 4357 4358 switch (code) { 4359 case IBT_SM_EVENT_UNAVAILABLE: 4360 /* 4361 * If we are in promiscuous mode or have 4362 * sendnonmembers, we need to print a warning 4363 * message right now. Else, just store the 4364 * information, print when we enter promiscuous 4365 * mode or attempt nonmember send. We might 4366 * also want to stop caching sendnonmember. 4367 */ 4368 ibd_print_warn(state, "IBA multicast support " 4369 "degraded due to unavailability of multicast " 4370 "traps"); 4371 break; 4372 case IBT_SM_EVENT_AVAILABLE: 4373 /* 4374 * If we printed a warning message above or 4375 * while trying to nonmember send or get into 4376 * promiscuous mode, print an okay message. 4377 */ 4378 ibd_print_warn(state, "IBA multicast support " 4379 "restored due to availability of multicast " 4380 "traps"); 4381 break; 4382 case IBT_SM_EVENT_MCG_CREATED: 4383 case IBT_SM_EVENT_MCG_DELETED: 4384 /* 4385 * If it is a "deleted" event and we are in late hca 4386 * init, nothing to do. 4387 */ 4388 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4389 IBD_DRV_IN_LATE_HCA_INIT) && (code == 4390 IBT_SM_EVENT_MCG_DELETED)) { 4391 break; 4392 } 4393 /* 4394 * Common processing of creation/deletion traps. 4395 * First check if the instance is being 4396 * [de]initialized; back off then, without doing 4397 * anything more, since we are not sure if the 4398 * async thread is around, or whether we might 4399 * be racing with the detach code in ibd_m_stop() 4400 * that scans the mcg list. 
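/*
 * Editor's aside: the IPoIB IPv4 MGID prefix these traps carry encodes
 * the pkey. ibd_find_bgroup() composes the prefix as
 * (IB_MCGID_IPV4_PREFIX << 32) | (scope << 48) | (pkey << 16), and
 * ibd_async_trap() below recovers it with (gid_prefix >> 16) & 0xffff.
 * A hedged worked example (pkey value invented): with pkey 0x8001,
 * bits 16..31 of the prefix hold 0x8001, so
 * ((prefix >> 16) & 0xffff) == 0x8001.
 */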
4401 */ 4402 if (!ibd_async_safe(state)) 4403 return; 4404 4405 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4406 req->rq_gid = event->sm_notice_gid; 4407 req->rq_ptr = (void *)code; 4408 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4409 break; 4410 } 4411 } 4412 4413 static void 4414 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4415 { 4416 ib_gid_t mgid = req->rq_gid; 4417 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4418 int ret; 4419 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; 4420 4421 DPRINT(10, "ibd_async_trap : %d\n", code); 4422 4423 /* 4424 * Check if we have already joined the IPoIB broadcast group for our 4425 * PKEY. If joined, perform the rest of the operation. 4426 * Else, the interface is not initialised. Do the initialisation here 4427 * by calling ibd_start() and return. 4428 */ 4429 4430 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4431 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && 4432 (code == IBT_SM_EVENT_MCG_CREATED)) { 4433 /* 4434 * If we are in late HCA init and a notification for the 4435 * creation of a MCG came in, check if it is the IPoIB MCG for 4436 * this pkey. If not, return. 4437 */ 4438 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != 4439 state->id_pkey)) { 4440 ibd_async_done(state); 4441 return; 4442 } 4443 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4444 /* 4445 * Check if there is still a necessity to start the interface. 4446 * It is possible that the user attempted unplumb at just about 4447 * the same time, and if unplumb succeeded, we have nothing to 4448 * do. 4449 */ 4450 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4451 IBD_DRV_IN_LATE_HCA_INIT) && 4452 ((ret = ibd_start(state)) != 0)) { 4453 DPRINT(10, "ibd_async_trap: cannot start from late HCA " 4454 "init, ret=%d", ret); 4455 } 4456 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4457 ibd_async_done(state); 4458 return; 4459 } 4460 4461 /* 4462 * Atomically search the nonmember and sendonlymember lists and 4463 * delete. 4464 */ 4465 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4466 4467 if (state->id_prom_op == IBD_OP_COMPLETED) { 4468 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4469 4470 /* 4471 * If in promiscuous mode, try to join/attach to the new 4472 * mcg. Given the unreliable out-of-order mode of trap 4473 * delivery, we can never be sure whether it is a problem 4474 * if the join fails. Thus, we warn the admin of a failure 4475 * if this was a creation trap. Note that the trap might 4476 * actually be reporting a long past event, and the mcg 4477 * might already have been deleted, thus we might be warning 4478 * in vain. 4479 */ 4480 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4481 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4482 ibd_print_warn(state, "IBA promiscuous mode missed " 4483 "new multicast gid %016llx:%016llx", 4484 (u_longlong_t)mgid.gid_prefix, 4485 (u_longlong_t)mgid.gid_guid); 4486 } 4487 4488 /* 4489 * Free the request slot allocated by the subnet event thread. 4490 */ 4491 ibd_async_done(state); 4492 } 4493 4494 /* 4495 * GLDv3 entry point to get capabilities. 
4496 */ 4497 static boolean_t 4498 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4499 { 4500 ibd_state_t *state = arg; 4501 4502 if (state->id_type == IBD_PORT_DRIVER) 4503 return (B_FALSE); 4504 4505 switch (cap) { 4506 case MAC_CAPAB_HCKSUM: { 4507 uint32_t *txflags = cap_data; 4508 4509 /* 4510 * We either do full checksum or not do it at all 4511 */ 4512 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4513 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4514 else 4515 return (B_FALSE); 4516 break; 4517 } 4518 4519 case MAC_CAPAB_LSO: { 4520 mac_capab_lso_t *cap_lso = cap_data; 4521 4522 /* 4523 * In addition to the capability and policy, since LSO 4524 * relies on hw checksum, we'll not enable LSO if we 4525 * don't have hw checksum. Of course, if the HCA doesn't 4526 * provide the reserved lkey capability, enabling LSO will 4527 * actually affect performance adversely, so we'll disable 4528 * LSO even for that case. 4529 */ 4530 if (!state->id_lso_policy || !state->id_lso_capable) 4531 return (B_FALSE); 4532 4533 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4534 return (B_FALSE); 4535 4536 if (state->id_hca_res_lkey_capab == 0) { 4537 ibd_print_warn(state, "no reserved-lkey capability, " 4538 "disabling LSO"); 4539 return (B_FALSE); 4540 } 4541 4542 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4543 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4544 break; 4545 } 4546 4547 default: 4548 return (B_FALSE); 4549 } 4550 4551 return (B_TRUE); 4552 } 4553 4554 /* 4555 * callback function for set/get of properties 4556 */ 4557 static int 4558 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4559 uint_t pr_valsize, const void *pr_val) 4560 { 4561 ibd_state_t *state = arg; 4562 int err = 0; 4563 uint32_t link_mode; 4564 4565 /* Cannot set properties on a port driver */ 4566 if (state->id_type == IBD_PORT_DRIVER) { 4567 return (ENOTSUP); 4568 } 4569 4570 switch (pr_num) { 4571 case MAC_PROP_IB_LINKMODE: 4572 if (state->id_mac_state & IBD_DRV_STARTED) { 4573 err = EBUSY; 4574 break; 4575 } 4576 if (pr_val == NULL) { 4577 err = EINVAL; 4578 break; 4579 } 4580 bcopy(pr_val, &link_mode, sizeof (link_mode)); 4581 if (link_mode != IBD_LINK_MODE_UD && 4582 link_mode != IBD_LINK_MODE_RC) { 4583 err = EINVAL; 4584 } else { 4585 if (link_mode == IBD_LINK_MODE_RC) { 4586 if (state->id_enable_rc) { 4587 return (0); 4588 } 4589 state->id_enable_rc = 1; 4590 /* inform MAC framework of new MTU */ 4591 err = mac_maxsdu_update(state->id_mh, 4592 state->rc_mtu - IPOIB_HDRSIZE); 4593 } else { 4594 if (!state->id_enable_rc) { 4595 return (0); 4596 } 4597 state->id_enable_rc = 0; 4598 err = mac_maxsdu_update(state->id_mh, 4599 state->id_mtu - IPOIB_HDRSIZE); 4600 } 4601 (void) ibd_record_capab(state); 4602 mac_capab_update(state->id_mh); 4603 } 4604 break; 4605 case MAC_PROP_PRIVATE: 4606 err = ibd_set_priv_prop(state, pr_name, 4607 pr_valsize, pr_val); 4608 break; 4609 default: 4610 err = ENOTSUP; 4611 break; 4612 } 4613 return (err); 4614 } 4615 4616 static int 4617 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4618 uint_t pr_valsize, void *pr_val) 4619 { 4620 ibd_state_t *state = arg; 4621 int err = 0; 4622 4623 switch (pr_num) { 4624 case MAC_PROP_MTU: 4625 break; 4626 default: 4627 if (state->id_type == IBD_PORT_DRIVER) { 4628 return (ENOTSUP); 4629 } 4630 break; 4631 } 4632 4633 switch (pr_num) { 4634 case MAC_PROP_IB_LINKMODE: 4635 *(uint_t *)pr_val = state->id_enable_rc; 4636 break; 4637 case MAC_PROP_PRIVATE: 4638 err = 
ibd_get_priv_prop(state, pr_name, pr_valsize, 4639 pr_val); 4640 break; 4641 default: 4642 err = ENOTSUP; 4643 break; 4644 } 4645 return (err); 4646 } 4647 4648 static void 4649 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4650 mac_prop_info_handle_t prh) 4651 { 4652 ibd_state_t *state = arg; 4653 4654 switch (pr_num) { 4655 case MAC_PROP_IB_LINKMODE: { 4656 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); 4657 break; 4658 } 4659 case MAC_PROP_MTU: { 4660 uint32_t min, max; 4661 if (state->id_type == IBD_PORT_DRIVER) { 4662 min = 1500; 4663 max = IBD_DEF_RC_MAX_SDU; 4664 } else if (state->id_enable_rc) { 4665 min = max = IBD_DEF_RC_MAX_SDU; 4666 } else { 4667 min = max = state->id_mtu - IPOIB_HDRSIZE; 4668 } 4669 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4670 mac_prop_info_set_range_uint32(prh, min, max); 4671 break; 4672 } 4673 case MAC_PROP_PRIVATE: { 4674 char valstr[64]; 4675 int value; 4676 4677 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4678 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4679 return; 4680 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4681 value = IBD_DEF_COALESCE_COMPLETIONS; 4682 } else if (strcmp(pr_name, 4683 "_ibd_create_broadcast_group") == 0) { 4684 value = IBD_DEF_CREATE_BCAST_GROUP; 4685 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4686 value = IBD_DEF_HASH_SIZE; 4687 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4688 value = IBD_DEF_LSO_POLICY; 4689 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4690 value = IBD_DEF_NUM_AH; 4691 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4692 value = IBD_DEF_NUM_LSO_BUFS; 4693 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4694 value = IBD_DEF_RC_ENABLE_SRQ; 4695 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4696 value = IBD_DEF_RC_NUM_RWQE; 4697 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4698 value = IBD_DEF_RC_NUM_SRQ; 4699 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4700 value = IBD_DEF_RC_NUM_SWQE; 4701 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4702 value = IBD_DEF_RC_RX_COMP_COUNT; 4703 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4704 value = IBD_DEF_RC_RX_COMP_USEC; 4705 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4706 value = IBD_DEF_RC_RX_COPY_THRESH; 4707 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4708 value = IBD_DEF_RC_RX_RWQE_THRESH; 4709 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4710 value = IBD_DEF_RC_TX_COMP_COUNT; 4711 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4712 value = IBD_DEF_RC_TX_COMP_USEC; 4713 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4714 value = IBD_DEF_RC_TX_COPY_THRESH; 4715 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4716 value = IBD_DEF_UD_NUM_RWQE; 4717 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4718 value = IBD_DEF_UD_NUM_SWQE; 4719 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4720 value = IBD_DEF_UD_RX_COMP_COUNT; 4721 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4722 value = IBD_DEF_UD_RX_COMP_USEC; 4723 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4724 value = IBD_DEF_UD_TX_COMP_COUNT; 4725 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4726 value = IBD_DEF_UD_TX_COMP_USEC; 4727 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4728 value = IBD_DEF_UD_TX_COPY_THRESH; 4729 } else { 4730 return; 4731 } 4732 4733 (void) snprintf(valstr, sizeof 
(valstr), "%d", value); 4734 mac_prop_info_set_default_str(prh, valstr); 4735 break; 4736 } 4737 } /* switch (pr_num) */ 4738 } 4739 4740 /* ARGSUSED2 */ 4741 static int 4742 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, 4743 uint_t pr_valsize, const void *pr_val) 4744 { 4745 int err = 0; 4746 long result; 4747 4748 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4749 if (pr_val == NULL) { 4750 return (EINVAL); 4751 } 4752 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4753 if (result < 0 || result > 1) { 4754 err = EINVAL; 4755 } else { 4756 state->id_allow_coalesce_comp_tuning = (result == 1) ? 4757 B_TRUE: B_FALSE; 4758 } 4759 return (err); 4760 } 4761 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4762 if (state->id_mac_state & IBD_DRV_STARTED) { 4763 return (EBUSY); 4764 } 4765 if (pr_val == NULL) { 4766 return (EINVAL); 4767 } 4768 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4769 if (result < 0 || result > 1) { 4770 err = EINVAL; 4771 } else { 4772 state->id_create_broadcast_group = (result == 1) ? 4773 B_TRUE: B_FALSE; 4774 } 4775 return (err); 4776 } 4777 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4778 if (state->id_mac_state & IBD_DRV_STARTED) { 4779 return (EBUSY); 4780 } 4781 if (pr_val == NULL) { 4782 return (EINVAL); 4783 } 4784 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4785 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { 4786 err = EINVAL; 4787 } else { 4788 state->id_hash_size = (uint32_t)result; 4789 } 4790 return (err); 4791 } 4792 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4793 if (state->id_mac_state & IBD_DRV_STARTED) { 4794 return (EBUSY); 4795 } 4796 if (pr_val == NULL) { 4797 return (EINVAL); 4798 } 4799 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4800 if (result < 0 || result > 1) { 4801 err = EINVAL; 4802 } else { 4803 state->id_lso_policy = (result == 1) ? 4804 B_TRUE: B_FALSE; 4805 } 4806 mac_capab_update(state->id_mh); 4807 return (err); 4808 } 4809 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4810 if (state->id_mac_state & IBD_DRV_STARTED) { 4811 return (EBUSY); 4812 } 4813 if (pr_val == NULL) { 4814 return (EINVAL); 4815 } 4816 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4817 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { 4818 err = EINVAL; 4819 } else { 4820 state->id_num_ah = (uint32_t)result; 4821 } 4822 return (err); 4823 } 4824 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4825 if (state->id_mac_state & IBD_DRV_STARTED) { 4826 return (EBUSY); 4827 } 4828 if (!state->id_lso_policy || !state->id_lso_capable) { 4829 return (EINVAL); 4830 } 4831 if (pr_val == NULL) { 4832 return (EINVAL); 4833 } 4834 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4835 if (result < IBD_MIN_NUM_LSO_BUFS || 4836 result > IBD_MAX_NUM_LSO_BUFS) { 4837 err = EINVAL; 4838 } else { 4839 state->id_num_lso_bufs = (uint32_t)result; 4840 } 4841 return (err); 4842 } 4843 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4844 if (state->id_mac_state & IBD_DRV_STARTED) { 4845 return (EBUSY); 4846 } 4847 if (pr_val == NULL) { 4848 return (EINVAL); 4849 } 4850 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4851 if (result < 0 || result > 1) { 4852 err = EINVAL; 4853 } else { 4854 state->rc_enable_srq = (result == 1) ? 
4855 B_TRUE: B_FALSE; 4856 } 4857 if (!state->rc_enable_srq) { 4858 state->id_rc_num_srq = 0; 4859 } 4860 return (err); 4861 } 4862 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4863 if (state->id_mac_state & IBD_DRV_STARTED) { 4864 return (EBUSY); 4865 } 4866 if (pr_val == NULL) { 4867 return (EINVAL); 4868 } 4869 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4870 if (result < IBD_MIN_RC_NUM_RWQE || 4871 result > IBD_MAX_RC_NUM_RWQE) { 4872 err = EINVAL; 4873 } else { 4874 state->id_rc_num_rwqe = (uint32_t)result; 4875 if (state->id_allow_coalesce_comp_tuning && 4876 state->id_rc_rx_comp_count > state->id_rc_num_rwqe) 4877 state->id_rc_rx_comp_count = 4878 state->id_rc_num_rwqe; 4879 if (state->id_rc_num_srq > state->id_rc_num_rwqe) 4880 state->id_rc_num_srq = 4881 state->id_rc_num_rwqe - 1; 4882 /* 4883 * If rx_rwqe_threshold is greater than the number of 4884 * rwqes, pull it back to 25% of number of rwqes. 4885 */ 4886 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) 4887 state->id_rc_rx_rwqe_thresh = 4888 (state->id_rc_num_rwqe >> 2); 4889 4890 } 4891 return (err); 4892 } 4893 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4894 if (state->id_mac_state & IBD_DRV_STARTED) { 4895 return (EBUSY); 4896 } 4897 if (pr_val == NULL) { 4898 return (EINVAL); 4899 } 4900 if (!state->rc_enable_srq) 4901 return (EINVAL); 4902 4903 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4904 if (result < IBD_MIN_RC_NUM_SRQ || 4905 result >= state->id_rc_num_rwqe) { 4906 err = EINVAL; 4907 } else 4908 state->id_rc_num_srq = (uint32_t)result; 4909 return (err); 4910 } 4911 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4912 if (state->id_mac_state & IBD_DRV_STARTED) { 4913 return (EBUSY); 4914 } 4915 if (pr_val == NULL) { 4916 return (EINVAL); 4917 } 4918 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4919 if (result < IBD_MIN_RC_NUM_SWQE || 4920 result > IBD_MAX_RC_NUM_SWQE) { 4921 err = EINVAL; 4922 } else { 4923 state->id_rc_num_swqe = (uint32_t)result; 4924 if (state->id_allow_coalesce_comp_tuning && 4925 state->id_rc_tx_comp_count > state->id_rc_num_swqe) 4926 state->id_rc_tx_comp_count = 4927 state->id_rc_num_swqe; 4928 } 4929 return (err); 4930 } 4931 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4932 if (!state->id_allow_coalesce_comp_tuning) { 4933 return (ENOTSUP); 4934 } 4935 if (pr_val == NULL) { 4936 return (EINVAL); 4937 } 4938 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4939 if (result < 1 || result > state->id_rc_num_rwqe) { 4940 err = EINVAL; 4941 } else { 4942 state->id_rc_rx_comp_count = (uint32_t)result; 4943 } 4944 return (err); 4945 } 4946 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4947 if (!state->id_allow_coalesce_comp_tuning) { 4948 return (ENOTSUP); 4949 } 4950 if (pr_val == NULL) { 4951 return (EINVAL); 4952 } 4953 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4954 if (result < 1) { 4955 err = EINVAL; 4956 } else { 4957 state->id_rc_rx_comp_usec = (uint32_t)result; 4958 } 4959 return (err); 4960 } 4961 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4962 if (state->id_mac_state & IBD_DRV_STARTED) { 4963 return (EBUSY); 4964 } 4965 if (pr_val == NULL) { 4966 return (EINVAL); 4967 } 4968 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4969 if (result < IBD_MIN_RC_RX_COPY_THRESH || 4970 result > state->rc_mtu) { 4971 err = EINVAL; 4972 } else { 4973 state->id_rc_rx_copy_thresh = (uint32_t)result; 4974 } 4975 return (err); 4976 } 4977 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4978 if 
(state->id_mac_state & IBD_DRV_STARTED) { 4979 return (EBUSY); 4980 } 4981 if (pr_val == NULL) { 4982 return (EINVAL); 4983 } 4984 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4985 if (result < IBD_MIN_RC_RX_RWQE_THRESH || 4986 result >= state->id_rc_num_rwqe) { 4987 err = EINVAL; 4988 } else { 4989 state->id_rc_rx_rwqe_thresh = (uint32_t)result; 4990 } 4991 return (err); 4992 } 4993 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4994 if (!state->id_allow_coalesce_comp_tuning) { 4995 return (ENOTSUP); 4996 } 4997 if (pr_val == NULL) { 4998 return (EINVAL); 4999 } 5000 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5001 if (result < 1 || result > state->id_rc_num_swqe) { 5002 err = EINVAL; 5003 } else { 5004 state->id_rc_tx_comp_count = (uint32_t)result; 5005 } 5006 return (err); 5007 } 5008 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5009 if (!state->id_allow_coalesce_comp_tuning) { 5010 return (ENOTSUP); 5011 } 5012 if (pr_val == NULL) { 5013 return (EINVAL); 5014 } 5015 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5016 if (result < 1) 5017 err = EINVAL; 5018 else { 5019 state->id_rc_tx_comp_usec = (uint32_t)result; 5020 } 5021 return (err); 5022 } 5023 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5024 if (state->id_mac_state & IBD_DRV_STARTED) { 5025 return (EBUSY); 5026 } 5027 if (pr_val == NULL) { 5028 return (EINVAL); 5029 } 5030 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5031 if (result < IBD_MIN_RC_TX_COPY_THRESH || 5032 result > state->rc_mtu) { 5033 err = EINVAL; 5034 } else { 5035 state->id_rc_tx_copy_thresh = (uint32_t)result; 5036 } 5037 return (err); 5038 } 5039 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5040 if (state->id_mac_state & IBD_DRV_STARTED) { 5041 return (EBUSY); 5042 } 5043 if (pr_val == NULL) { 5044 return (EINVAL); 5045 } 5046 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5047 if (result < IBD_MIN_UD_NUM_RWQE || 5048 result > IBD_MAX_UD_NUM_RWQE) { 5049 err = EINVAL; 5050 } else { 5051 if (result > state->id_hca_max_chan_sz) { 5052 state->id_ud_num_rwqe = 5053 state->id_hca_max_chan_sz; 5054 } else { 5055 state->id_ud_num_rwqe = (uint32_t)result; 5056 } 5057 if (state->id_allow_coalesce_comp_tuning && 5058 state->id_ud_rx_comp_count > state->id_ud_num_rwqe) 5059 state->id_ud_rx_comp_count = 5060 state->id_ud_num_rwqe; 5061 } 5062 return (err); 5063 } 5064 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5065 if (state->id_mac_state & IBD_DRV_STARTED) { 5066 return (EBUSY); 5067 } 5068 if (pr_val == NULL) { 5069 return (EINVAL); 5070 } 5071 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5072 if (result < IBD_MIN_UD_NUM_SWQE || 5073 result > IBD_MAX_UD_NUM_SWQE) { 5074 err = EINVAL; 5075 } else { 5076 if (result > state->id_hca_max_chan_sz) { 5077 state->id_ud_num_swqe = 5078 state->id_hca_max_chan_sz; 5079 } else { 5080 state->id_ud_num_swqe = (uint32_t)result; 5081 } 5082 if (state->id_allow_coalesce_comp_tuning && 5083 state->id_ud_tx_comp_count > state->id_ud_num_swqe) 5084 state->id_ud_tx_comp_count = 5085 state->id_ud_num_swqe; 5086 } 5087 return (err); 5088 } 5089 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5090 if (!state->id_allow_coalesce_comp_tuning) { 5091 return (ENOTSUP); 5092 } 5093 if (pr_val == NULL) { 5094 return (EINVAL); 5095 } 5096 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5097 if (result < 1 || result > state->id_ud_num_rwqe) { 5098 err = EINVAL; 5099 } else { 5100 state->id_ud_rx_comp_count = (uint32_t)result; 5101 } 5102 return (err); 5103 } 
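		/*
		 * Editor's note: every handler in this function follows the
		 * same shape -- reject while the driver is started (EBUSY) or
		 * when tuning is disallowed (ENOTSUP), parse with
		 * ddi_strtol(), range-check, then store the uint32. A hedged
		 * sketch of that shape as a helper (names invented):
		 *
		 *	static int
		 *	demo_set_bounded_prop(const char *pr_val, long lo,
		 *	    long hi, uint32_t *out)
		 *	{
		 *		long result;
		 *
		 *		if (pr_val == NULL)
		 *			return (EINVAL);
		 *		(void) ddi_strtol(pr_val, (char **)NULL, 0,
		 *		    &result);
		 *		if (result < lo || result > hi)
		 *			return (EINVAL);
		 *		*out = (uint32_t)result;
		 *		return (0);
		 *	}
		 */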
5104 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5105 if (!state->id_allow_coalesce_comp_tuning) { 5106 return (ENOTSUP); 5107 } 5108 if (pr_val == NULL) { 5109 return (EINVAL); 5110 } 5111 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5112 if (result < 1) { 5113 err = EINVAL; 5114 } else { 5115 state->id_ud_rx_comp_usec = (uint32_t)result; 5116 } 5117 return (err); 5118 } 5119 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5120 if (!state->id_allow_coalesce_comp_tuning) { 5121 return (ENOTSUP); 5122 } 5123 if (pr_val == NULL) { 5124 return (EINVAL); 5125 } 5126 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5127 if (result < 1 || result > state->id_ud_num_swqe) { 5128 err = EINVAL; 5129 } else { 5130 state->id_ud_tx_comp_count = (uint32_t)result; 5131 } 5132 return (err); 5133 } 5134 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5135 if (!state->id_allow_coalesce_comp_tuning) { 5136 return (ENOTSUP); 5137 } 5138 if (pr_val == NULL) { 5139 return (EINVAL); 5140 } 5141 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5142 if (result < 1) { 5143 err = EINVAL; 5144 } else { 5145 state->id_ud_tx_comp_usec = (uint32_t)result; 5146 } 5147 return (err); 5148 } 5149 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5150 if (state->id_mac_state & IBD_DRV_STARTED) { 5151 return (EBUSY); 5152 } 5153 if (pr_val == NULL) { 5154 return (EINVAL); 5155 } 5156 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5157 if (result < IBD_MIN_UD_TX_COPY_THRESH || 5158 result > IBD_MAX_UD_TX_COPY_THRESH) { 5159 err = EINVAL; 5160 } else { 5161 state->id_ud_tx_copy_thresh = (uint32_t)result; 5162 } 5163 return (err); 5164 } 5165 return (ENOTSUP); 5166 } 5167 5168 static int 5169 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, 5170 void *pr_val) 5171 { 5172 int err = ENOTSUP; 5173 int value; 5174 5175 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 5176 value = state->id_bgroup_present; 5177 err = 0; 5178 goto done; 5179 } 5180 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 5181 value = state->id_allow_coalesce_comp_tuning; 5182 err = 0; 5183 goto done; 5184 } 5185 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 5186 value = state->id_create_broadcast_group; 5187 err = 0; 5188 goto done; 5189 } 5190 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 5191 value = state->id_hash_size; 5192 err = 0; 5193 goto done; 5194 } 5195 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 5196 value = state->id_lso_policy; 5197 err = 0; 5198 goto done; 5199 } 5200 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 5201 value = state->id_num_ah; 5202 err = 0; 5203 goto done; 5204 } 5205 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 5206 value = state->id_num_lso_bufs; 5207 err = 0; 5208 goto done; 5209 } 5210 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 5211 value = state->rc_enable_srq; 5212 err = 0; 5213 goto done; 5214 } 5215 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 5216 value = state->id_rc_num_rwqe; 5217 err = 0; 5218 goto done; 5219 } 5220 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 5221 value = state->id_rc_num_srq; 5222 err = 0; 5223 goto done; 5224 } 5225 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 5226 value = state->id_rc_num_swqe; 5227 err = 0; 5228 goto done; 5229 } 5230 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 5231 value = state->id_rc_rx_comp_count; 5232 err = 0; 5233 goto done; 5234 } 5235 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 5236 value = state->id_rc_rx_comp_usec; 5237 err = 0; 5238 
goto done; 5239 } 5240 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 5241 value = state->id_rc_rx_copy_thresh; 5242 err = 0; 5243 goto done; 5244 } 5245 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 5246 value = state->id_rc_rx_rwqe_thresh; 5247 err = 0; 5248 goto done; 5249 } 5250 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5251 value = state->id_rc_tx_comp_count; 5252 err = 0; 5253 goto done; 5254 } 5255 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5256 value = state->id_rc_tx_comp_usec; 5257 err = 0; 5258 goto done; 5259 } 5260 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5261 value = state->id_rc_tx_copy_thresh; 5262 err = 0; 5263 goto done; 5264 } 5265 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5266 value = state->id_ud_num_rwqe; 5267 err = 0; 5268 goto done; 5269 } 5270 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5271 value = state->id_ud_num_swqe; 5272 err = 0; 5273 goto done; 5274 } 5275 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5276 value = state->id_ud_rx_comp_count; 5277 err = 0; 5278 goto done; 5279 } 5280 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5281 value = state->id_ud_rx_comp_usec; 5282 err = 0; 5283 goto done; 5284 } 5285 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5286 value = state->id_ud_tx_comp_count; 5287 err = 0; 5288 goto done; 5289 } 5290 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5291 value = state->id_ud_tx_comp_usec; 5292 err = 0; 5293 goto done; 5294 } 5295 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5296 value = state->id_ud_tx_copy_thresh; 5297 err = 0; 5298 goto done; 5299 } 5300 done: 5301 if (err == 0) { 5302 (void) snprintf(pr_val, pr_valsize, "%d", value); 5303 } 5304 return (err); 5305 } 5306 5307 static int 5308 ibd_get_port_details(ibd_state_t *state) 5309 { 5310 ibt_hca_portinfo_t *port_infop; 5311 ibt_status_t ret; 5312 uint_t psize, port_infosz; 5313 5314 mutex_enter(&state->id_link_mutex); 5315 5316 /* 5317 * Query for port information 5318 */ 5319 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 5320 &port_infop, &psize, &port_infosz); 5321 if ((ret != IBT_SUCCESS) || (psize != 1)) { 5322 mutex_exit(&state->id_link_mutex); 5323 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 5324 "failed, ret=%d", ret); 5325 return (ENETDOWN); 5326 } 5327 5328 /* 5329 * If the link is active, verify the pkey 5330 */ 5331 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { 5332 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 5333 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 5334 state->id_link_state = LINK_STATE_DOWN; 5335 } else { 5336 state->id_link_state = LINK_STATE_UP; 5337 } 5338 state->id_mtu = (128 << port_infop->p_mtu); 5339 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5340 state->id_sgid = *port_infop->p_sgid_tbl; 5341 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5342 /* 5343 * Now that the port is active, record the port speed 5344 */ 5345 state->id_link_speed = ibd_get_portspeed(state); 5346 } else { 5347 /* Make sure that these are handled in PORT_UP/CHANGE */ 5348 state->id_mtu = 0; 5349 state->id_link_state = LINK_STATE_DOWN; 5350 state->id_link_speed = 0; 5351 } 5352 mutex_exit(&state->id_link_mutex); 5353 ibt_free_portinfo(port_infop, port_infosz); 5354 5355 return (0); 5356 } 5357 5358 static int 5359 ibd_alloc_cqs(ibd_state_t *state) 5360 { 5361 ibt_hca_attr_t hca_attrs; 5362 ibt_cq_attr_t cq_attr; 5363 ibt_status_t ret; 5364 uint32_t real_size; 5365 uint_t num_rwqe_change = 0; 5366 uint_t 
num_swqe_change = 0; 5367 5368 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 5369 ASSERT(ret == IBT_SUCCESS); 5370 5371 /* 5372 * Allocate Rx/combined CQ: 5373 * Theoretically, there is no point in having more than #rwqe 5374 * plus #swqe cqe's, except that the CQ will be signaled for 5375 * overflow when the last wqe completes, if none of the previous 5376 * cqe's have been polled. Thus, we allocate just a few less wqe's 5377 * to make sure such overflow does not occur. 5378 */ 5379 cq_attr.cq_sched = NULL; 5380 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 5381 5382 /* 5383 * Allocate Receive CQ. 5384 */ 5385 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { 5386 cq_attr.cq_size = state->id_ud_num_rwqe + 1; 5387 } else { 5388 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5389 num_rwqe_change = state->id_ud_num_rwqe; 5390 state->id_ud_num_rwqe = cq_attr.cq_size - 1; 5391 } 5392 5393 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5394 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 5395 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 5396 "failed, ret=%d\n", ret); 5397 return (DDI_FAILURE); 5398 } 5399 5400 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, 5401 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { 5402 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 5403 "moderation failed, ret=%d\n", ret); 5404 } 5405 5406 /* make the #rx wc's the same as max rx chain size */ 5407 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 5408 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 5409 state->id_rxwcs_size, KM_SLEEP); 5410 5411 /* 5412 * Allocate Send CQ. 5413 */ 5414 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { 5415 cq_attr.cq_size = state->id_ud_num_swqe + 1; 5416 } else { 5417 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5418 num_swqe_change = state->id_ud_num_swqe; 5419 state->id_ud_num_swqe = cq_attr.cq_size - 1; 5420 } 5421 5422 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5423 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 5424 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 5425 "failed, ret=%d\n", ret); 5426 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 5427 state->id_rxwcs_size); 5428 (void) ibt_free_cq(state->id_rcq_hdl); 5429 return (DDI_FAILURE); 5430 } 5431 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, 5432 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { 5433 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 5434 "moderation failed, ret=%d\n", ret); 5435 } 5436 5437 state->id_txwcs_size = IBD_TX_POLL_THRESH; 5438 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 5439 state->id_txwcs_size, KM_SLEEP); 5440 5441 /* 5442 * Print message in case we could not allocate as many wqe's 5443 * as was requested. 
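	/*
	 * Editor's aside: a hedged worked example of the clamp above
	 * (numbers invented). With hca_max_cq_sz = 4096 and
	 * id_ud_num_rwqe = 8000, the CQ is sized to 4096 and
	 * id_ud_num_rwqe is pulled back to 4095 (cq_size - 1); with
	 * id_ud_num_rwqe = 4000, the CQ is simply sized to 4001 and the
	 * WQE count is left alone. In sketch form:
	 *
	 *	if (hca_max_cq_sz >= nwqe + 1) {
	 *		cq_size = nwqe + 1;		(nwqe unchanged)
	 *	} else {
	 *		cq_size = hca_max_cq_sz;
	 *		nwqe = cq_size - 1;		(warned about below)
	 *	}
	 */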
5444 */ 5445 if (num_rwqe_change) { 5446 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 5447 "%d", state->id_ud_num_rwqe, num_rwqe_change); 5448 } 5449 if (num_swqe_change) { 5450 ibd_print_warn(state, "Setting #swqe = %d instead of default " 5451 "%d", state->id_ud_num_swqe, num_swqe_change); 5452 } 5453 5454 return (DDI_SUCCESS); 5455 } 5456 5457 static int 5458 ibd_setup_ud_channel(ibd_state_t *state) 5459 { 5460 ibt_ud_chan_alloc_args_t ud_alloc_attr; 5461 ibt_ud_chan_query_attr_t ud_chan_attr; 5462 ibt_status_t ret; 5463 5464 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 5465 if (state->id_hca_res_lkey_capab) 5466 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 5467 if (state->id_lso_policy && state->id_lso_capable) 5468 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 5469 5470 ud_alloc_attr.ud_hca_port_num = state->id_port; 5471 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 5472 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 5473 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; 5474 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; 5475 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 5476 ud_alloc_attr.ud_scq = state->id_scq_hdl; 5477 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 5478 ud_alloc_attr.ud_pd = state->id_pd_hdl; 5479 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 5480 ud_alloc_attr.ud_clone_chan = NULL; 5481 5482 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 5483 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 5484 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 5485 "failed, ret=%d\n", ret); 5486 return (DDI_FAILURE); 5487 } 5488 5489 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 5490 &ud_chan_attr)) != IBT_SUCCESS) { 5491 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 5492 "failed, ret=%d\n", ret); 5493 (void) ibt_free_channel(state->id_chnl_hdl); 5494 return (DDI_FAILURE); 5495 } 5496 5497 state->id_qpnum = ud_chan_attr.ud_qpn; 5498 5499 return (DDI_SUCCESS); 5500 } 5501 5502 static int 5503 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 5504 { 5505 uint32_t progress = state->id_mac_state; 5506 uint_t attempts; 5507 ibt_status_t ret; 5508 ib_gid_t mgid; 5509 ibd_mce_t *mce; 5510 uint8_t jstate; 5511 timeout_id_t tid; 5512 5513 if (atomic_dec_32_nv(&state->id_running) != 0) 5514 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 5515 5516 /* 5517 * Before we try to stop/undo whatever we did in ibd_start(), 5518 * we need to mark the link state appropriately to prevent the 5519 * ip layer from using this instance for any new transfers. Note 5520 * that if the original state of the link was "up" when we're 5521 * here, we'll set the final link state to "unknown", to behave 5522 * in the same fashion as other ethernet drivers. 
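	/*
	 * Editor's note: the rest of this function unwinds only the stages
	 * recorded in id_mac_state -- a "progress bits" idiom, typically
	 * undoing stages in roughly the reverse order of their setup. A
	 * minimal hedged model (flag and function names invented):
	 *
	 *	uint32_t progress = state->id_mac_state;
	 *
	 *	if (progress & DEMO_DRV_STAGE_B) {
	 *		undo_stage_b(state);
	 *		state->id_mac_state &= (~DEMO_DRV_STAGE_B);
	 *	}
	 *	if (progress & DEMO_DRV_STAGE_A) {
	 *		undo_stage_a(state);
	 *		state->id_mac_state &= (~DEMO_DRV_STAGE_A);
	 *	}
	 */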
5523 */ 5524 mutex_enter(&state->id_link_mutex); 5525 if (cur_link_state == LINK_STATE_DOWN) { 5526 state->id_link_state = cur_link_state; 5527 } else { 5528 state->id_link_state = LINK_STATE_UNKNOWN; 5529 } 5530 mutex_exit(&state->id_link_mutex); 5531 bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); 5532 mac_link_update(state->id_mh, state->id_link_state); 5533 5534 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 5535 if (progress & IBD_DRV_STARTED) { 5536 state->id_mac_state &= (~IBD_DRV_STARTED); 5537 } 5538 5539 if (progress & IBD_DRV_IN_LATE_HCA_INIT) { 5540 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); 5541 } 5542 5543 /* Stop listen under Reliable Connected Mode */ 5544 if (progress & IBD_DRV_RC_LISTEN) { 5545 ASSERT(state->id_enable_rc); 5546 if (state->rc_listen_hdl != NULL) { 5547 ibd_rc_stop_listen(state); 5548 } 5549 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 5550 } 5551 5552 /* Stop timeout routine */ 5553 if (progress & IBD_DRV_RC_TIMEOUT) { 5554 ASSERT(state->id_enable_rc); 5555 mutex_enter(&state->rc_timeout_lock); 5556 state->rc_timeout_start = B_FALSE; 5557 tid = state->rc_timeout; 5558 state->rc_timeout = 0; 5559 mutex_exit(&state->rc_timeout_lock); 5560 if (tid != 0) 5561 (void) untimeout(tid); 5562 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT); 5563 } 5564 5565 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 5566 attempts = 100; 5567 while (state->id_ah_op == IBD_OP_ONGOING) { 5568 /* 5569 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB 5570 * port is connecting to a remote IPoIB port. Wait for 5571 * the end of this connecting operation. 5572 */ 5573 delay(drv_usectohz(100000)); 5574 if (--attempts == 0) { 5575 state->rc_stop_connect++; 5576 DPRINT(40, "ibd_undo_start: connecting"); 5577 break; 5578 } 5579 } 5580 mutex_enter(&state->id_sched_lock); 5581 state->id_sched_needed = 0; 5582 mutex_exit(&state->id_sched_lock); 5583 (void) ibd_rc_close_all_chan(state); 5584 } 5585 5586 /* 5587 * First, stop receive interrupts; this stops the driver from 5588 * handing up buffers to higher layers. Wait for receive buffers 5589 * to be returned and give up after 1 second. 5590 */ 5591 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 5592 attempts = 10; 5593 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 5594 0) > 0) { 5595 delay(drv_usectohz(100000)); 5596 if (--attempts == 0) { 5597 /* 5598 * There are pending bufs with the network 5599 * layer and we have no choice but to wait 5600 * for them to be done with. Reap all the 5601 * Tx/Rx completions that were posted since 5602 * we turned off the notification and 5603 * return failure. 5604 */ 5605 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 5606 DPRINT(2, "ibd_undo_start: " 5607 "reclaiming failed"); 5608 break; 5609 } 5610 } 5611 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 5612 } 5613 5614 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 5615 ibd_rc_fini_tx_largebuf_list(state); 5616 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 5617 } 5618 5619 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 5620 ASSERT(state->id_enable_rc); 5621 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 5622 if (state->id_ah_op == IBD_OP_ONGOING) { 5623 delay(drv_usectohz(10000)); 5624 if (state->id_ah_op == IBD_OP_ONGOING) { 5625 /* 5626 * "state->id_ah_op == IBD_OP_ONGOING" 5627 * means this IPoIB port is connecting 5628 * to a remote IPoIB port. We can't 5629 * delete SRQ here. 
5630 */ 5631 state->rc_stop_connect++; 5632 DPRINT(40, "ibd_undo_start: " 5633 "connecting"); 5634 } else { 5635 ibd_rc_fini_srq_list(state); 5636 state->id_mac_state &= 5637 (~IBD_DRV_RC_SRQ_ALLOCD); 5638 } 5639 } else { 5640 ibd_rc_fini_srq_list(state); 5641 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 5642 } 5643 } else { 5644 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n"); 5645 } 5646 } 5647 5648 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 5649 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 5650 5651 mutex_enter(&state->id_trap_lock); 5652 state->id_trap_stop = B_TRUE; 5653 while (state->id_trap_inprog > 0) 5654 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 5655 mutex_exit(&state->id_trap_lock); 5656 5657 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 5658 } 5659 5660 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 5661 /* 5662 * Flushing the channel ensures that all pending WQE's 5663 * are marked with flush_error and handed to the CQ. It 5664 * does not guarantee the invocation of the CQ handler. 5665 * This call is guaranteed to return successfully for 5666 * UD QPNs. 5667 */ 5668 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 5669 IBT_SUCCESS) { 5670 DPRINT(10, "ibd_undo_start: flush_channel " 5671 "failed, ret=%d", ret); 5672 } 5673 5674 /* 5675 * Give some time for the TX CQ handler to process the 5676 * completions. 5677 */ 5678 attempts = 10; 5679 mutex_enter(&state->id_tx_list.dl_mutex); 5680 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5681 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 5682 != state->id_ud_num_swqe) { 5683 if (--attempts == 0) 5684 break; 5685 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5686 mutex_exit(&state->id_tx_list.dl_mutex); 5687 delay(drv_usectohz(100000)); 5688 mutex_enter(&state->id_tx_list.dl_mutex); 5689 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5690 } 5691 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5692 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 5693 state->id_ud_num_swqe) { 5694 cmn_err(CE_WARN, "tx resources not freed\n"); 5695 } 5696 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5697 mutex_exit(&state->id_tx_list.dl_mutex); 5698 5699 attempts = 10; 5700 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5701 if (--attempts == 0) 5702 break; 5703 delay(drv_usectohz(100000)); 5704 } 5705 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 5706 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5707 cmn_err(CE_WARN, "rx resources not freed\n"); 5708 } 5709 5710 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 5711 } 5712 5713 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 5714 /* 5715 * Drop all residual full/non membership. This includes full 5716 * membership to the broadcast group, and any nonmembership 5717 * acquired during transmits. We do this after the Tx completion 5718 * handlers are done, since those might result in some late 5719 * leaves; this also eliminates a potential race with that 5720 * path wrt the mc full list insert/delete. Trap handling 5721 * has also been suppressed at this point. Thus, no locks 5722 * are required while traversing the mc full list. 
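	 *
	 * Note that the loop below copies out the mgid/jstate and
	 * advances to the next element before calling ibd_leave_group(),
	 * since the leave may free the mce that was just examined.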
5723 	 */
5724 	DPRINT(2, "ibd_undo_start: clear full cache entries");
5725 	mce = list_head(&state->id_mc_full);
5726 	while (mce != NULL) {
5727 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
5728 		jstate = mce->mc_jstate;
5729 		mce = list_next(&state->id_mc_full, mce);
5730 		ibd_leave_group(state, mgid, jstate);
5731 	}
5732 	state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5733 	}
5734
5735 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
5736 		ibd_fini_rxlist(state);
5737 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5738 	}
5739
5740 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
5741 		ibd_fini_txlist(state);
5742 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5743 	}
5744
5745 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5746 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5747 		    IBT_SUCCESS) {
5748 			DPRINT(10, "ibd_undo_start: free_channel "
5749 			    "failed, ret=%d", ret);
5750 		}
5751
5752 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5753 	}
5754
5755 	if (progress & IBD_DRV_CQS_ALLOCD) {
5756 		kmem_free(state->id_txwcs,
5757 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
5758 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5759 		    IBT_SUCCESS) {
5760 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
5761 			    "failed, ret=%d", ret);
5762 		}
5763
5764 		kmem_free(state->id_rxwcs,
5765 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
5766 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5767 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5768 			    "ret=%d", ret);
5769 		}
5770
5771 		state->id_txwcs = NULL;
5772 		state->id_rxwcs = NULL;
5773 		state->id_scq_hdl = NULL;
5774 		state->id_rcq_hdl = NULL;
5775
5776 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5777 	}
5778
5779 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5780 		mutex_enter(&state->id_ac_mutex);
5781 		mod_hash_destroy_hash(state->id_ah_active_hash);
5782 		mutex_exit(&state->id_ac_mutex);
5783 		ibd_acache_fini(state);
5784
5785 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5786 	}
5787
5788 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5789 		/*
5790 		 * If we'd created the ipoib broadcast group and had
5791 		 * successfully joined it, leave it now.
5792 		 */
5793 		if (state->id_bgroup_created) {
5794 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5795 			jstate = IB_MC_JSTATE_FULL;
5796 			(void) ibt_leave_mcg(state->id_sgid, mgid,
5797 			    state->id_sgid, jstate);
5798 		}
5799 		ibt_free_mcg_info(state->id_mcinfo, 1);
5800
5801 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5802 	}
5803
5804 	return (DDI_SUCCESS);
5805 }
5806
5807 /*
5808  * This pair of routines is used to set/clear the condition that the
5809  * caller is about to do something that changes the id_mac_state.
5810  * If there's already someone doing either a start or a stop (possibly
5811  * due to the async handler detecting a pkey relocation event, a plumb
5812  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5813  * that's done.
5814  */
5815 static void
5816 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5817 {
5818 	mutex_enter(&state->id_macst_lock);
5819 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5820 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5821
5822 	state->id_mac_state |= flag;
5823 	mutex_exit(&state->id_macst_lock);
5824 }
5825
5826 static void
5827 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5828 {
5829 	mutex_enter(&state->id_macst_lock);
5830 	state->id_mac_state &= (~flag);
5831 	cv_signal(&state->id_macst_cv);
5832 	mutex_exit(&state->id_macst_lock);
5833 }
5834
5835 /*
5836  * GLDv3 entry point to start hardware.
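 * (Invoked by the mac layer when the interface is started, e.g. on
 * plumb; returns zero on success or an errno value on failure.)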
5837 */ 5838 /*ARGSUSED*/ 5839 static int 5840 ibd_m_start(void *arg) 5841 { 5842 ibd_state_t *state = arg; 5843 int ret; 5844 5845 if (state->id_type == IBD_PORT_DRIVER) 5846 return (EINVAL); 5847 5848 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5849 if (state->id_mac_state & IBD_DRV_IN_DELETION) { 5850 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5851 return (EIO); 5852 } 5853 5854 ret = ibd_start(state); 5855 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5856 return (ret); 5857 } 5858 5859 static int 5860 ibd_start(ibd_state_t *state) 5861 { 5862 int err; 5863 ibt_status_t ret; 5864 int late_hca_init = 0; 5865 5866 if (state->id_mac_state & IBD_DRV_STARTED) 5867 return (DDI_SUCCESS); 5868 5869 /* 5870 * We do not increment the running flag when calling ibd_start() as 5871 * a result of some event which moves the state away from late HCA 5872 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. 5873 */ 5874 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 5875 (atomic_inc_32_nv(&state->id_running) != 1)) { 5876 DPRINT(10, "ibd_start: id_running is non-zero"); 5877 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 5878 atomic_dec_32(&state->id_running); 5879 return (EINVAL); 5880 } 5881 5882 /* 5883 * Get port details; if we fail here, something bad happened. 5884 * Fail plumb. 5885 */ 5886 if ((err = ibd_get_port_details(state)) != 0) { 5887 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 5888 goto start_fail; 5889 } 5890 /* 5891 * If state->id_link_state is DOWN, it indicates that either the port 5892 * is down, or the pkey is not available. In both cases, resort to late 5893 * initialization. Register for subnet notices, and return success. 5894 */ 5895 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 5896 if (state->id_link_state == LINK_STATE_DOWN) { 5897 late_hca_init = 1; 5898 goto late_hca_init_return; 5899 } 5900 5901 /* 5902 * Find the IPoIB broadcast group 5903 */ 5904 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 5905 /* Resort to late initialization */ 5906 late_hca_init = 1; 5907 goto reg_snet_notices; 5908 } 5909 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 5910 5911 /* 5912 * Initialize per-interface caches and lists; if we fail here, 5913 * it is most likely due to a lack of resources 5914 */ 5915 if (ibd_acache_init(state) != DDI_SUCCESS) { 5916 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 5917 err = ENOMEM; 5918 goto start_fail; 5919 } 5920 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 5921 5922 /* 5923 * Allocate send and receive completion queues 5924 */ 5925 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 5926 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 5927 err = ENOMEM; 5928 goto start_fail; 5929 } 5930 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 5931 5932 /* 5933 * Setup a UD channel 5934 */ 5935 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 5936 err = ENOMEM; 5937 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 5938 goto start_fail; 5939 } 5940 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 5941 5942 /* 5943 * Allocate and initialize the tx buffer list 5944 */ 5945 if (ibd_init_txlist(state) != DDI_SUCCESS) { 5946 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 5947 err = ENOMEM; 5948 goto start_fail; 5949 } 5950 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 5951 5952 /* 5953 * Create the send cq handler here 5954 */ 5955 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 5956 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 5957 
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5958 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5959 		    "failed, ret=%d", ret);
5960 		err = EINVAL;
5961 		goto start_fail;
5962 	}
5963 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5964
5965 	/*
5966 	 * Allocate and initialize the rx buffer list
5967 	 */
5968 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5969 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5970 		err = ENOMEM;
5971 		goto start_fail;
5972 	}
5973 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5974
5975 	/*
5976 	 * Join IPoIB broadcast group
5977 	 */
5978 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5979 		DPRINT(10, "ibd_start: ibd_join_group() failed");
5980 		err = ENOTACTIVE;
5981 		goto start_fail;
5982 	}
5983 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5984
5985 	/*
5986 	 * When we did mac_register() in ibd_attach(), we didn't register
5987 	 * the real macaddr and we didn't have the true port mtu. Now that
5988 	 * we're almost ready, set the local mac address and broadcast
5989 	 * addresses and update gldv3 about the real values of these
5990 	 * parameters.
5991 	 */
5992 	if (state->id_enable_rc) {
5993 		ibd_h2n_mac(&state->id_macaddr,
5994 		    IBD_MAC_ADDR_RC + state->id_qpnum,
5995 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5996 		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
5997 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5998 	} else {
5999 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6000 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6001 	}
6002 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6003 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6004
6005 	if (!state->id_enable_rc) {
6006 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
6007 		    - IPOIB_HDRSIZE);
6008 	}
6009 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6010
6011 	/*
6012 	 * Setup the receive cq handler
6013 	 */
6014 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6015 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6016 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6017 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6018 		    "failed, ret=%d", ret);
6019 		err = EINVAL;
6020 		goto start_fail;
6021 	}
6022 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6023
6024 reg_snet_notices:
6025 	/*
6026 	 * In the case of the normal initialization sequence, set up the
6027 	 * subnet notices handler after we've initialized the acache/
6028 	 * mcache and started the async thread, both of which are required
6029 	 * for the trap handler to function properly.
6030 	 *
6031 	 * Now that the async thread has been started (and we've already done
6032 	 * a mac_register() during attach so mac_tx_update() can be called
6033 	 * if necessary without any problem), we can enable the trap handler
6034 	 * to queue requests to the async thread.
6035 	 *
6036 	 * In the case of late hca initialization, the subnet notices
6037 	 * handler will only handle the MCG created/deleted events, whose
6038 	 * handling consists of starting the interface. So the acache/
6039 	 * mcache initialization is not a necessity in such cases for
6040 	 * registering the subnet notices handler. Also, if we are in
6041 	 * ibd_start() as a result of, say, event handling after entering
6042 	 * the late hca initialization phase, there is no need to register again.
6043 */ 6044 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { 6045 ibt_register_subnet_notices(state->id_ibt_hdl, 6046 ibd_snet_notices_handler, state); 6047 mutex_enter(&state->id_trap_lock); 6048 state->id_trap_stop = B_FALSE; 6049 mutex_exit(&state->id_trap_lock); 6050 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 6051 } 6052 6053 late_hca_init_return: 6054 if (late_hca_init == 1) { 6055 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; 6056 /* 6057 * In case of late initialization, mark the link state as down, 6058 * immaterial of the actual link state as reported in the 6059 * port_info. 6060 */ 6061 state->id_link_state = LINK_STATE_DOWN; 6062 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6063 mac_link_update(state->id_mh, state->id_link_state); 6064 return (DDI_SUCCESS); 6065 } 6066 6067 if (state->id_enable_rc) { 6068 if (state->rc_enable_srq) { 6069 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 6070 if (ibd_rc_repost_srq_free_list(state) != 6071 IBT_SUCCESS) { 6072 err = ENOMEM; 6073 goto start_fail; 6074 } 6075 } else { 6076 /* Allocate SRQ resource */ 6077 if (ibd_rc_init_srq_list(state) != 6078 IBT_SUCCESS) { 6079 err = ENOMEM; 6080 goto start_fail; 6081 } 6082 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 6083 } 6084 } 6085 6086 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 6087 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 6088 "failed"); 6089 err = ENOMEM; 6090 goto start_fail; 6091 } 6092 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 6093 6094 /* RC: begin to listen only after everything is available */ 6095 if (ibd_rc_listen(state) != IBT_SUCCESS) { 6096 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 6097 err = EINVAL; 6098 goto start_fail; 6099 } 6100 state->id_mac_state |= IBD_DRV_RC_LISTEN; 6101 } 6102 6103 /* 6104 * Indicate link status to GLDv3 and higher layers. By default, 6105 * we assume we are in up state (which must have been true at 6106 * least at the time the broadcast mcg's were probed); if there 6107 * were any up/down transitions till the time we come here, the 6108 * async handler will have updated last known state, which we 6109 * use to tell GLDv3. The async handler will not send any 6110 * notifications to GLDv3 till we reach here in the initialization 6111 * sequence. 6112 */ 6113 mac_link_update(state->id_mh, state->id_link_state); 6114 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; 6115 state->id_mac_state |= IBD_DRV_STARTED; 6116 6117 /* Start timer after everything is ready */ 6118 if (state->id_enable_rc) { 6119 mutex_enter(&state->rc_timeout_lock); 6120 state->rc_timeout_start = B_TRUE; 6121 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state, 6122 SEC_TO_TICK(ibd_rc_conn_timeout)); 6123 mutex_exit(&state->rc_timeout_lock); 6124 state->id_mac_state |= IBD_DRV_RC_TIMEOUT; 6125 } 6126 6127 return (DDI_SUCCESS); 6128 6129 start_fail: 6130 /* 6131 * If we ran into a problem during ibd_start() and ran into 6132 * some other problem during undoing our partial work, we can't 6133 * do anything about it. Ignore any errors we might get from 6134 * ibd_undo_start() and just return the original error we got. 6135 */ 6136 (void) ibd_undo_start(state, LINK_STATE_DOWN); 6137 return (err); 6138 } 6139 6140 /* 6141 * GLDv3 entry point to stop hardware from receiving packets. 
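 * (The stop path simply reuses ibd_undo_start() with the current
 * link state, so a stop after a successful start unwinds exactly
 * the stages that the start had completed.)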
6142  */
6143 /*ARGSUSED*/
6144 static void
6145 ibd_m_stop(void *arg)
6146 {
6147 	ibd_state_t *state = (ibd_state_t *)arg;
6148
6149 	if (state->id_type == IBD_PORT_DRIVER)
6150 		return;
6151
6152 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6153
6154 	(void) ibd_undo_start(state, state->id_link_state);
6155
6156 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6157 }
6158
6159 /*
6160  * GLDv3 entry point to modify the device's mac address. We do not
6161  * allow address modifications.
6162  */
6163 static int
6164 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6165 {
6166 	ibd_state_t *state = arg;
6167
6168 	if (state->id_type == IBD_PORT_DRIVER)
6169 		return (EINVAL);
6170
6171 	/*
6172 	 * Don't bother even comparing the macaddr if we haven't
6173 	 * completed ibd_m_start().
6174 	 */
6175 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6176 		return (0);
6177
6178 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6179 		return (0);
6180 	else
6181 		return (EINVAL);
6182 }
6183
6184 /*
6185  * The blocking part of the IBA join/leave operations is done out
6186  * of here on the async thread.
6187  */
6188 static void
6189 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6190 {
6191 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6192 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6193
6194 	if (op == IBD_ASYNC_JOIN) {
6195 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6196 			ibd_print_warn(state, "Join multicast group failed :"
6197 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6198 		}
6199 	} else {
6200 		/*
6201 		 * Here, we must search for the proper mcg_info and
6202 		 * use that to leave the group.
6203 		 */
6204 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6205 	}
6206 }
6207
6208 /*
6209  * GLDv3 entry point for multicast enable/disable requests.
6210  * This function queues the operation to the async thread and
6211  * returns success for a valid multicast address.
6212  */
6213 static int
6214 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6215 {
6216 	ibd_state_t *state = (ibd_state_t *)arg;
6217 	ipoib_mac_t maddr, *mcast;
6218 	ib_gid_t mgid;
6219 	ibd_req_t *req;
6220
6221 	if (state->id_type == IBD_PORT_DRIVER)
6222 		return (EINVAL);
6223
6224 	/*
6225 	 * If we haven't completed ibd_m_start(), the async thread wouldn't
6226 	 * have been started and id_bcaddr wouldn't be set, so there's
6227 	 * no point in continuing.
6228 	 */
6229 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6230 		return (0);
6231
6232 	/*
6233 	 * The incoming multicast address might not be aligned properly
6234 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6235 	 * it to look like one though, to get the offsets of the mc gid,
6236 	 * since we know we are not going to dereference any values with
6237 	 * the ipoib_mac_t pointer.
6238 	 */
6239 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6240 	mcast = &maddr;
6241
6242 	/*
6243 	 * Check the validity of the MCG address. We could additionally
6244 	 * check that an enable/disable is not being issued on the
6245 	 * "broadcast" mcg, but since this operation is only invokable by
6246 	 * privileged programs anyway, we allow the flexibility to those
6247 	 * dlpi apps. Note that we do not validate the "scope" of the IBA mcg.
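	 *
	 * (For reference: the test below relies on the fact that every
	 * IPoIB multicast hardware address carries the well-known
	 * multicast QPN, IB_MC_QPN, in its leading four bytes; anything
	 * else cannot name an IBA mcg.)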
6248 	 */
6249 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6250 		return (EINVAL);
6251
6252 	/*
6253 	 * Fill in the multicast pkey and scope
6254 	 */
6255 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6256
6257 	/*
6258 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6259 	 * nothing (i.e. we retain the JOIN done in ibd_m_start()), to
6260 	 * mimic ethernet behavior. IPv4 specifically requires us to be
6261 	 * joined to broadcast groups at all times.
6262 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6263 	 * depends on this.
6264 	 */
6265 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6266 		return (0);
6267
6268 	ibd_n2h_gid(mcast, &mgid);
6269 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6270 	if (req == NULL)
6271 		return (ENOMEM);
6272
6273 	req->rq_gid = mgid;
6274
6275 	if (add) {
6276 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6277 		    mgid.gid_prefix, mgid.gid_guid);
6278 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6279 	} else {
6280 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
6281 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6282 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6283 	}
6284 	return (0);
6285 }
6286
6287 /*
6288  * The blocking part of the IBA promiscuous operations is done out
6289  * of here on the async thread. An invocation may be due either to
6290  * a dlpi request or to a port up/down event (the code path is the
6291  * same in both cases).
6292  */
6293 static void
6294 ibd_async_unsetprom(ibd_state_t *state)
6295 {
6296 	ibd_mce_t *mce = list_head(&state->id_mc_non);
6297 	ib_gid_t mgid;
6298
6299 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6300
6301 	while (mce != NULL) {
6302 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
6303 		mce = list_next(&state->id_mc_non, mce);
6304 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6305 	}
6306 	state->id_prom_op = IBD_OP_NOTSTARTED;
6307 }
6308
6309 /*
6310  * The blocking part of the IBA promiscuous operations is done out
6311  * of here on the async thread. As with ibd_async_unsetprom(), an
6312  * invocation may be due either to a dlpi request or to a port
6313  * up/down event.
6314  */
6315 static void
6316 ibd_async_setprom(ibd_state_t *state)
6317 {
6318 	ibt_mcg_attr_t mcg_attr;
6319 	ibt_mcg_info_t *mcg_info;
6320 	ib_gid_t mgid;
6321 	uint_t numg;
6322 	int i;
6323 	char ret = IBD_OP_COMPLETED;
6324
6325 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
6326
6327 	/*
6328 	 * Obtain all active MC groups on the IB fabric with
6329 	 * specified criteria (scope + Pkey + Qkey + mtu).
6330 	 */
6331 	bzero(&mcg_attr, sizeof (mcg_attr));
6332 	mcg_attr.mc_pkey = state->id_pkey;
6333 	mcg_attr.mc_scope = state->id_scope;
6334 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6335 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6336 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6337 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6338 	    IBT_SUCCESS) {
6339 		ibd_print_warn(state, "Could not get list of IBA multicast "
6340 		    "groups");
6341 		ret = IBD_OP_ERRORED;
6342 		goto done;
6343 	}
6344
6345 	/*
6346 	 * Iterate over the returned mcg's and join as NonMember
6347 	 * to the IP mcg's.
6348 	 */
6349 	for (i = 0; i < numg; i++) {
6350 		/*
6351 		 * Do a NonMember JOIN on the MC group.
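		 *
		 * A NonMember join is sufficient here: it asks the SM to
		 * forward the group's traffic to this port without making
		 * us a full member, which is all promiscuous receive needs.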
6352 */ 6353 mgid = mcg_info[i].mc_adds_vect.av_dgid; 6354 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 6355 ibd_print_warn(state, "IBA promiscuous mode missed " 6356 "multicast gid %016llx:%016llx", 6357 (u_longlong_t)mgid.gid_prefix, 6358 (u_longlong_t)mgid.gid_guid); 6359 } 6360 6361 ibt_free_mcg_info(mcg_info, numg); 6362 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 6363 done: 6364 state->id_prom_op = ret; 6365 } 6366 6367 /* 6368 * GLDv3 entry point for multicast promiscuous enable/disable requests. 6369 * GLDv3 assumes phys state receives more packets than multi state, 6370 * which is not true for IPoIB. Thus, treat the multi and phys 6371 * promiscuous states the same way to work with GLDv3's assumption. 6372 */ 6373 static int 6374 ibd_m_promisc(void *arg, boolean_t on) 6375 { 6376 ibd_state_t *state = (ibd_state_t *)arg; 6377 ibd_req_t *req; 6378 6379 if (state->id_type == IBD_PORT_DRIVER) 6380 return (EINVAL); 6381 6382 /* 6383 * Async thread wouldn't have been started if we haven't 6384 * passed ibd_m_start() 6385 */ 6386 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6387 return (0); 6388 6389 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6390 if (req == NULL) 6391 return (ENOMEM); 6392 if (on) { 6393 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 6394 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 6395 } else { 6396 DPRINT(1, "ibd_m_promisc : unset_promisc"); 6397 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 6398 } 6399 6400 return (0); 6401 } 6402 6403 /* 6404 * GLDv3 entry point for gathering statistics. 6405 */ 6406 static int 6407 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 6408 { 6409 ibd_state_t *state = (ibd_state_t *)arg; 6410 6411 switch (stat) { 6412 case MAC_STAT_IFSPEED: 6413 *val = state->id_link_speed; 6414 break; 6415 case MAC_STAT_MULTIRCV: 6416 *val = state->id_multi_rcv; 6417 break; 6418 case MAC_STAT_BRDCSTRCV: 6419 *val = state->id_brd_rcv; 6420 break; 6421 case MAC_STAT_MULTIXMT: 6422 *val = state->id_multi_xmt; 6423 break; 6424 case MAC_STAT_BRDCSTXMT: 6425 *val = state->id_brd_xmt; 6426 break; 6427 case MAC_STAT_RBYTES: 6428 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 6429 + state->rc_rcv_copy_byte; 6430 break; 6431 case MAC_STAT_IPACKETS: 6432 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 6433 + state->rc_rcv_copy_pkt; 6434 break; 6435 case MAC_STAT_OBYTES: 6436 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 6437 break; 6438 case MAC_STAT_OPACKETS: 6439 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 6440 state->rc_xmt_fragmented_pkt + 6441 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 6442 break; 6443 case MAC_STAT_OERRORS: 6444 *val = state->id_ah_error; /* failed AH translation */ 6445 break; 6446 case MAC_STAT_IERRORS: 6447 *val = 0; 6448 break; 6449 case MAC_STAT_NOXMTBUF: 6450 *val = state->id_tx_short + state->rc_swqe_short + 6451 state->rc_xmt_buf_short; 6452 break; 6453 case MAC_STAT_NORCVBUF: 6454 default: 6455 return (ENOTSUP); 6456 } 6457 6458 return (0); 6459 } 6460 6461 static void 6462 ibd_async_txsched(ibd_state_t *state) 6463 { 6464 ibd_resume_transmission(state); 6465 } 6466 6467 static void 6468 ibd_resume_transmission(ibd_state_t *state) 6469 { 6470 int flag; 6471 int met_thresh = 0; 6472 int thresh = 0; 6473 int ret = -1; 6474 6475 mutex_enter(&state->id_sched_lock); 6476 if (state->id_sched_needed & IBD_RSRC_SWQE) { 6477 mutex_enter(&state->id_tx_list.dl_mutex); 6478 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6479 met_thresh = 
state->id_tx_list.dl_cnt +
6480 		    state->id_tx_rel_list.dl_cnt;
6481 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6482 		mutex_exit(&state->id_tx_list.dl_mutex);
6483 		thresh = IBD_FREE_SWQES_THRESH;
6484 		flag = IBD_RSRC_SWQE;
6485 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6486 		ASSERT(state->id_lso != NULL);
6487 		mutex_enter(&state->id_lso_lock);
6488 		met_thresh = state->id_lso->bkt_nfree;
6489 		thresh = IBD_FREE_LSOS_THRESH;
6490 		mutex_exit(&state->id_lso_lock);
6491 		flag = IBD_RSRC_LSOBUF;
6492 		if (met_thresh > thresh)
6493 			state->id_sched_lso_cnt++;
6494 	}
6495 	if (met_thresh > thresh) {
6496 		state->id_sched_needed &= ~flag;
6497 		state->id_sched_cnt++;
6498 		ret = 0;
6499 	}
6500 	mutex_exit(&state->id_sched_lock);
6501
6502 	if (ret == 0)
6503 		mac_tx_update(state->id_mh);
6504 }
6505
6506 /*
6507  * Release a chain of send wqes back into the free list.
6508  */
6509 static void
6510 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6511 {
6512 	/*
6513 	 * Add back on Tx list for reuse.
6514 	 */
6515 	ASSERT(tail->swqe_next == NULL);
6516 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6517 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6518 	tail->swqe_next = state->id_tx_rel_list.dl_head;
6519 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6520 	state->id_tx_rel_list.dl_cnt += n;
6521 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
6522 }
6523
6524 /*
6525  * Acquire a send wqe from the free list. Returns a pointer to the
6526  * acquired swqe, or NULL if none is available.
6527  */
6528 static ibd_swqe_t *
6529 ibd_acquire_swqe(ibd_state_t *state)
6530 {
6531 	ibd_swqe_t *wqe;
6532
6533 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6534 	if (state->id_tx_rel_list.dl_head != NULL) {
6535 		/* transfer id_tx_rel_list to id_tx_list */
6536 		state->id_tx_list.dl_head =
6537 		    state->id_tx_rel_list.dl_head;
6538 		state->id_tx_list.dl_cnt =
6539 		    state->id_tx_rel_list.dl_cnt;
6540 		state->id_tx_list.dl_pending_sends = B_FALSE;
6541
6542 		/* clear id_tx_rel_list */
6543 		state->id_tx_rel_list.dl_head = NULL;
6544 		state->id_tx_rel_list.dl_cnt = 0;
6545 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6546
6547 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6548 		state->id_tx_list.dl_cnt -= 1;
6549 		state->id_tx_list.dl_head = wqe->swqe_next;
6550 	} else {	/* no free swqe */
6551 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6552 		state->id_tx_list.dl_pending_sends = B_TRUE;
6553 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6554 		state->id_tx_short++;
6555 		wqe = NULL;
6556 	}
6557 	return (wqe);
6558 }
6559
6560 static int
6561 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6562     ibt_ud_dest_hdl_t ud_dest)
6563 {
6564 	mblk_t *nmp;
6565 	int iph_len, tcph_len;
6566 	ibt_wr_lso_t *lso;
6567 	uintptr_t ip_start, tcp_start;
6568 	uint8_t *dst;
6569 	uint_t pending, mblen;
6570
6571 	/*
6572 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6573 	 * we need to adjust it here for lso.
6574 	 */
6575 	lso = &(node->w_swr.wr.ud_lso);
6576 	lso->lso_ud_dest = ud_dest;
6577 	lso->lso_mss = mss;
6578
6579 	/*
6580 	 * Calculate the LSO header size and set it in the UD LSO structure.
6581 	 * Note that the only assumption we make is that each of the IPoIB,
6582 	 * IP and TCP headers will be contained in a single mblk fragment;
6583 	 * together, the headers may span multiple mblk fragments.
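	 *
	 * Pictorially, the layouts we handle look like this (an
	 * illustrative sketch; each header is whole within one fragment,
	 * but the set of headers may straddle fragment boundaries):
	 *
	 *	mblk0: [ IPoIB hdr | IP hdr | TCP hdr | payload ... ]
	 * or
	 *	mblk0: [ IPoIB hdr ]   mblk1: [ IP hdr | TCP hdr | ... ]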
6584 */ 6585 nmp = mp; 6586 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 6587 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 6588 ip_start = (uintptr_t)nmp->b_cont->b_rptr 6589 + (ip_start - (uintptr_t)(nmp->b_wptr)); 6590 nmp = nmp->b_cont; 6591 6592 } 6593 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 6594 6595 tcp_start = ip_start + iph_len; 6596 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 6597 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 6598 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 6599 nmp = nmp->b_cont; 6600 } 6601 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 6602 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 6603 6604 /* 6605 * If the lso header fits entirely within a single mblk fragment, 6606 * we'll avoid an additional copy of the lso header here and just 6607 * pass the b_rptr of the mblk directly. 6608 * 6609 * If this isn't true, we'd have to allocate for it explicitly. 6610 */ 6611 if (lso->lso_hdr_sz <= MBLKL(mp)) { 6612 lso->lso_hdr = mp->b_rptr; 6613 } else { 6614 /* On work completion, remember to free this allocated hdr */ 6615 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 6616 if (lso->lso_hdr == NULL) { 6617 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 6618 "sz = %d", lso->lso_hdr_sz); 6619 lso->lso_hdr_sz = 0; 6620 lso->lso_mss = 0; 6621 return (-1); 6622 } 6623 } 6624 6625 /* 6626 * Copy in the lso header only if we need to 6627 */ 6628 if (lso->lso_hdr != mp->b_rptr) { 6629 dst = lso->lso_hdr; 6630 pending = lso->lso_hdr_sz; 6631 6632 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 6633 mblen = MBLKL(nmp); 6634 if (pending > mblen) { 6635 bcopy(nmp->b_rptr, dst, mblen); 6636 dst += mblen; 6637 pending -= mblen; 6638 } else { 6639 bcopy(nmp->b_rptr, dst, pending); 6640 break; 6641 } 6642 } 6643 } 6644 6645 return (0); 6646 } 6647 6648 static void 6649 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 6650 { 6651 ibt_wr_lso_t *lso; 6652 6653 if ((!node) || (!mp)) 6654 return; 6655 6656 /* 6657 * Free any header space that we might've allocated if we 6658 * did an LSO 6659 */ 6660 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 6661 lso = &(node->w_swr.wr.ud_lso); 6662 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 6663 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 6664 lso->lso_hdr = NULL; 6665 lso->lso_hdr_sz = 0; 6666 } 6667 } 6668 } 6669 6670 static void 6671 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 6672 { 6673 uint_t i; 6674 uint_t num_posted; 6675 uint_t n_wrs; 6676 ibt_status_t ibt_status; 6677 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 6678 ibd_swqe_t *tx_head, *elem; 6679 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 6680 6681 /* post the one request, then check for more */ 6682 ibt_status = ibt_post_send(state->id_chnl_hdl, 6683 &node->w_swr, 1, NULL); 6684 if (ibt_status != IBT_SUCCESS) { 6685 ibd_print_warn(state, "ibd_post_send: " 6686 "posting one wr failed: ret=%d", ibt_status); 6687 ibd_tx_cleanup(state, node); 6688 } 6689 6690 tx_head = NULL; 6691 for (;;) { 6692 if (tx_head == NULL) { 6693 mutex_enter(&state->id_txpost_lock); 6694 tx_head = state->id_tx_head; 6695 if (tx_head == NULL) { 6696 state->id_tx_busy = 0; 6697 mutex_exit(&state->id_txpost_lock); 6698 return; 6699 } 6700 state->id_tx_head = NULL; 6701 mutex_exit(&state->id_txpost_lock); 6702 } 6703 6704 /* 6705 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 6706 * at a time if possible, and keep posting them. 
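		 *
		 * Batching the posts this way amortizes the per-call
		 * overhead of ibt_post_send() (roughly, one doorbell ring
		 * per batch rather than one per packet) when the send
		 * path is backed up.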
6707 */ 6708 for (n_wrs = 0, elem = tx_head; 6709 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 6710 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 6711 nodes[n_wrs] = elem; 6712 wrs[n_wrs] = elem->w_swr; 6713 } 6714 tx_head = elem; 6715 6716 ASSERT(n_wrs != 0); 6717 6718 /* 6719 * If posting fails for some reason, we'll never receive 6720 * completion intimation, so we'll need to cleanup. But 6721 * we need to make sure we don't clean up nodes whose 6722 * wrs have been successfully posted. We assume that the 6723 * hca driver returns on the first failure to post and 6724 * therefore the first 'num_posted' entries don't need 6725 * cleanup here. 6726 */ 6727 num_posted = 0; 6728 ibt_status = ibt_post_send(state->id_chnl_hdl, 6729 wrs, n_wrs, &num_posted); 6730 if (ibt_status != IBT_SUCCESS) { 6731 ibd_print_warn(state, "ibd_post_send: " 6732 "posting multiple wrs failed: " 6733 "requested=%d, done=%d, ret=%d", 6734 n_wrs, num_posted, ibt_status); 6735 6736 for (i = num_posted; i < n_wrs; i++) 6737 ibd_tx_cleanup(state, nodes[i]); 6738 } 6739 } 6740 } 6741 6742 static int 6743 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 6744 uint_t lsohdr_sz) 6745 { 6746 ibt_wr_ds_t *sgl; 6747 ibt_status_t ibt_status; 6748 mblk_t *nmp; 6749 mblk_t *data_mp; 6750 uchar_t *bufp; 6751 size_t blksize; 6752 size_t skip; 6753 size_t avail; 6754 uint_t pktsize; 6755 uint_t frag_len; 6756 uint_t pending_hdr; 6757 int nmblks; 6758 int i; 6759 6760 /* 6761 * Let's skip ahead to the data if this is LSO 6762 */ 6763 data_mp = mp; 6764 pending_hdr = 0; 6765 if (lsohdr_sz) { 6766 pending_hdr = lsohdr_sz; 6767 for (nmp = mp; nmp; nmp = nmp->b_cont) { 6768 frag_len = nmp->b_wptr - nmp->b_rptr; 6769 if (frag_len > pending_hdr) 6770 break; 6771 pending_hdr -= frag_len; 6772 } 6773 data_mp = nmp; /* start of data past lso header */ 6774 ASSERT(data_mp != NULL); 6775 } 6776 6777 /* 6778 * Calculate the size of message data and number of msg blocks 6779 */ 6780 pktsize = 0; 6781 for (nmblks = 0, nmp = data_mp; nmp != NULL; 6782 nmp = nmp->b_cont, nmblks++) { 6783 pktsize += MBLKL(nmp); 6784 } 6785 pktsize -= pending_hdr; 6786 6787 /* 6788 * We only do ibt_map_mem_iov() if the pktsize is above the 6789 * "copy-threshold", and if the number of mp fragments is less than 6790 * the maximum acceptable. 
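	 *
	 * The resulting decision, in sketch form:
	 *
	 *	use ibt_map_mem_iov() iff
	 *	    the HCA's reserved lkey is available, and
	 *	    pktsize > id_ud_tx_copy_thresh, and
	 *	    nmblks < id_max_sqseg_hiwm;
	 *	otherwise copy the data into a pre-mapped buffer.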
6791 */ 6792 if ((state->id_hca_res_lkey_capab) && 6793 (pktsize > state->id_ud_tx_copy_thresh) && 6794 (nmblks < state->id_max_sqseg_hiwm)) { 6795 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6796 ibt_iov_attr_t iov_attr; 6797 6798 iov_attr.iov_as = NULL; 6799 iov_attr.iov = iov_arr; 6800 iov_attr.iov_buf = NULL; 6801 iov_attr.iov_list_len = nmblks; 6802 iov_attr.iov_wr_nds = state->id_max_sqseg; 6803 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 6804 iov_attr.iov_flags = IBT_IOV_SLEEP; 6805 6806 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 6807 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 6808 iov_arr[i].iov_len = MBLKL(nmp); 6809 if (i == 0) { 6810 iov_arr[i].iov_addr += pending_hdr; 6811 iov_arr[i].iov_len -= pending_hdr; 6812 } 6813 } 6814 6815 node->w_buftype = IBD_WQE_MAPPED; 6816 node->w_swr.wr_sgl = node->w_sgl; 6817 6818 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 6819 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 6820 if (ibt_status != IBT_SUCCESS) { 6821 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 6822 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 6823 goto ibd_copy_path; 6824 } 6825 6826 return (0); 6827 } 6828 6829 ibd_copy_path: 6830 if (pktsize <= state->id_tx_buf_sz) { 6831 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6832 node->w_swr.wr_nds = 1; 6833 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6834 node->w_buftype = IBD_WQE_TXBUF; 6835 6836 /* 6837 * Even though this is the copy path for transfers less than 6838 * id_tx_buf_sz, it could still be an LSO packet. If so, it 6839 * is possible the first data mblk fragment (data_mp) still 6840 * contains part of the LSO header that we need to skip. 6841 */ 6842 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6843 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 6844 blksize = MBLKL(nmp) - pending_hdr; 6845 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 6846 bufp += blksize; 6847 pending_hdr = 0; 6848 } 6849 6850 return (0); 6851 } 6852 6853 /* 6854 * Copy path for transfers greater than id_tx_buf_sz 6855 */ 6856 node->w_swr.wr_sgl = node->w_sgl; 6857 if (ibd_acquire_lsobufs(state, pktsize, 6858 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 6859 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 6860 return (-1); 6861 } 6862 node->w_buftype = IBD_WQE_LSOBUF; 6863 6864 /* 6865 * Copy the larger-than-id_tx_buf_sz packet into a set of 6866 * fixed-sized, pre-mapped LSO buffers. Note that we might 6867 * need to skip part of the LSO header in the first fragment 6868 * as before. 6869 */ 6870 nmp = data_mp; 6871 skip = pending_hdr; 6872 for (i = 0; i < node->w_swr.wr_nds; i++) { 6873 sgl = node->w_swr.wr_sgl + i; 6874 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 6875 avail = IBD_LSO_BUFSZ; 6876 while (nmp && avail) { 6877 blksize = MBLKL(nmp) - skip; 6878 if (blksize > avail) { 6879 bcopy(nmp->b_rptr + skip, bufp, avail); 6880 skip += avail; 6881 avail = 0; 6882 } else { 6883 bcopy(nmp->b_rptr + skip, bufp, blksize); 6884 skip = 0; 6885 avail -= blksize; 6886 bufp += blksize; 6887 nmp = nmp->b_cont; 6888 } 6889 } 6890 } 6891 6892 return (0); 6893 } 6894 6895 /* 6896 * Schedule a completion queue polling to reap the resource we're 6897 * short on. If we implement the change to reap tx completions 6898 * in a separate thread, we'll need to wake up that thread here. 
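 *
 * A nonzero return means we could not even queue the reap request
 * (no memory for the work entry); callers then typically drop the
 * packet, e.g. as the send path does:
 *
 *	if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
 *		dofree = B_TRUE;	(drop the packet)
 *		rc = B_TRUE;		(tell GLDv3 it was consumed)
 *	}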
6899 */ 6900 static int 6901 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 6902 { 6903 ibd_req_t *req; 6904 6905 mutex_enter(&state->id_sched_lock); 6906 state->id_sched_needed |= resource_type; 6907 mutex_exit(&state->id_sched_lock); 6908 6909 /* 6910 * If we are asked to queue a work entry, we need to do it 6911 */ 6912 if (q_flag) { 6913 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6914 if (req == NULL) 6915 return (-1); 6916 6917 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 6918 } 6919 6920 return (0); 6921 } 6922 6923 /* 6924 * The passed in packet has this format: 6925 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 6926 */ 6927 static boolean_t 6928 ibd_send(ibd_state_t *state, mblk_t *mp) 6929 { 6930 ibd_ace_t *ace; 6931 ibd_swqe_t *node; 6932 ipoib_mac_t *dest; 6933 ib_header_info_t *ipibp; 6934 ip6_t *ip6h; 6935 uint_t pktsize; 6936 uint32_t mss; 6937 uint32_t hckflags; 6938 uint32_t lsoflags = 0; 6939 uint_t lsohdr_sz = 0; 6940 int ret, len; 6941 boolean_t dofree = B_FALSE; 6942 boolean_t rc; 6943 /* if (rc_chan == NULL) send by UD; else send by RC; */ 6944 ibd_rc_chan_t *rc_chan; 6945 int nmblks; 6946 mblk_t *nmp; 6947 6948 /* 6949 * If we aren't done with the device initialization and start, 6950 * we shouldn't be here. 6951 */ 6952 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6953 return (B_FALSE); 6954 6955 /* 6956 * Obtain an address handle for the destination. 6957 */ 6958 ipibp = (ib_header_info_t *)mp->b_rptr; 6959 dest = (ipoib_mac_t *)&ipibp->ib_dst; 6960 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6961 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 6962 6963 rc_chan = NULL; 6964 ace = ibd_acache_lookup(state, dest, &ret, 1); 6965 if (state->id_enable_rc && (ace != NULL) && 6966 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 6967 if (ace->ac_chan == NULL) { 6968 state->rc_null_conn++; 6969 } else { 6970 if (ace->ac_chan->chan_state == 6971 IBD_RC_STATE_ACT_ESTAB) { 6972 rc_chan = ace->ac_chan; 6973 rc_chan->is_used = B_TRUE; 6974 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 6975 node = WQE_TO_SWQE( 6976 rc_chan->tx_wqe_list.dl_head); 6977 if (node != NULL) { 6978 rc_chan->tx_wqe_list.dl_cnt -= 1; 6979 rc_chan->tx_wqe_list.dl_head = 6980 node->swqe_next; 6981 } else { 6982 node = ibd_rc_acquire_swqes(rc_chan); 6983 } 6984 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 6985 6986 if (node == NULL) { 6987 state->rc_swqe_short++; 6988 mutex_enter(&state->id_sched_lock); 6989 state->id_sched_needed |= 6990 IBD_RSRC_RC_SWQE; 6991 mutex_exit(&state->id_sched_lock); 6992 ibd_dec_ref_ace(state, ace); 6993 return (B_FALSE); 6994 } 6995 } else { 6996 state->rc_no_estab_conn++; 6997 } 6998 } 6999 } 7000 7001 if (rc_chan == NULL) { 7002 mutex_enter(&state->id_tx_list.dl_mutex); 7003 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 7004 if (node != NULL) { 7005 state->id_tx_list.dl_cnt -= 1; 7006 state->id_tx_list.dl_head = node->swqe_next; 7007 } else { 7008 node = ibd_acquire_swqe(state); 7009 } 7010 mutex_exit(&state->id_tx_list.dl_mutex); 7011 if (node == NULL) { 7012 /* 7013 * If we don't have an swqe available, schedule a 7014 * transmit completion queue cleanup and hold off on 7015 * sending more packets until we have some free swqes 7016 */ 7017 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 7018 if (ace != NULL) { 7019 ibd_dec_ref_ace(state, ace); 7020 } 7021 return (B_FALSE); 7022 } 7023 7024 /* 7025 * If a poll cannot be scheduled, we have no choice but 7026 * to drop this packet 7027 */ 7028 
ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7029 			if (ace != NULL) {
7030 				ibd_dec_ref_ace(state, ace);
7031 			}
7032 			return (B_TRUE);
7033 		}
7034 	}
7035
7036 	/*
7037 	 * Initialize the commonly used fields in swqe to NULL to protect
7038 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
7039 	 * failure.
7040 	 */
7041 	node->swqe_im_mblk = NULL;
7042 	node->w_swr.wr_nds = 0;
7043 	node->w_swr.wr_sgl = NULL;
7044 	node->w_swr.wr_opcode = IBT_WRC_SEND;
7045
7046 	/*
7047 	 * Calculate the size of message data and number of msg blocks
7048 	 */
7049 	pktsize = 0;
7050 	for (nmblks = 0, nmp = mp; nmp != NULL;
7051 	    nmp = nmp->b_cont, nmblks++) {
7052 		pktsize += MBLKL(nmp);
7053 	}
7054
7055 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7056 		atomic_inc_64(&state->id_brd_xmt);
7057 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7058 		atomic_inc_64(&state->id_multi_xmt);
7059
7060 	if (ace != NULL) {
7061 		node->w_ahandle = ace;
7062 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7063 	} else {
7064 		DPRINT(5,
7065 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7066 		    ((ret == EFAULT) ? "failed" : "queued"),
7067 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7068 		    htonl(dest->ipoib_gidpref[1]),
7069 		    htonl(dest->ipoib_gidsuff[0]),
7070 		    htonl(dest->ipoib_gidsuff[1]));
7071 		state->rc_ace_not_found++;
7072 		node->w_ahandle = NULL;
7073
7074 		/*
7075 		 * Here, if ibd_acache_lookup() returns EFAULT, it means ibd
7076 		 * cannot find a path for the specific dest address. Such a
7077 		 * packet should be dropped, as should one for which we cannot
7078 		 * schedule a poll via the async thread. For the normal case,
7079 		 * ibd returns the packet to the upper layer and waits for the
7080 		 * AH to be created.
7081 		 *
7082 		 * Note that we always queue a work slot entry for the async
7083 		 * thread when we fail AH lookup (even in intr mode); this is
7084 		 * due to the convoluted way the code currently looks for AH.
7085 		 */
7086 		if (ret == EFAULT) {
7087 			dofree = B_TRUE;
7088 			rc = B_TRUE;
7089 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7090 			dofree = B_TRUE;
7091 			rc = B_TRUE;
7092 		} else {
7093 			dofree = B_FALSE;
7094 			rc = B_FALSE;
7095 		}
7096 		goto ibd_send_fail;
7097 	}
7098
7099 	/*
7100 	 * For ND6 packets, the padding sits in front of the source
7101 	 * lladdr; insert that padding here.
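	 * (The allocb(4, 0) below grows the message by the 4 pad bytes;
	 * IBD_PAD_NSNA() then rearranges the ND option contents so that
	 * the pad ends up in front of the lladdr, as described above.)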
7102 	 */
7103 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7104 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7105 			if (!pullupmsg(mp, IPV6_HDR_LEN +
7106 			    sizeof (ib_header_info_t))) {
7107 				DPRINT(10, "ibd_send: pullupmsg failure ");
7108 				dofree = B_TRUE;
7109 				rc = B_TRUE;
7110 				goto ibd_send_fail;
7111 			}
7112 			ipibp = (ib_header_info_t *)mp->b_rptr;
7113 		}
7114 		ip6h = (ip6_t *)((uchar_t *)ipibp +
7115 		    sizeof (ib_header_info_t));
7116 		len = ntohs(ip6h->ip6_plen);
7117 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7118 			mblk_t *pad;
7119
7120 			pad = allocb(4, 0);
			if (pad == NULL) {
				/* bail out if the pad buffer can't be allocated */
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
7121 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7122 			linkb(mp, pad);
7123 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
7124 			    IPV6_HDR_LEN + len + 4) {
7125 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7126 				    IPV6_HDR_LEN + len + 4)) {
7127 					DPRINT(10, "ibd_send: pullupmsg "
7128 					    "failure ");
7129 					dofree = B_TRUE;
7130 					rc = B_TRUE;
7131 					goto ibd_send_fail;
7132 				}
7133 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7134 				    sizeof (ib_header_info_t));
7135 			}
7136
7137 			/* LINTED: E_CONSTANT_CONDITION */
7138 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7139 		}
7140 	}
7141
7142 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7143 	mp->b_rptr += sizeof (ib_addrs_t);
7144 	pktsize -= sizeof (ib_addrs_t);
7145
7146 	if (rc_chan) {	/* send in RC mode */
7147 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7148 		ibt_iov_attr_t iov_attr;
7149 		uint_t i;
7150 		size_t blksize;
7151 		uchar_t *bufp;
7152 		ibd_rc_tx_largebuf_t *lbufp;
7153
7154 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
7155
7156 		/*
7157 		 * The upper layer does the Tx checksum, so we don't need
7158 		 * to do any checksum here.
7159 		 */
7160 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7161
7162 		/*
7163 		 * We only do ibt_map_mem_iov() if the pktsize is above
7164 		 * the "copy-threshold", and if the number of mp
7165 		 * fragments is less than the maximum acceptable.
7166 		 */
7167 		if (pktsize <= state->id_rc_tx_copy_thresh) {
7168 			atomic_inc_64(&state->rc_xmt_small_pkt);
7169 			/*
7170 			 * Only process unicast packets in Reliable Connected
7171 			 * mode.
7172 */ 7173 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 7174 node->w_swr.wr_nds = 1; 7175 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 7176 node->w_buftype = IBD_WQE_TXBUF; 7177 7178 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 7179 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7180 blksize = MBLKL(nmp); 7181 bcopy(nmp->b_rptr, bufp, blksize); 7182 bufp += blksize; 7183 } 7184 freemsg(mp); 7185 ASSERT(node->swqe_im_mblk == NULL); 7186 } else { 7187 if ((state->rc_enable_iov_map) && 7188 (nmblks < state->rc_max_sqseg_hiwm)) { 7189 7190 /* do ibt_map_mem_iov() */ 7191 iov_attr.iov_as = NULL; 7192 iov_attr.iov = iov_arr; 7193 iov_attr.iov_buf = NULL; 7194 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 7195 iov_attr.iov_lso_hdr_sz = 0; 7196 iov_attr.iov_flags = IBT_IOV_SLEEP; 7197 7198 i = 0; 7199 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7200 iov_arr[i].iov_len = MBLKL(nmp); 7201 if (iov_arr[i].iov_len != 0) { 7202 iov_arr[i].iov_addr = (caddr_t) 7203 (void *)nmp->b_rptr; 7204 i++; 7205 } 7206 } 7207 iov_attr.iov_list_len = i; 7208 node->w_swr.wr_sgl = node->w_sgl; 7209 7210 ret = ibt_map_mem_iov(state->id_hca_hdl, 7211 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 7212 &node->w_mi_hdl); 7213 if (ret != IBT_SUCCESS) { 7214 atomic_inc_64( 7215 &state->rc_xmt_map_fail_pkt); 7216 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 7217 ") failed, nmblks=%d, real_nmblks" 7218 "=%d, ret=0x%x", nmblks, i, ret); 7219 goto ibd_rc_large_copy; 7220 } 7221 7222 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 7223 node->w_buftype = IBD_WQE_MAPPED; 7224 node->swqe_im_mblk = mp; 7225 } else { 7226 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 7227 ibd_rc_large_copy: 7228 mutex_enter(&state->rc_tx_large_bufs_lock); 7229 if (state->rc_tx_largebuf_nfree == 0) { 7230 state->rc_xmt_buf_short++; 7231 mutex_exit 7232 (&state->rc_tx_large_bufs_lock); 7233 mutex_enter(&state->id_sched_lock); 7234 state->id_sched_needed |= 7235 IBD_RSRC_RC_TX_LARGEBUF; 7236 mutex_exit(&state->id_sched_lock); 7237 dofree = B_FALSE; 7238 rc = B_FALSE; 7239 /* 7240 * If we don't have Tx large bufs, 7241 * return failure. 
node->w_buftype
7242 				 * should not be IBD_WQE_RC_COPYBUF,
7243 				 * otherwise it will cause problems
7244 				 * in ibd_rc_tx_cleanup()
7245 				 */
7246 				node->w_buftype = IBD_WQE_TXBUF;
7247 				goto ibd_send_fail;
7248 			}
7249
7250 			lbufp = state->rc_tx_largebuf_free_head;
7251 			ASSERT(lbufp->lb_buf != NULL);
7252 			state->rc_tx_largebuf_free_head =
7253 			    lbufp->lb_next;
7254 			lbufp->lb_next = NULL;
7255 			/* Update the nfree count */
7256 			state->rc_tx_largebuf_nfree--;
7257 			mutex_exit(&state->rc_tx_large_bufs_lock);
7258 			bufp = lbufp->lb_buf;
7259 			node->w_sgl[0].ds_va =
7260 			    (ib_vaddr_t)(uintptr_t)bufp;
7261 			node->w_sgl[0].ds_key =
7262 			    state->rc_tx_mr_desc.md_lkey;
7263 			node->w_sgl[0].ds_len = pktsize;
7264 			node->w_swr.wr_sgl = node->w_sgl;
7265 			node->w_swr.wr_nds = 1;
7266 			node->w_buftype = IBD_WQE_RC_COPYBUF;
7267 			node->w_rc_tx_largebuf = lbufp;
7268
7269 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7270 				blksize = MBLKL(nmp);
7271 				if (blksize != 0) {
7272 					bcopy(nmp->b_rptr, bufp,
7273 					    blksize);
7274 					bufp += blksize;
7275 				}
7276 			}
7277 			freemsg(mp);
7278 			ASSERT(node->swqe_im_mblk == NULL);
7279 		}
7280 	}
7281
7282 		node->swqe_next = NULL;
7283 		mutex_enter(&rc_chan->tx_post_lock);
7284 		if (rc_chan->tx_busy) {
7285 			if (rc_chan->tx_head) {
7286 				rc_chan->tx_tail->swqe_next =
7287 				    SWQE_TO_WQE(node);
7288 			} else {
7289 				rc_chan->tx_head = node;
7290 			}
7291 			rc_chan->tx_tail = node;
7292 			mutex_exit(&rc_chan->tx_post_lock);
7293 		} else {
7294 			rc_chan->tx_busy = 1;
7295 			mutex_exit(&rc_chan->tx_post_lock);
7296 			ibd_rc_post_send(rc_chan, node);
7297 		}
7298
7299 		return (B_TRUE);
7300 	}	/* send by RC */
7301
7302 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7303 		/*
7304 		 * The packet is too long. The packet size from GLD should
7305 		 * be <= state->id_mtu + sizeof (ib_addrs_t).
7306 		 */
7307 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7308 			ibd_req_t *req;
7309
7310 			mutex_enter(&ace->tx_too_big_mutex);
7311 			if (ace->tx_too_big_ongoing) {
7312 				mutex_exit(&ace->tx_too_big_mutex);
7313 				state->rc_xmt_reenter_too_long_pkt++;
7314 				dofree = B_TRUE;
7315 			} else {
7316 				ace->tx_too_big_ongoing = B_TRUE;
7317 				mutex_exit(&ace->tx_too_big_mutex);
7318 				state->rc_xmt_icmp_too_long_pkt++;
7319
7320 				req = kmem_cache_alloc(state->id_req_kmc,
7321 				    KM_NOSLEEP);
7322 				if (req == NULL) {
7323 					ibd_print_warn(state, "ibd_send: alloc "
7324 					    "ibd_req_t fail");
7325 					/* Drop it. */
7326 					dofree = B_TRUE;
7327 				} else {
7328 					req->rq_ptr = mp;
7329 					req->rq_ptr2 = ace;
7330 					ibd_queue_work_slot(state, req,
7331 					    IBD_ASYNC_RC_TOO_BIG);
7332 					dofree = B_FALSE;
7333 				}
7334 			}
7335 		} else {
7336 			ibd_print_warn(state, "Reliable Connected mode is on. "
7337 			    "Multicast packet length %d > %d is too long to "
7338 			    "send, drop it",
7339 			    pktsize, state->id_mtu);
7340 			state->rc_xmt_drop_too_long_pkt++;
7341 			/* Drop it. */
7342 			dofree = B_TRUE;
7343 		}
7344 		rc = B_TRUE;
7345 		goto ibd_send_fail;
7346 	}
7347
7348 	atomic_add_64(&state->id_xmt_bytes, pktsize);
7349 	atomic_inc_64(&state->id_xmt_pkt);
7350
7351 	/*
7352 	 * Do the LSO and checksum related work here. For an LSO send,
7353 	 * adjust the ud destination, the opcode and the LSO header
7354 	 * information in the work request.
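	 *
	 * mac_lso_get() below tells us whether the stack marked this
	 * message for LSO (and with what MSS); mac_hcksum_get() tells us
	 * whether hardware checksumming was requested. wr_opcode and
	 * wr_flags are set accordingly.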
7355 */ 7356 mac_lso_get(mp, &mss, &lsoflags); 7357 if ((lsoflags & HW_LSO) != HW_LSO) { 7358 node->w_swr.wr_opcode = IBT_WRC_SEND; 7359 lsohdr_sz = 0; 7360 } else { 7361 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 7362 /* 7363 * The routine can only fail if there's no memory; we 7364 * can only drop the packet if this happens 7365 */ 7366 ibd_print_warn(state, 7367 "ibd_send: no memory, lso posting failed"); 7368 dofree = B_TRUE; 7369 rc = B_TRUE; 7370 goto ibd_send_fail; 7371 } 7372 7373 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 7374 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 7375 } 7376 7377 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 7378 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 7379 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 7380 else 7381 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 7382 7383 /* 7384 * Prepare the sgl for posting; the routine can only fail if there's 7385 * no lso buf available for posting. If this is the case, we should 7386 * probably resched for lso bufs to become available and then try again. 7387 */ 7388 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 7389 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 7390 dofree = B_TRUE; 7391 rc = B_TRUE; 7392 } else { 7393 dofree = B_FALSE; 7394 rc = B_FALSE; 7395 } 7396 goto ibd_send_fail; 7397 } 7398 node->swqe_im_mblk = mp; 7399 7400 /* 7401 * Queue the wqe to hardware; since we can now simply queue a 7402 * post instead of doing it serially, we cannot assume anything 7403 * about the 'node' after ibd_post_send() returns. 7404 */ 7405 node->swqe_next = NULL; 7406 7407 mutex_enter(&state->id_txpost_lock); 7408 if (state->id_tx_busy) { 7409 if (state->id_tx_head) { 7410 state->id_tx_tail->swqe_next = 7411 SWQE_TO_WQE(node); 7412 } else { 7413 state->id_tx_head = node; 7414 } 7415 state->id_tx_tail = node; 7416 mutex_exit(&state->id_txpost_lock); 7417 } else { 7418 state->id_tx_busy = 1; 7419 mutex_exit(&state->id_txpost_lock); 7420 ibd_post_send(state, node); 7421 } 7422 7423 return (B_TRUE); 7424 7425 ibd_send_fail: 7426 if (node && mp) 7427 ibd_free_lsohdr(node, mp); 7428 7429 if (dofree) 7430 freemsg(mp); 7431 7432 if (node != NULL) { 7433 if (rc_chan) { 7434 ibd_rc_tx_cleanup(node); 7435 } else { 7436 ibd_tx_cleanup(state, node); 7437 } 7438 } 7439 7440 return (rc); 7441 } 7442 7443 /* 7444 * GLDv3 entry point for transmitting datagram. 7445 */ 7446 static mblk_t * 7447 ibd_m_tx(void *arg, mblk_t *mp) 7448 { 7449 ibd_state_t *state = (ibd_state_t *)arg; 7450 mblk_t *next; 7451 7452 if (state->id_type == IBD_PORT_DRIVER) { 7453 freemsgchain(mp); 7454 return (NULL); 7455 } 7456 7457 if ((state->id_link_state != LINK_STATE_UP) || 7458 !(state->id_mac_state & IBD_DRV_STARTED)) { 7459 freemsgchain(mp); 7460 mp = NULL; 7461 } 7462 7463 while (mp != NULL) { 7464 next = mp->b_next; 7465 mp->b_next = NULL; 7466 if (ibd_send(state, mp) == B_FALSE) { 7467 /* Send fail */ 7468 mp->b_next = next; 7469 break; 7470 } 7471 mp = next; 7472 } 7473 7474 return (mp); 7475 } 7476 7477 /* 7478 * this handles Tx and Rx completions. With separate CQs, this handles 7479 * only Rx completions. 
7476 7477 /* 7478 * This handles Tx and Rx completions. With separate CQs, this handles 7479 * only Rx completions. 7480 */ 7481 static uint_t 7482 ibd_intr(caddr_t arg) 7483 { 7484 ibd_state_t *state = (ibd_state_t *)arg; 7485 7486 ibd_poll_rcq(state, state->id_rcq_hdl); 7487 7488 return (DDI_INTR_CLAIMED); 7489 } 7490 7491 /* 7492 * Poll and fully drain the send cq 7493 */ 7494 static void 7495 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7496 { 7497 ibt_wc_t *wcs = state->id_txwcs; 7498 uint_t numwcs = state->id_txwcs_size; 7499 ibd_wqe_t *wqe; 7500 ibd_swqe_t *head, *tail; 7501 ibt_wc_t *wc; 7502 uint_t num_polled; 7503 int i; 7504 7505 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7506 head = tail = NULL; 7507 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7508 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 7509 if (wc->wc_status != IBT_WC_SUCCESS) { 7510 /* 7511 * Channel being torn down. 7512 */ 7513 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7514 DPRINT(5, "ibd_drain_scq: flush error"); 7515 DPRINT(10, "ibd_drain_scq: Bad " 7516 "status %d", wc->wc_status); 7517 } else { 7518 DPRINT(10, "ibd_drain_scq: " 7519 "unexpected wc_status %d", 7520 wc->wc_status); 7521 } 7522 /* 7523 * Fallthrough to invoke the Tx handler to 7524 * release held resources, e.g., AH refcount. 7525 */ 7526 } 7527 /* 7528 * Add this swqe to the list to be cleaned up. 7529 */ 7530 if (head) 7531 tail->swqe_next = wqe; 7532 else 7533 head = WQE_TO_SWQE(wqe); 7534 tail = WQE_TO_SWQE(wqe); 7535 } 7536 tail->swqe_next = NULL; 7537 ibd_tx_cleanup_list(state, head, tail); 7538 7539 /* 7540 * Resume any blocked transmissions if possible 7541 */ 7542 ibd_resume_transmission(state); 7543 } 7544 } 7545 7546 /* 7547 * Poll and fully drain the receive cq 7548 */ 7549 static void 7550 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7551 { 7552 ibt_wc_t *wcs = state->id_rxwcs; 7553 uint_t numwcs = state->id_rxwcs_size; 7554 ibd_rwqe_t *rwqe; 7555 ibt_wc_t *wc; 7556 uint_t num_polled; 7557 int i; 7558 mblk_t *head, *tail, *mp; 7559 7560 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7561 head = tail = NULL; 7562 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7563 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 7564 if (wc->wc_status != IBT_WC_SUCCESS) { 7565 /* 7566 * Channel being torn down. 7567 */ 7568 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7569 DPRINT(5, "ibd_drain_rcq: " 7570 "expected flushed rwqe"); 7571 } else { 7572 DPRINT(5, "ibd_drain_rcq: " 7573 "unexpected wc_status %d", 7574 wc->wc_status); 7575 } 7576 atomic_inc_32( 7577 &state->id_rx_list.dl_bufs_outstanding); 7578 freemsg(rwqe->rwqe_im_mblk); 7579 continue; 7580 } 7581 mp = ibd_process_rx(state, rwqe, wc); 7582 if (mp == NULL) 7583 continue; 7584 7585 /* 7586 * Add this mp to the list to send to the nw layer. 7587 */ 7588 if (head) 7589 tail->b_next = mp; 7590 else 7591 head = mp; 7592 tail = mp; 7593 } 7594 if (head) 7595 mac_rx(state->id_mh, state->id_rh, head); 7596 7597 /* 7598 * Account for the #rwqes polled. 7599 * Post more here if the list is less than one-fourth full. 7600 */ 7601 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 7602 (state->id_ud_num_rwqe / 4)) 7603 ibd_post_recv_intr(state); 7604 } 7605 }
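/*
 * Editor's note: ibd_drain_rcq() above batches received mblks on their
 * b_next links and makes a single mac_rx() upcall per poll, rather than
 * one upcall per packet. A minimal sketch of that chaining pattern
 * (the example_* name is hypothetical):
 */
#ifdef IBD_EXAMPLE_SKETCH
static void
example_rx_batch(ibd_state_t *state, mblk_t **mps, uint_t n)
{
	mblk_t *head = NULL, *tail = NULL;
	uint_t i;

	for (i = 0; i < n; i++) {
		mblk_t *mp = mps[i];

		if (mp == NULL)		/* dropped/errored completion */
			continue;
		if (head == NULL)
			head = mp;
		else
			tail->b_next = mp;
		tail = mp;
	}
	if (head != NULL)
		mac_rx(state->id_mh, state->id_rh, head);
}
#endif	/* IBD_EXAMPLE_SKETCH */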
7606 7607 /* 7608 * Common code for interrupt handling as well as for polling 7609 * for all completed wqe's while detaching. 7610 */ 7611 static void 7612 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7613 { 7614 int flag, redo_flag; 7615 int redo = 1; 7616 7617 flag = IBD_CQ_POLLING; 7618 redo_flag = IBD_REDO_CQ_POLLING; 7619 7620 mutex_enter(&state->id_scq_poll_lock); 7621 if (state->id_scq_poll_busy & flag) { 7622 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 7623 state->id_scq_poll_busy |= redo_flag; 7624 mutex_exit(&state->id_scq_poll_lock); 7625 return; 7626 } 7627 state->id_scq_poll_busy |= flag; 7628 mutex_exit(&state->id_scq_poll_lock); 7629 7630 /* 7631 * In some cases (e.g. detaching), this code can be invoked on 7632 * any cpu after disabling cq notification (thus no concurrency 7633 * exists). Apart from that, the following applies normally: 7634 * Transmit completion handling could be from any cpu if 7635 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 7636 * is interrupt driven. 7637 */ 7638 7639 /* 7640 * Poll and drain the CQ 7641 */ 7642 ibd_drain_scq(state, cq_hdl); 7643 7644 /* 7645 * Enable CQ notifications and redrain the cq to catch any 7646 * completions we might have missed after the ibd_drain_scq() 7647 * above and before the ibt_enable_cq_notify() that follows. 7648 * Finally, service any new requests to poll the cq that 7649 * could've come in after the ibt_enable_cq_notify(). 7650 */ 7651 do { 7652 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 7653 IBT_SUCCESS) { 7654 DPRINT(10, "ibd_poll_scq: ibt_enable_cq_notify() failed"); 7655 } 7656 7657 ibd_drain_scq(state, cq_hdl); 7658 7659 mutex_enter(&state->id_scq_poll_lock); 7660 if (state->id_scq_poll_busy & redo_flag) 7661 state->id_scq_poll_busy &= ~redo_flag; 7662 else { 7663 state->id_scq_poll_busy &= ~flag; 7664 redo = 0; 7665 } 7666 mutex_exit(&state->id_scq_poll_lock); 7667 7668 } while (redo); 7669 } 7670 7671 /* 7672 * Common code for interrupt handling as well as for polling 7673 * for all completed wqe's while detaching. 7674 */ 7675 static void 7676 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 7677 { 7678 int flag, redo_flag; 7679 int redo = 1; 7680 7681 flag = IBD_CQ_POLLING; 7682 redo_flag = IBD_REDO_CQ_POLLING; 7683 7684 mutex_enter(&state->id_rcq_poll_lock); 7685 if (state->id_rcq_poll_busy & flag) { 7686 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 7687 state->id_rcq_poll_busy |= redo_flag; 7688 mutex_exit(&state->id_rcq_poll_lock); 7689 return; 7690 } 7691 state->id_rcq_poll_busy |= flag; 7692 mutex_exit(&state->id_rcq_poll_lock); 7693 7694 /* 7695 * Poll and drain the CQ 7696 */ 7697 ibd_drain_rcq(state, rcq); 7698 7699 /* 7700 * Enable CQ notifications and redrain the cq to catch any 7701 * completions we might have missed after the ibd_drain_rcq() 7702 * above and before the ibt_enable_cq_notify() that follows. 7703 * Finally, service any new requests to poll the cq that 7704 * could've come in after the ibt_enable_cq_notify(). 7705 */ 7706 do { 7707 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 7708 IBT_SUCCESS) { 7709 DPRINT(10, "ibd_poll_rcq: ibt_enable_cq_notify() failed"); 7710 } 7711 7712 ibd_drain_rcq(state, rcq); 7713 7714 mutex_enter(&state->id_rcq_poll_lock); 7715 if (state->id_rcq_poll_busy & redo_flag) 7716 state->id_rcq_poll_busy &= ~redo_flag; 7717 else { 7718 state->id_rcq_poll_busy &= ~flag; 7719 redo = 0; 7720 } 7721 mutex_exit(&state->id_rcq_poll_lock); 7722 7723 } while (redo); 7724 }
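/*
 * Editor's note: ibd_poll_scq()/ibd_poll_rcq() above serialize CQ
 * polling with a pair of flag bits: the first thread in becomes the
 * poller, and any thread that arrives while a poll is in progress only
 * sets a "redo" bit that makes the owner loop around once more. A
 * minimal standalone sketch of the same protocol (the example_* names
 * and the IBD_EXAMPLE_SKETCH guard are hypothetical; the mutex is
 * assumed to be mutex_init()'ed elsewhere):
 */
#ifdef IBD_EXAMPLE_SKETCH
static kmutex_t example_poll_lock;
static int example_poll_busy;	/* IBD_CQ_POLLING | IBD_REDO_CQ_POLLING */

static void
example_poll(void (*drain)(void))
{
	int redo = 1;

	mutex_enter(&example_poll_lock);
	if (example_poll_busy & IBD_CQ_POLLING) {
		/* Somebody is already draining; ask for another pass. */
		example_poll_busy |= IBD_REDO_CQ_POLLING;
		mutex_exit(&example_poll_lock);
		return;
	}
	example_poll_busy |= IBD_CQ_POLLING;
	mutex_exit(&example_poll_lock);

	do {
		drain();

		mutex_enter(&example_poll_lock);
		if (example_poll_busy & IBD_REDO_CQ_POLLING) {
			example_poll_busy &= ~IBD_REDO_CQ_POLLING;
		} else {
			example_poll_busy &= ~IBD_CQ_POLLING;
			redo = 0;
		}
		mutex_exit(&example_poll_lock);
	} while (redo);
}
#endif	/* IBD_EXAMPLE_SKETCH */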
7725 7726 /* 7727 * Unmap the memory area associated with a given swqe. 7728 */ 7729 void 7730 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 7731 { 7732 ibt_status_t stat; 7733 7734 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 7735 7736 if (swqe->w_mi_hdl) { 7737 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 7738 swqe->w_mi_hdl)) != IBT_SUCCESS) { 7739 DPRINT(10, 7740 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 7741 } 7742 swqe->w_mi_hdl = NULL; 7743 } 7744 swqe->w_swr.wr_nds = 0; 7745 } 7746 7747 void 7748 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 7749 { 7750 /* 7751 * The recycling logic can be eliminated from here 7752 * and put into the async thread if we create another 7753 * list to hold ACE's for unjoined mcg's. 7754 */ 7755 if (DEC_REF_DO_CYCLE(ace)) { 7756 ibd_mce_t *mce; 7757 7758 /* 7759 * Check with the lock taken: we decremented 7760 * reference count without the lock, and some 7761 * transmitter might already have bumped the 7762 * reference count (possible in case of multicast 7763 * disable when we leave the AH on the active 7764 * list). If not still 0, get out, leaving the 7765 * recycle bit intact. 7766 * 7767 * Atomically transition the AH from active 7768 * to free list, and queue a work request to 7769 * leave the group and destroy the mce. No 7770 * transmitter can be looking at the AH or 7771 * the MCE in between, since we have the 7772 * ac_mutex lock. In the SendOnly reap case, 7773 * it is not necessary to hold the ac_mutex 7774 * and recheck the ref count (since the AH was 7775 * taken off the active list), we just do it 7776 * to have uniform processing with the Full 7777 * reap case. 7778 */ 7779 mutex_enter(&state->id_ac_mutex); 7780 mce = ace->ac_mce; 7781 if (GET_REF_CYCLE(ace) == 0) { 7782 CLEAR_REFCYCLE(ace); 7783 /* 7784 * Identify the case of fullmember reap as 7785 * opposed to mcg trap reap. Also, port up 7786 * might set ac_mce to NULL to indicate Tx 7787 * cleanup should do no more than put the 7788 * AH in the free list (see ibd_async_link). 7789 */ 7790 if (mce != NULL) { 7791 ace->ac_mce = NULL; 7792 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 7793 /* 7794 * mc_req was initialized at mce 7795 * creation time. 7796 */ 7797 ibd_queue_work_slot(state, 7798 &mce->mc_req, IBD_ASYNC_REAP); 7799 } 7800 IBD_ACACHE_INSERT_FREE(state, ace); 7801 } 7802 mutex_exit(&state->id_ac_mutex); 7803 } 7804 } 7805 7806 /* 7807 * Common code that deals with clean ups after a successful or 7808 * erroneous transmission attempt. 7809 */ 7810 static void 7811 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 7812 { 7813 ibd_ace_t *ace = swqe->w_ahandle; 7814 7815 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 7816 7817 /* 7818 * If this was a dynamic mapping in ibd_send(), we need to 7819 * unmap here. If this was an lso buffer we'd used for sending, 7820 * we need to release the lso buf to the pool, since the resource 7821 * is scarce. However, if this was simply a normal send using 7822 * the copybuf (present in each swqe), we don't need to release it. 7823 */ 7824 if (swqe->swqe_im_mblk != NULL) { 7825 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7826 ibd_unmap_mem(state, swqe); 7827 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7828 ibd_release_lsobufs(state, 7829 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7830 } 7831 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7832 freemsg(swqe->swqe_im_mblk); 7833 swqe->swqe_im_mblk = NULL; 7834 } 7835 7836 /* 7837 * Drop the reference count on the AH; it can be reused 7838 * now for a different destination if there are no more 7839 * posted sends that will use it.
This can be eliminated 7840 * if we can always associate each Tx buffer with an AH. 7841 * The ace can be null if we are cleaning up from the 7842 * ibd_send() error path. 7843 */ 7844 if (ace != NULL) { 7845 ibd_dec_ref_ace(state, ace); 7846 } 7847 7848 /* 7849 * Release the send wqe for reuse. 7850 */ 7851 swqe->swqe_next = NULL; 7852 ibd_release_swqe(state, swqe, swqe, 1); 7853 } 7854 7855 static void 7856 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 7857 { 7858 ibd_ace_t *ace; 7859 ibd_swqe_t *swqe; 7860 int n = 0; 7861 7862 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 7863 7864 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 7865 7866 /* 7867 * If this was a dynamic mapping in ibd_send(), we need to 7868 * unmap here. If this was an lso buffer we'd used for sending, 7869 * we need to release the lso buf to the pool, since the 7870 * resource is scarce. However, if this was simply a normal 7871 * send using the copybuf (present in each swqe), we don't need 7872 * to release it. 7873 */ 7874 if (swqe->swqe_im_mblk != NULL) { 7875 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7876 ibd_unmap_mem(state, swqe); 7877 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7878 ibd_release_lsobufs(state, 7879 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7880 } 7881 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7882 freemsg(swqe->swqe_im_mblk); 7883 swqe->swqe_im_mblk = NULL; 7884 } 7885 7886 /* 7887 * Drop the reference count on the AH; it can be reused 7888 * now for a different destination if there are no more 7889 * posted sends that will use it. This can be eliminated 7890 * if we can always associate each Tx buffer with an AH. 7891 * The ace can be null if we are cleaning up from the 7892 * ibd_send() error path. 7893 */ 7894 ace = swqe->w_ahandle; 7895 if (ace != NULL) { 7896 ibd_dec_ref_ace(state, ace); 7897 } 7898 n++; 7899 } 7900 7901 /* 7902 * Release the send wqes for reuse. 7903 */ 7904 ibd_release_swqe(state, head, tail, n); 7905 } 7906 7907 /* 7908 * Processing to be done after receipt of a packet; hand off to GLD 7909 * in the format expected by GLD. The received packet has this 7910 * format: 2b sap :: 00 :: data. 7911 */ 7912 static mblk_t * 7913 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 7914 { 7915 ib_header_info_t *phdr; 7916 mblk_t *mp; 7917 ipoib_hdr_t *ipibp; 7918 ipha_t *iphap; 7919 ip6_t *ip6h; 7920 int len; 7921 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 7922 uint32_t bufs; 7923 7924 /* 7925 * Track number handed to upper layer that need to be returned. 7926 */ 7927 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 7928 7929 /* Never run out of rwqes, use allocb when running low */ 7930 if (bufs >= state->id_rx_bufs_outstanding_limit) { 7931 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 7932 atomic_inc_32(&state->id_rx_allocb); 7933 mp = allocb(pkt_len, BPRI_HI); 7934 if (mp) { 7935 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 7936 ibd_post_recv(state, rwqe); 7937 } else { /* no memory */ 7938 atomic_inc_32(&state->id_rx_allocb_failed); 7939 ibd_post_recv(state, rwqe); 7940 return (NULL); 7941 } 7942 } else { 7943 mp = rwqe->rwqe_im_mblk; 7944 } 7945 7946 7947 /* 7948 * Adjust write pointer depending on how much data came in. 7949 */ 7950 mp->b_wptr = mp->b_rptr + pkt_len; 7951 7952 /* 7953 * Make sure this is NULL or we're in trouble. 
7954 */ 7955 if (mp->b_next != NULL) { 7956 ibd_print_warn(state, 7957 "ibd_process_rx: got duplicate mp from rcq?"); 7958 mp->b_next = NULL; 7959 } 7960 7961 /* 7962 * The IB link will deliver one of the IB link layer headers, 7963 * called the Global Routing Header (GRH). 7964 * The ibd driver uses the information in the GRH to build the 7965 * header_info structure and passes it up to GLDv3 with the 7966 * datagram. 7967 * If the GRH is not valid, indicate this to GLDv3 by setting 7968 * the VerTcFlow field to 0. 7969 */ 7970 phdr = (ib_header_info_t *)mp->b_rptr; 7971 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 7972 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 7973 7974 /* If it is a loopback packet, just drop it. */ 7975 if (state->id_enable_rc) { 7976 if (bcmp(&phdr->ib_grh.ipoib_sqpn, 7977 &state->rc_macaddr_loopback, 7978 IPOIB_ADDRL) == 0) { 7979 freemsg(mp); 7980 return (NULL); 7981 } 7982 } else { 7983 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 7984 IPOIB_ADDRL) == 0) { 7985 freemsg(mp); 7986 return (NULL); 7987 } 7988 } 7989 7990 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 7991 sizeof (ipoib_mac_t)); 7992 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 7993 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 7994 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 7995 } else { 7996 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 7997 } 7998 } else { 7999 /* 8000 * It cannot be an IBA multicast packet. It must have been 8001 * unicast to us. Just copy the interface address to dst. 8002 */ 8003 phdr->ib_grh.ipoib_vertcflow = 0; 8004 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 8005 sizeof (ipoib_mac_t)); 8006 } 8007 8008 /* 8009 * For ND6 packets, padding is at the front of the source/target 8010 * lladdr. However, the inet6 layer is not aware of it; hence remove 8011 * the padding from such packets. 8012 */ 8013 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 8014 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 8015 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 8016 len = ntohs(ip6h->ip6_plen); 8017 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 8018 /* LINTED: E_CONSTANT_CONDITION */ 8019 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 8020 } 8021 } 8022 8023 /* 8024 * Update statistics 8025 */ 8026 atomic_add_64(&state->id_rcv_bytes, pkt_len); 8027 atomic_inc_64(&state->id_rcv_pkt); 8028 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 8029 atomic_inc_64(&state->id_brd_rcv); 8030 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 8031 atomic_inc_64(&state->id_multi_rcv); 8032 8033 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 8034 /* 8035 * Set the receive checksum status in mp. 8036 * Hardware checksumming can be considered valid only if: 8037 * 1. CQE.IP_OK bit is set 8038 * 2. CQE.CKSUM = 0xffff 8039 * 3. IPv6 routing header is not present in the packet 8040 * 4. No IP options are present in the IP header 8041 */ 8042 8043 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 8044 (wc->wc_cksum == 0xFFFF) && 8045 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 8046 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 8047 } 8048 8049 return (mp); 8050 } 8051 8052 /* 8053 * Callback code invoked from STREAMS when the receive data buffer is 8054 * free for recycling.
8055 */ 8056 static void 8057 ibd_freemsg_cb(char *arg) 8058 { 8059 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 8060 ibd_state_t *state = rwqe->w_state; 8061 8062 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 8063 8064 /* 8065 * If the driver is stopped, just free the rwqe. 8066 */ 8067 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 8068 DPRINT(6, "ibd_freemsg: wqe being freed"); 8069 rwqe->rwqe_im_mblk = NULL; 8070 ibd_free_rwqe(state, rwqe); 8071 return; 8072 } 8073 8074 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 8075 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 8076 if (rwqe->rwqe_im_mblk == NULL) { 8077 ibd_free_rwqe(state, rwqe); 8078 DPRINT(6, "ibd_freemsg: desballoc failed"); 8079 return; 8080 } 8081 8082 ibd_post_recv(state, rwqe); 8083 } 8084 8085 static uint_t 8086 ibd_tx_recycle(caddr_t arg) 8087 { 8088 ibd_state_t *state = (ibd_state_t *)arg; 8089 8090 /* 8091 * Poll for completed entries 8092 */ 8093 ibd_poll_scq(state, state->id_scq_hdl); 8094 8095 return (DDI_INTR_CLAIMED); 8096 } 8097 8098 #ifdef IBD_LOGGING 8099 static void 8100 ibd_log_init(void) 8101 { 8102 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 8103 ibd_lbuf_ndx = 0; 8104 8105 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 8106 } 8107 8108 static void 8109 ibd_log_fini(void) 8110 { 8111 if (ibd_lbuf) 8112 kmem_free(ibd_lbuf, IBD_LOG_SZ); 8113 ibd_lbuf_ndx = 0; 8114 ibd_lbuf = NULL; 8115 8116 mutex_destroy(&ibd_lbuf_lock); 8117 } 8118 8119 static void 8120 ibd_log(const char *fmt, ...) 8121 { 8122 va_list ap; 8123 uint32_t off; 8124 uint32_t msglen; 8125 char tmpbuf[IBD_DMAX_LINE]; 8126 8127 if (ibd_lbuf == NULL) 8128 return; 8129 8130 va_start(ap, fmt); 8131 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 8132 va_end(ap); 8133 8134 if (msglen >= IBD_DMAX_LINE) 8135 msglen = IBD_DMAX_LINE - 1; 8136 8137 mutex_enter(&ibd_lbuf_lock); 8138 8139 off = ibd_lbuf_ndx; /* current msg should go here */ 8140 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 8141 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 8142 8143 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 8144 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 8145 8146 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 8147 ibd_lbuf_ndx = 0; 8148 8149 mutex_exit(&ibd_lbuf_lock); 8150 8151 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 8152 } 8153 #endif 8154 8155 /* ARGSUSED */ 8156 static int 8157 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8158 int *rvalp) 8159 { 8160 ibd_create_ioctl_t *cmd = karg; 8161 ibd_state_t *state, *port_state, *p; 8162 int i, err, rval = 0; 8163 mac_register_t *macp; 8164 ibt_hca_portinfo_t *pinfop = NULL; 8165 ibt_status_t ibt_status; 8166 uint_t psize, pinfosz; 8167 boolean_t force_create = B_FALSE; 8168 8169 cmd->ibdioc.ioc_status = 0; 8170 8171 if (cmd->ibdioc.ioc_port_inst < 0) { 8172 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8173 return (EINVAL); 8174 } 8175 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst); 8176 if (port_state == NULL) { 8177 DPRINT(10, "ibd_create_partition: failed to get state %d", 8178 cmd->ibdioc.ioc_port_inst); 8179 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8180 return (EINVAL); 8181 } 8182 8183 /* Limited PKeys not supported */ 8184 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) { 8185 rval = EINVAL; 8186 goto part_create_return; 8187 } 8188 8189 if (cmd->ioc_force_create == 0) { 8190 /* 8191 * Check if the port pkey table contains the pkey for which 8192 * 
this partition is being created. 8193 */ 8194 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8195 port_state->id_port, &pinfop, &psize, &pinfosz); 8196 8197 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8198 rval = EINVAL; 8199 goto part_create_return; 8200 } 8201 8202 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) { 8203 rval = ENETDOWN; 8204 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN; 8205 goto part_create_return; 8206 } 8207 8208 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) { 8209 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) { 8210 break; 8211 } 8212 } 8213 if (i == pinfop->p_pkey_tbl_sz) { 8214 rval = EINVAL; 8215 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT; 8216 goto part_create_return; 8217 } 8218 } else { 8219 force_create = B_TRUE; 8220 } 8221 8222 mutex_enter(&ibd_objlist_lock); 8223 for (p = ibd_objlist_head; p; p = p->id_next) { 8224 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) && 8225 (p->id_pkey == cmd->ioc_pkey)) { 8226 mutex_exit(&ibd_objlist_lock); 8227 rval = EEXIST; 8228 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS; 8229 goto part_create_return; 8230 } 8231 } 8232 mutex_exit(&ibd_objlist_lock); 8233 8234 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP); 8235 8236 state->id_type = IBD_PARTITION_OBJ; 8237 8238 state->id_plinkid = cmd->ioc_partid; 8239 state->id_dlinkid = cmd->ibdioc.ioc_linkid; 8240 state->id_port_inst = cmd->ibdioc.ioc_port_inst; 8241 8242 state->id_dip = port_state->id_dip; 8243 state->id_port = port_state->id_port; 8244 state->id_pkey = cmd->ioc_pkey; 8245 state->id_hca_guid = port_state->id_hca_guid; 8246 state->id_port_guid = port_state->id_port_guid; 8247 state->id_force_create = force_create; 8248 8249 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 8250 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 8251 8252 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) { 8253 rval = EIO; 8254 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE; 8255 goto fail; 8256 } 8257 8258 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 8259 rval = EAGAIN; 8260 goto fail; 8261 } 8262 8263 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 8264 macp->m_dip = port_state->id_dip; 8265 macp->m_instance = (uint_t)-1; 8266 macp->m_driver = state; 8267 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 8268 macp->m_callbacks = &ibd_m_callbacks; 8269 macp->m_min_sdu = 0; 8270 if (state->id_enable_rc) { 8271 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 8272 } else { 8273 macp->m_max_sdu = IBD_DEF_MAX_SDU; 8274 } 8275 macp->m_priv_props = ibd_priv_props; 8276 8277 err = mac_register(macp, &state->id_mh); 8278 mac_free(macp); 8279 8280 if (err != 0) { 8281 DPRINT(10, "ibd_create_partition: mac_register() failed %d", 8282 err); 8283 rval = err; 8284 goto fail; 8285 } 8286 8287 err = dls_devnet_create(state->id_mh, 8288 cmd->ioc_partid, crgetzoneid(credp)); 8289 if (err != 0) { 8290 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed " 8291 "%d", err); 8292 rval = err; 8293 (void) mac_unregister(state->id_mh); 8294 goto fail; 8295 } 8296 8297 /* 8298 * Add the new partition state structure to the list 8299 */ 8300 mutex_enter(&ibd_objlist_lock); 8301 if (ibd_objlist_head) 8302 state->id_next = ibd_objlist_head; 8303 8304 ibd_objlist_head = state; 8305 mutex_exit(&ibd_objlist_lock); 8306 8307 part_create_return: 8308 if (pinfop) { 8309 ibt_free_portinfo(pinfop, pinfosz); 8310 } 8311 return (rval); 8312 8313 fail: 8314 if (pinfop) { 8315 ibt_free_portinfo(pinfop, pinfosz); 8316 } 8317 ibd_part_unattach(state); 8318 kmem_free(state, sizeof (ibd_state_t)); 
8319 return (rval); 8320 } 8321 8322 /* ARGSUSED */ 8323 static int 8324 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8325 int *rvalp) 8326 { 8327 int err; 8328 datalink_id_t tmpid; 8329 ibd_state_t *node, *prev; 8330 ibd_delete_ioctl_t *cmd = karg; 8331 8332 prev = NULL; 8333 8334 mutex_enter(&ibd_objlist_lock); 8335 node = ibd_objlist_head; 8336 8337 /* Find the ibd state structure corresponding to the partition */ 8338 while (node != NULL) { 8339 if (node->id_plinkid == cmd->ioc_partid) 8340 break; 8341 prev = node; 8342 node = node->id_next; 8343 } 8344 8345 if (node == NULL) { 8346 mutex_exit(&ibd_objlist_lock); 8347 return (ENOENT); 8348 } 8349 8350 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) { 8351 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed " 8352 "%d", err); 8353 mutex_exit(&ibd_objlist_lock); 8354 return (err); 8355 } 8356 8357 /* 8358 * Call ibd_part_unattach() only after making sure that the instance 8359 * has not been started yet and is also not in late hca init mode. 8360 */ 8361 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8362 8363 err = 0; 8364 if ((node->id_mac_state & IBD_DRV_STARTED) || 8365 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) || 8366 (ibd_part_busy(node) != DDI_SUCCESS) || 8367 ((err = mac_disable(node->id_mh)) != 0)) { 8368 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid, 8369 crgetzoneid(credp)); 8370 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8371 mutex_exit(&ibd_objlist_lock); 8372 return (err != 0 ? err : EBUSY); 8373 } 8374 8375 node->id_mac_state |= IBD_DRV_IN_DELETION; 8376 8377 ibd_part_unattach(node); 8378 8379 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8380 8381 /* Remove the partition state structure from the linked list */ 8382 if (prev == NULL) 8383 ibd_objlist_head = node->id_next; 8384 else 8385 prev->id_next = node->id_next; 8386 mutex_exit(&ibd_objlist_lock); 8387 8388 if ((err = mac_unregister(node->id_mh)) != 0) { 8389 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d", 8390 err); 8391 } 8392 8393 cv_destroy(&node->id_macst_cv); 8394 mutex_destroy(&node->id_macst_lock); 8395 8396 kmem_free(node, sizeof (ibd_state_t)); 8397 8398 return (0); 8399 }
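/*
 * Editor's note: a minimal sketch of the argument block that the
 * create-part path above consumes. The field usage is taken from
 * ibd_create_partition()/ibd_delete_partition(); the example_* helper
 * and the IBD_EXAMPLE_SKETCH guard are hypothetical.
 */
#ifdef IBD_EXAMPLE_SKETCH
static void
example_fill_create_ioctl(ibd_create_ioctl_t *cmd, int port_inst,
    datalink_id_t port_linkid, datalink_id_t part_linkid, ib_pkey_t pkey)
{
	cmd->ibdioc.ioc_port_inst = port_inst;	/* ibd port instance */
	cmd->ibdioc.ioc_linkid = port_linkid;	/* underlying link id */
	cmd->ioc_partid = part_linkid;	/* datalink id of the new part */
	cmd->ioc_pkey = pkey;		/* full-membership PKEY */
	cmd->ioc_force_create = 0;	/* fail if PKEY not in port table */
}
#endif	/* IBD_EXAMPLE_SKETCH */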
8400 8401 /* ARGSUSED */ 8402 static int 8403 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred, 8404 int *rvalp) 8405 { 8406 ibd_ioctl_t cmd; 8407 ibpart_ioctl_t partioc; 8408 ibport_ioctl_t portioc; 8409 #ifdef _MULTI_DATAMODEL 8410 ibport_ioctl32_t portioc32; 8411 #endif 8412 ibd_state_t *state, *port_state; 8413 int size; 8414 ibt_hca_portinfo_t *pinfop = NULL; 8415 ibt_status_t ibt_status; 8416 uint_t psize, pinfosz; 8417 int rval = 0; 8418 8419 size = sizeof (ibd_ioctl_t); 8420 if (ddi_copyin((void *)arg, &cmd, size, mode)) { 8421 return (EFAULT); 8422 } 8423 cmd.ioc_status = 0; 8424 switch (cmd.ioc_info_cmd) { 8425 case IBD_INFO_CMD_IBPART: 8426 size = sizeof (ibpart_ioctl_t); 8427 if (ddi_copyin((void *)arg, &partioc, size, mode)) { 8428 return (EFAULT); 8429 } 8430 8431 mutex_enter(&ibd_objlist_lock); 8432 /* Find the ibd state structure corresponding to the partition */ 8433 for (state = ibd_objlist_head; state; state = state->id_next) { 8434 if (state->id_plinkid == cmd.ioc_linkid) { 8435 break; 8436 } 8437 } 8438 8439 if (state == NULL) { 8440 mutex_exit(&ibd_objlist_lock); 8441 return (ENOENT); 8442 } 8443 8444 partioc.ibdioc.ioc_linkid = state->id_dlinkid; 8445 partioc.ibdioc.ioc_port_inst = state->id_port_inst; 8446 partioc.ibdioc.ioc_portnum = state->id_port; 8447 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid; 8448 partioc.ibdioc.ioc_portguid = state->id_port_guid; 8449 partioc.ibdioc.ioc_status = 0; 8450 partioc.ioc_partid = state->id_plinkid; 8451 partioc.ioc_pkey = state->id_pkey; 8452 partioc.ioc_force_create = state->id_force_create; 8453 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) { 8454 mutex_exit(&ibd_objlist_lock); 8455 return (EFAULT); 8456 } 8457 mutex_exit(&ibd_objlist_lock); 8458 8459 break; 8460 8461 case IBD_INFO_CMD_IBPORT: 8462 if ((cmd.ioc_port_inst < 0) || ((port_state = 8463 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8464 DPRINT(10, "ibd_get_partition_info: failed to get" 8465 " state %d", cmd.ioc_port_inst); 8466 size = sizeof (ibd_ioctl_t); 8467 cmd.ioc_status = IBD_INVALID_PORT_INST; 8468 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8469 mode)) { 8470 return (EFAULT); 8471 } 8472 return (EINVAL); 8473 } 8474 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8475 port_state->id_port, &pinfop, &psize, &pinfosz); 8476 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8477 return (EINVAL); 8478 } 8479 #ifdef _MULTI_DATAMODEL 8480 switch (ddi_model_convert_from(mode & FMODELS)) { 8481 case DDI_MODEL_ILP32: { 8482 size = sizeof (ibport_ioctl32_t); 8483 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8484 rval = EFAULT; 8485 goto fail; 8486 } 8487 portioc32.ibdioc.ioc_status = 0; 8488 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8489 portioc32.ibdioc.ioc_hcaguid = 8490 port_state->id_hca_guid; 8491 portioc32.ibdioc.ioc_portguid = 8492 port_state->id_port_guid; 8493 if (portioc32.ioc_pkey_tbl_sz != 8494 pinfop->p_pkey_tbl_sz) { 8495 rval = EINVAL; 8496 size = sizeof (ibd_ioctl_t); 8497 portioc32.ibdioc.ioc_status = 8498 IBD_INVALID_PKEY_TBL_SIZE; 8499 if (ddi_copyout((void *)&portioc32.ibdioc, 8500 (void *)arg, size, mode)) { 8501 rval = EFAULT; 8502 goto fail; 8503 } 8504 goto fail; 8505 } 8506 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8507 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8508 (void *)(uintptr_t)portioc32.ioc_pkeys, size, 8509 mode)) { 8510 rval = EFAULT; 8511 goto fail; 8512 } 8513 size = sizeof (ibport_ioctl32_t); 8514 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8515 mode)) { 8516 rval = EFAULT; 8517 goto fail; 8518 } 8519 break; 8520 } 8521 case DDI_MODEL_NONE: 8522 size = sizeof (ibport_ioctl_t); 8523 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8524 rval = EFAULT; 8525 goto fail; 8526 } 8527 portioc.ibdioc.ioc_status = 0; 8528 portioc.ibdioc.ioc_portnum = port_state->id_port; 8529 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8530 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8531 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8532 rval = EINVAL; 8533 size = sizeof (ibd_ioctl_t); 8534 portioc.ibdioc.ioc_status = 8535 IBD_INVALID_PKEY_TBL_SIZE; 8536 if (ddi_copyout((void *)&portioc.ibdioc, 8537 (void *)arg, size, mode)) { 8538 rval = EFAULT; 8539 goto fail; 8540 } 8541 goto fail; 8542 } 8543 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8544 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8545 (void *)(portioc.ioc_pkeys), size, mode)) { 8546 rval = EFAULT; 8547 goto fail; 8548 } 8549 size = sizeof (ibport_ioctl_t); 8550 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8551 mode)) { 8552 rval = EFAULT; 8553 goto fail; 8554 } 8555 break; 8556 }
8557 #else /* ! _MULTI_DATAMODEL */ 8558 size = sizeof (ibport_ioctl_t); 8559 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8560 rval = EFAULT; 8561 goto fail; 8562 } 8563 portioc.ibdioc.ioc_status = 0; 8564 portioc.ibdioc.ioc_portnum = port_state->id_port; 8565 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8566 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8567 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8568 rval = EINVAL; 8569 size = sizeof (ibd_ioctl_t); 8570 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; 8571 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg, 8572 size, mode)) { 8573 rval = EFAULT; 8574 goto fail; 8575 } 8576 goto fail; 8577 } 8578 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8579 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8580 (void *)(portioc.ioc_pkeys), size, mode)) { 8581 rval = EFAULT; 8582 goto fail; 8583 } 8584 size = sizeof (ibport_ioctl_t); 8585 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8586 mode)) { 8587 rval = EFAULT; 8588 goto fail; 8589 } 8590 #endif /* _MULTI_DATAMODEL */ 8591 8592 break; 8593 8594 case IBD_INFO_CMD_PKEYTBLSZ: 8595 if ((cmd.ioc_port_inst < 0) || ((port_state = 8596 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8597 DPRINT(10, "ibd_get_partition_info: failed to get" 8598 " state %d", cmd.ioc_port_inst); 8599 size = sizeof (ibd_ioctl_t); 8600 cmd.ioc_status = IBD_INVALID_PORT_INST; 8601 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8602 mode)) { 8603 return (EFAULT); 8604 } 8605 return (EINVAL); 8606 } 8607 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8608 port_state->id_port, &pinfop, &psize, &pinfosz); 8609 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8610 return (EINVAL); 8611 } 8612 #ifdef _MULTI_DATAMODEL 8613 switch (ddi_model_convert_from(mode & FMODELS)) { 8614 case DDI_MODEL_ILP32: { 8615 size = sizeof (ibport_ioctl32_t); 8616 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8617 rval = EFAULT; 8618 goto fail; 8619 } 8620 portioc32.ibdioc.ioc_status = 0; 8621 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8622 portioc32.ibdioc.ioc_hcaguid = 8623 port_state->id_hca_guid; 8624 portioc32.ibdioc.ioc_portguid = 8625 port_state->id_port_guid; 8626 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8627 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8628 mode)) { 8629 rval = EFAULT; 8630 goto fail; 8631 } 8632 break; 8633 } 8634 case DDI_MODEL_NONE: 8635 size = sizeof (ibport_ioctl_t); 8636 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8637 rval = EFAULT; 8638 goto fail; 8639 } 8640 portioc.ibdioc.ioc_status = 0; 8641 portioc.ibdioc.ioc_portnum = port_state->id_port; 8642 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8643 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8644 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8645 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8646 mode)) { 8647 rval = EFAULT; 8648 goto fail; 8649 } 8650 break; 8651 }
8652 #else /* ! _MULTI_DATAMODEL */ 8653 size = sizeof (ibport_ioctl_t); 8654 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8655 rval = EFAULT; 8656 goto fail; 8657 } 8658 portioc.ibdioc.ioc_status = 0; 8659 portioc.ibdioc.ioc_portnum = port_state->id_port; 8660 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8661 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8662 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8663 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8664 mode)) { 8665 rval = EFAULT; 8666 goto fail; 8667 } 8668 #endif /* _MULTI_DATAMODEL */ 8669 break; 8670 8671 default: 8672 return (EINVAL); 8673 8674 } /* switch (cmd.ioc_info_cmd) */ 8675 fail: 8676 if (pinfop) { 8677 ibt_free_portinfo(pinfop, pinfosz); 8678 } 8679 return (rval); 8680 } 8681 8682 /* ARGSUSED */ 8683 static void 8684 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl, 8685 ibt_async_code_t code, ibt_async_event_t *event) 8686 { 8687 ibd_state_t *state = (ibd_state_t *)arg; 8688 link_state_t lstate; 8689 8690 switch (code) { 8691 case IBT_EVENT_PORT_UP: 8692 case IBT_ERROR_PORT_DOWN: 8693 if (ibd_get_port_state(state, &lstate) != 0) 8694 break; 8695 8696 if (state->id_link_state != lstate) { 8697 state->id_link_state = lstate; 8698 mac_link_update(state->id_mh, lstate); 8699 } 8700 break; 8701 default: 8702 break; 8703 } 8704 } 8705 8706 static int 8707 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate) 8708 { 8709 ibt_hca_portinfo_t *port_infop; 8710 uint_t psize, port_infosz; 8711 ibt_status_t ret; 8712 8713 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 8714 &port_infop, &psize, &port_infosz); 8715 if ((ret != IBT_SUCCESS) || (psize != 1)) 8716 return (-1); 8717 8718 state->id_sgid = *port_infop->p_sgid_tbl; 8719 state->id_link_speed = ibd_get_portspeed(state); 8720 8721 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) 8722 *lstate = LINK_STATE_UP; 8723 else 8724 *lstate = LINK_STATE_DOWN; 8725 8726 ibt_free_portinfo(port_infop, port_infosz); 8727 return (0); 8728 }
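/*
 * Editor's note: every ibt_query_hca_ports() caller in this file pairs
 * the query with ibt_free_portinfo() and treats psize != 1 as failure,
 * as ibd_get_port_state() above does. A minimal sketch of that
 * discipline (the example_* name and the IBD_EXAMPLE_SKETCH guard are
 * hypothetical):
 */
#ifdef IBD_EXAMPLE_SKETCH
static boolean_t
example_port_is_active(ibd_state_t *state)
{
	ibt_hca_portinfo_t *pinfop;
	uint_t psize, pinfosz;
	boolean_t active;

	if (ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &pinfop, &psize, &pinfosz) != IBT_SUCCESS)
		return (B_FALSE);

	active = (psize == 1 && pinfop->p_linkstate == IBT_PORT_ACTIVE);
	ibt_free_portinfo(pinfop, pinfosz);	/* always release the buffer */
	return (active);
}
#endif	/* IBD_EXAMPLE_SKETCH */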
8729 8730 static int 8731 ibd_port_attach(dev_info_t *dip) 8732 { 8733 ibd_state_t *state; 8734 link_state_t lstate; 8735 int instance; 8736 ibt_status_t ret; 8737 8738 /* 8739 * Allocate softstate structure 8740 */ 8741 instance = ddi_get_instance(dip); 8742 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) { 8743 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed"); 8744 return (DDI_FAILURE); 8745 } 8746 8747 state = ddi_get_soft_state(ibd_list, instance); 8748 8749 state->id_dip = dip; 8750 state->id_type = IBD_PORT_DRIVER; 8751 8752 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 8753 "port-number", 0)) == 0) { 8754 DPRINT(10, "ibd_port_attach: invalid port number (%d)", 8755 state->id_port); 8756 goto done; 8757 } 8758 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8759 "hca-guid", 0)) == 0) { 8760 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)", 8761 state->id_hca_guid); 8762 goto done; 8763 } 8764 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8765 "port-guid", 0)) == 0) { 8766 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)", 8767 state->id_port_guid); 8768 goto done; 8769 } 8770 8771 /* 8772 * Attach to IBTL 8773 */ 8774 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state, 8775 &state->id_ibt_hdl)) != IBT_SUCCESS) { 8776 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d", 8777 ret); 8778 goto done; 8779 } 8780 8781 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 8782 8783 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 8784 &state->id_hca_hdl)) != IBT_SUCCESS) { 8785 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d", 8786 ret); 8787 goto done; 8788 } 8789 state->id_mac_state |= IBD_DRV_HCA_OPENED; 8790 8791 /* Update link status */ 8792 8793 if (ibd_get_port_state(state, &lstate) != 0) { 8794 DPRINT(10, "ibd_port_attach: " 8795 "ibd_get_port_state() failed"); 8796 goto done; 8797 } 8798 state->id_link_state = lstate; 8799 /* 8800 * Register ibd interfaces with the Nemo framework 8801 */ 8802 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 8803 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()"); 8804 goto done; 8805 } 8806 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 8807 8808 mac_link_update(state->id_mh, lstate); 8809 8810 return (DDI_SUCCESS); 8811 done: 8812 (void) ibd_port_unattach(state, dip); 8813 return (DDI_FAILURE); 8814 } 8815 8816 static int 8817 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip) 8818 { 8819 int instance; 8820 uint32_t progress = state->id_mac_state; 8821 ibt_status_t ret; 8822 8823 if (progress & IBD_DRV_MAC_REGISTERED) { 8824 (void) mac_unregister(state->id_mh); 8825 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 8826 } 8827 8828 if (progress & IBD_DRV_HCA_OPENED) { 8829 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 8830 IBT_SUCCESS) { 8831 ibd_print_warn(state, "failed to close " 8832 "HCA device, ret=%d", ret); 8833 } 8834 state->id_hca_hdl = NULL; 8835 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 8836 } 8837 8838 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 8839 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 8840 ibd_print_warn(state, 8841 "ibt_detach() failed, ret=%d", ret); 8842 } 8843 state->id_ibt_hdl = NULL; 8844 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 8845 } 8846 instance = ddi_get_instance(dip); 8847 ddi_soft_state_free(ibd_list, instance); 8848 8849 return (DDI_SUCCESS); 8850 } 8851 8852 ibt_status_t 8853 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr) 8854 { 8855 ibd_state_t *state; 8856 8857 mutex_enter(&ibd_objlist_lock); 8858 8859 /* Find the ibd state structure corresponding to the partition */ 8860 for (state = ibd_objlist_head; state; state = state->id_next) { 8861 if (state->id_plinkid == linkid) { 8862 break; 8863 } 8864 } 8865 8866 if (state == NULL) { 8867 mutex_exit(&ibd_objlist_lock); 8868 return (IBT_NO_SUCH_OBJECT); 8869 } 8870 8871 attr->pa_dlinkid = state->id_dlinkid; 8872 attr->pa_plinkid = state->id_plinkid; 8873 attr->pa_port = state->id_port; 8874 attr->pa_hca_guid = state->id_hca_guid; 8875 attr->pa_port_guid = state->id_port_guid; 8876 attr->pa_pkey = state->id_pkey; 8877 8878 mutex_exit(&ibd_objlist_lock); 8879 8880 return (IBT_SUCCESS); 8881 }
8882 8883 ibt_status_t 8884 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts) 8885 { 8886 ibd_state_t *state; 8887 int n = 0; 8888 ibt_part_attr_t *attr; 8889 8890 mutex_enter(&ibd_objlist_lock); 8891 8892 for (state = ibd_objlist_head; state; state = state->id_next) 8893 n++; 8894 8895 *nparts = n; 8896 if (n == 0) { 8897 *attr_list = NULL; 8898 mutex_exit(&ibd_objlist_lock); 8899 return (IBT_SUCCESS); 8900 } 8901 8902 *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP); 8903 attr = *attr_list; 8904 for (state = ibd_objlist_head; state; state = state->id_next) { 8905 #ifdef DEBUG 8906 ASSERT(n > 0); 8907 n--; 8908 #endif 8909 attr->pa_dlinkid = state->id_dlinkid; 8910 attr->pa_plinkid = state->id_plinkid; 8911 attr->pa_port = state->id_port; 8912 attr->pa_hca_guid = state->id_hca_guid; 8913 attr->pa_port_guid = state->id_port_guid; 8914 attr->pa_pkey = state->id_pkey; 8915 attr++; 8916 } 8917 8918 mutex_exit(&ibd_objlist_lock); 8919 return (IBT_SUCCESS); 8920 } 8921
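/*
 * Editor's note: ibd_get_all_part_attr() above hands back a
 * kmem_alloc()'d array sized by *nparts, so the caller owns the memory
 * and must free it. A minimal sketch of the expected call/free pairing
 * (the example_* name and the IBD_EXAMPLE_SKETCH guard are
 * hypothetical):
 */
#ifdef IBD_EXAMPLE_SKETCH
static void
example_walk_partitions(void)
{
	ibt_part_attr_t *attrs;
	int nparts, i;

	if (ibd_get_all_part_attr(&attrs, &nparts) != IBT_SUCCESS)
		return;

	for (i = 0; i < nparts; i++) {
		/* e.g., inspect attrs[i].pa_pkey / attrs[i].pa_port here */
	}

	if (attrs != NULL)
		kmem_free(attrs, sizeof (ibt_part_attr_t) * nparts);
}
#endif	/* IBD_EXAMPLE_SKETCH */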