1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * An implementation of the IPoIB standard based on PSARC 2001/289. 28 */ 29 30 #include <sys/types.h> 31 #include <sys/conf.h> 32 #include <sys/ddi.h> 33 #include <sys/sunddi.h> 34 #include <sys/modctl.h> 35 #include <sys/stropts.h> 36 #include <sys/stream.h> 37 #include <sys/strsun.h> 38 #include <sys/strsubr.h> 39 #include <sys/dlpi.h> 40 #include <sys/mac_provider.h> 41 42 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 43 #include <sys/sysmacros.h> /* for offsetof */ 44 #include <sys/disp.h> /* for async thread pri */ 45 #include <sys/atomic.h> /* for atomic_add*() */ 46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 47 #include <netinet/in.h> /* for netinet/ip.h below */ 48 #include <netinet/ip.h> /* for struct ip */ 49 #include <netinet/udp.h> /* for struct udphdr */ 50 #include <inet/common.h> /* for inet/ip.h below */ 51 #include <inet/ip.h> /* for ipha_t */ 52 #include <inet/ip6.h> /* for ip6_t */ 53 #include <inet/tcp.h> /* for tcph_t */ 54 #include <netinet/icmp6.h> /* for icmp6_t */ 55 #include <sys/callb.h> 56 #include <sys/modhash.h> 57 58 #include <sys/ib/clients/ibd/ibd.h> 59 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 60 #include <sys/note.h> 61 #include <sys/multidata.h> 62 63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 64 65 #include <sys/priv_names.h> 66 #include <sys/dls.h> 67 #include <sys/dld_ioc.h> 68 #include <sys/policy.h> 69 #include <sys/ibpart.h> 70 #include <sys/file.h> 71 72 /* 73 * The write-up below includes details on the following: 74 * 1. The dladm administrative model. 75 * 2. Late HCA initialization feature. 76 * 3. Brussels support and its implications for the current architecture. 77 * 78 * 1. The dladm administrative model. 79 * ------------------------------------------ 80 * With the dladm model, ibnex will create one ibd instance per port. These 81 * instances will be created independent of the port state. 82 * 83 * The ibd driver is two-faceted: one side works as the port driver and 84 * the other as the partition object driver. 85 * 86 * The port instance is a child of the HCA, and will have an entry in the devfs. 87 * A DDI attach only happens for the port driver, and its attach is 88 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is 89 * handled in ibd_port_unattach(). 90 * 91 * The partition object is only a registrant to the mac layer via mac_register() 92 * and does not have an entry in the device tree. There is no DDI softstate 93 * managed by the DDI framework for the partition objects.
However, the state is 94 * managed inside the ibd driver, and every partition object hangs off the 95 * "ibd_objlist_head". 96 * 97 * The partition object first comes into existence when a user runs the 98 * 'create-part' subcommand of dladm. This is like invoking the attach entry 99 * point of the partition object. The partition object goes away with the 100 * 'delete-part' subcommand of dladm. This is like invoking the detach entry 101 * point of the partition object. 102 * 103 * The create-part and delete-part subcommands result in dld ioctls that end up 104 * calling ibd_create_partition() and ibd_delete_partition() respectively. 105 * These ioctls are registered with the dld layer in _init() via a call to 106 * dld_ioc_register(). 107 * 108 * The port instance by itself cannot be plumbed. It is only the partition 109 * objects that can be plumbed; they alone participate in I/O, not the 110 * port driver. 111 * 112 * There are some info ioctls supported in ibd which are used by dladm(1M) to 113 * display useful information. The info entry point for ibd is 114 * ibd_get_partition_info(). 115 * 116 * 2. Late HCA initialization feature. 117 * ------------------------------------ 118 * As mentioned in section 1, the user creates the partition objects via 119 * dladm(1M). It is possible that: 120 * a) The physical port itself is down and the SM cannot be reached. 121 * b) The PKEY specified by the user has not been created in the SM yet. 122 * c) An IPoIB broadcast group for the specified PKEY is not present. 123 * 124 * In all of the above cases, complete initialization of the partition object is 125 * not possible. However, the new model allows the creation of partition 126 * objects even in such cases but defers the initialization until later. 127 * When such a partition object is plumbed, the link state will be displayed as 128 * "down". 129 * The driver, at this point, is listening for events that herald the 130 * availability of resources - 131 * i) LINK_UP when the link becomes available 132 * ii) PORT_CHANGE when the PKEY has been created 133 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been 134 * created 135 * via ibd_async_handler() for events i) and ii), and via 136 * ibd_snet_notices_handler() for iii). 137 * The driver handles these events (as and when they arrive) and completes the 138 * initialization of the partition object and transitions it to a usable state. 139 * 140 * 3. Brussels support and its implications for the current architecture. 141 * --------------------------------------------------------------------- 142 * The Brussels support introduces two new interfaces to the ibd driver - 143 * ibd_m_getprop() and ibd_m_setprop(). 144 * These interfaces allow setting and retrieval of certain properties. 145 * Some of them are public properties while most others are private properties 146 * meant to be used by developers. Tuning the latter kind can cause 147 * performance issues and should not be done without understanding the 148 * implications. All properties are specific to an instance of either the 149 * partition object or the port driver. 150 * 151 * The public properties are: mtu and linkmode. 152 * mtu is a read-only property. 153 * linkmode can take two values - UD and CM. 154 * 155 * Changing the linkmode requires some bookkeeping in the driver. The 156 * capabilities need to be re-reported to the mac layer. This is done by 157 * calling mac_capab_update(). The maxsdu is updated by calling 158 * mac_maxsdu_update().
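 *
 * A minimal sketch of that bookkeeping (illustrative only; it assumes
 * the partition object's mac handle lives in state->id_mh and that the
 * new SDU value has already been computed):
 *	mac_capab_update(state->id_mh);
 *	(void) mac_maxsdu_update(state->id_mh, new_maxsdu);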
159 * The private properties retain their values across the change of linkmode. 160 * NOTE: 161 * - The port driver does not support any property apart from mtu. 162 * - All other properties are only meant for the partition object. 163 * - The properties cannot be set when an instance is plumbed. The 164 * instance has to be unplumbed to effect any setting. 165 */ 166 167 /* 168 * Driver wide tunables 169 * 170 * ibd_tx_softintr 171 * ibd_rx_softintr 172 * The softintr mechanism allows ibd to avoid event queue overflows when 173 * the receive/completion handlers are expensive. Both are enabled 174 * by default. 175 * 176 * ibd_log_sz 177 * This specifies the size of the ibd log buffer in bytes. The buffer is 178 * allocated and logging is enabled only when IBD_LOGGING is defined. 179 * 180 */ 181 uint_t ibd_rx_softintr = 1; 182 uint_t ibd_tx_softintr = 1; 183 184 #ifdef IBD_LOGGING 185 uint_t ibd_log_sz = 0x20000; 186 #endif 187 188 #ifdef IBD_LOGGING 189 #define IBD_LOG_SZ ibd_log_sz 190 #endif 191 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */ 193 #define IBD_RX_POST_CNT 8 194 195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ 196 #define IBD_LOG_RX_POST 4 197 198 /* Minimum number of receive work requests the driver needs to always have */ 199 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) 200 201 /* 202 * LSO parameters 203 */ 204 #define IBD_LSO_MAXLEN 65536 205 #define IBD_LSO_BUFSZ 8192 206 207 /* 208 * Async operation states 209 */ 210 #define IBD_OP_NOTSTARTED 0 211 #define IBD_OP_ONGOING 1 212 #define IBD_OP_COMPLETED 2 213 #define IBD_OP_ERRORED 3 214 #define IBD_OP_ROUTERED 4 215 216 /* 217 * Start/stop in-progress flags; note that restart must always remain 218 * the OR of start and stop flag values.
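 * For example, with the values below, IBD_DRV_RESTART_IN_PROGRESS is
 * 0x30000000 == (0x10000000 | 0x20000000), so a single check such as
 * (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) notices either a
 * start or a stop in progress (an illustrative use of the id_mac_state
 * field referenced elsewhere in this file).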
219 */ 220 #define IBD_DRV_START_IN_PROGRESS 0x10000000 221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS 224 225 /* 226 * Miscellaneous constants 227 */ 228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 229 #define IBD_DEF_MAX_SDU 2044 230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) 231 #define IBD_DEF_RC_MAX_SDU 65520 232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) 233 #define IBD_DEFAULT_QKEY 0xB1B 234 #ifdef IBD_LOGGING 235 #define IBD_DMAX_LINE 100 236 #endif 237 238 /* 239 * Enumerations for link states 240 */ 241 typedef enum { 242 IBD_LINK_DOWN, 243 IBD_LINK_UP, 244 IBD_LINK_UP_ABSENT 245 } ibd_link_op_t; 246 247 /* 248 * Driver State Pointer 249 */ 250 void *ibd_list; 251 252 /* 253 * Driver Global Data 254 */ 255 ibd_global_state_t ibd_gstate; 256 257 /* 258 * Partition object list 259 */ 260 ibd_state_t *ibd_objlist_head = NULL; 261 kmutex_t ibd_objlist_lock; 262 263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */ 264 265 /* 266 * Logging 267 */ 268 #ifdef IBD_LOGGING 269 kmutex_t ibd_lbuf_lock; 270 uint8_t *ibd_lbuf; 271 uint32_t ibd_lbuf_ndx; 272 #endif 273 274 /* 275 * Required system entry points 276 */ 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 279 280 /* 281 * Required driver entry points for GLDv3 282 */ 283 static int ibd_m_stat(void *, uint_t, uint64_t *); 284 static int ibd_m_start(void *); 285 static void ibd_m_stop(void *); 286 static int ibd_m_promisc(void *, boolean_t); 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 288 static int ibd_m_unicst(void *, const uint8_t *); 289 static mblk_t *ibd_m_tx(void *, mblk_t *); 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 291 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 293 const void *); 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, 296 mac_prop_info_handle_t); 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, 298 const void *); 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); 300 301 /* 302 * Private driver entry points for GLDv3 303 */ 304 305 /* 306 * Initialization 307 */ 308 static int ibd_state_init(ibd_state_t *, dev_info_t *); 309 static int ibd_init_txlist(ibd_state_t *); 310 static int ibd_init_rxlist(ibd_state_t *); 311 static int ibd_acache_init(ibd_state_t *); 312 #ifdef IBD_LOGGING 313 static void ibd_log_init(void); 314 #endif 315 316 /* 317 * Termination/cleanup 318 */ 319 static void ibd_state_fini(ibd_state_t *); 320 static void ibd_fini_txlist(ibd_state_t *); 321 static void ibd_fini_rxlist(ibd_state_t *); 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 324 static void ibd_acache_fini(ibd_state_t *); 325 #ifdef IBD_LOGGING 326 static void ibd_log_fini(void); 327 #endif 328 329 /* 330 * Allocation/acquire/map routines 331 */ 332 static int ibd_alloc_tx_copybufs(ibd_state_t *); 333 static int ibd_alloc_rx_copybufs(ibd_state_t *); 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 337 uint32_t *); 338 339 /* 340 * Free/release/unmap 
routines 341 */ 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 343 static void ibd_free_tx_copybufs(ibd_state_t *); 344 static void ibd_free_rx_copybufs(ibd_state_t *); 345 static void ibd_free_rx_rsrcs(ibd_state_t *); 346 static void ibd_free_tx_lsobufs(ibd_state_t *); 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 350 351 /* 352 * Handlers/callback routines 353 */ 354 static uint_t ibd_intr(caddr_t); 355 static uint_t ibd_tx_recycle(caddr_t); 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 362 static void ibd_freemsg_cb(char *); 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 364 ibt_async_event_t *); 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 366 ibt_async_event_t *); 367 static void ibd_snet_notices_handler(void *, ib_gid_t, 368 ibt_subnet_event_code_t, ibt_subnet_event_t *); 369 370 /* 371 * Send/receive routines 372 */ 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 377 378 /* 379 * Threads 380 */ 381 static void ibd_async_work(ibd_state_t *); 382 383 /* 384 * Async tasks 385 */ 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 388 static void ibd_async_setprom(ibd_state_t *); 389 static void ibd_async_unsetprom(ibd_state_t *); 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 392 static void ibd_async_txsched(ibd_state_t *); 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 394 395 /* 396 * Async task helpers 397 */ 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 402 ipoib_mac_t *, ipoib_mac_t *); 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 409 static uint64_t ibd_get_portspeed(ibd_state_t *); 410 static boolean_t ibd_async_safe(ibd_state_t *); 411 static void ibd_async_done(ibd_state_t *); 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 416 417 /* 418 * Helpers for attach/start routines 419 */ 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 421 
static int ibd_record_capab(ibd_state_t *); 422 static int ibd_get_port_details(ibd_state_t *); 423 static int ibd_alloc_cqs(ibd_state_t *); 424 static int ibd_setup_ud_channel(ibd_state_t *); 425 static int ibd_start(ibd_state_t *); 426 static int ibd_undo_start(ibd_state_t *, link_state_t); 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); 430 static void ibd_part_unattach(ibd_state_t *state); 431 static int ibd_port_attach(dev_info_t *); 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *); 434 static int ibd_part_busy(ibd_state_t *); 435 436 /* 437 * Miscellaneous helpers 438 */ 439 static int ibd_sched_poll(ibd_state_t *, int, int); 440 static void ibd_resume_transmission(ibd_state_t *); 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 443 static void *list_get_head(list_t *); 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 446 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); 449 450 #ifdef IBD_LOGGING 451 static void ibd_log(const char *, ...); 452 #endif 453 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 456 457 /* Module Driver Info */ 458 static struct modldrv ibd_modldrv = { 459 &mod_driverops, /* This one is a driver */ 460 "InfiniBand GLDv3 Driver", /* short description */ 461 &ibd_dev_ops /* driver specific ops */ 462 }; 463 464 /* Module Linkage */ 465 static struct modlinkage ibd_modlinkage = { 466 MODREV_1, (void *)&ibd_modldrv, NULL 467 }; 468 469 /* 470 * Module (static) info passed to IBTL during ibt_attach 471 */ 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 473 IBTI_V_CURR, 474 IBT_NETWORK, 475 ibd_async_handler, 476 NULL, 477 "IBPART" 478 }; 479 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { 481 IBTI_V_CURR, 482 IBT_NETWORK, 483 ibdpd_async_handler, 484 NULL, 485 "IPIB" 486 }; 487 488 /* 489 * GLDv3 entry points 490 */ 491 #define IBD_M_CALLBACK_FLAGS \ 492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) 493 494 static mac_callbacks_t ibd_m_callbacks = { 495 IBD_M_CALLBACK_FLAGS, 496 ibd_m_stat, 497 ibd_m_start, 498 ibd_m_stop, 499 ibd_m_promisc, 500 ibd_m_multicst, 501 ibd_m_unicst, 502 ibd_m_tx, 503 NULL, 504 NULL, 505 ibd_m_getcapab, 506 NULL, 507 NULL, 508 ibd_m_setprop, 509 ibd_m_getprop, 510 ibd_m_propinfo 511 }; 512 513 /* Private properties */ 514 char *ibd_priv_props[] = { 515 "_ibd_broadcast_group", 516 "_ibd_coalesce_completions", 517 "_ibd_create_broadcast_group", 518 "_ibd_hash_size", 519 "_ibd_lso_enable", 520 "_ibd_num_ah", 521 "_ibd_num_lso_bufs", 522 "_ibd_rc_enable_srq", 523 "_ibd_rc_num_rwqe", 524 "_ibd_rc_num_srq", 525 "_ibd_rc_num_swqe", 526 "_ibd_rc_rx_comp_count", 527 "_ibd_rc_rx_comp_usec", 528 "_ibd_rc_rx_copy_thresh", 529 "_ibd_rc_rx_rwqe_thresh", 530 "_ibd_rc_tx_comp_count", 531 "_ibd_rc_tx_comp_usec", 532 "_ibd_rc_tx_copy_thresh", 533 "_ibd_ud_num_rwqe", 534 "_ibd_ud_num_swqe", 535 "_ibd_ud_rx_comp_count", 536 "_ibd_ud_rx_comp_usec", 537 "_ibd_ud_tx_comp_count", 538 "_ibd_ud_tx_comp_usec", 539 "_ibd_ud_tx_copy_thresh", 540 NULL 541 
}; 542 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *); 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); 546 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = { 548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), 549 ibd_create_partition, secpolicy_dl_config}, 550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), 551 ibd_delete_partition, secpolicy_dl_config}, 552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), 553 ibd_get_partition_info, NULL} 554 }; 555 556 /* 557 * Fill/clear <scope> and <p_key> in multicast/broadcast address 558 */ 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 560 { \ 561 *(uint32_t *)((char *)(maddr) + 4) |= \ 562 htonl((uint32_t)(scope) << 16); \ 563 *(uint32_t *)((char *)(maddr) + 8) |= \ 564 htonl((uint32_t)(pkey) << 16); \ 565 } 566 567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 568 { \ 569 *(uint32_t *)((char *)(maddr) + 4) &= \ 570 htonl(~((uint32_t)0xF << 16)); \ 571 *(uint32_t *)((char *)(maddr) + 8) &= \ 572 htonl(~((uint32_t)0xFFFF << 16)); \ 573 } 574 575 /* 576 * Rudimentary debugging support 577 */ 578 #ifdef DEBUG 579 int ibd_debuglevel = 100; 580 void 581 debug_print(int l, char *fmt, ...) 582 { 583 va_list ap; 584 585 if (l < ibd_debuglevel) 586 return; 587 va_start(ap, fmt); 588 vcmn_err(CE_CONT, fmt, ap); 589 va_end(ap); 590 } 591 #endif 592 593 /* 594 * Common routine to print warning messages; adds in hca guid, port number 595 * and pkey to be able to identify the IBA interface. 596 */ 597 void 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 599 { 600 ib_guid_t hca_guid; 601 char ibd_print_buf[MAXNAMELEN + 256]; 602 int len; 603 va_list ap; 604 char part_name[MAXNAMELEN]; 605 datalink_id_t linkid = state->id_plinkid; 606 607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 608 0, "hca-guid", 0); 609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL); 610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ", 612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey, 614 part_name); 615 va_start(ap, fmt); 616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 617 fmt, ap); 618 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 619 va_end(ap); 620 } 621 622 /* 623 * Warlock directives 624 */ 625 626 /* 627 * id_lso_lock 628 * 629 * state->id_lso->bkt_nfree may be accessed without a lock to 630 * determine the threshold at which we have to ask the nw layer 631 * to resume transmission (see ibd_resume_transmission()). 
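 * A lock-free peek at that counter might look like this sketch (the
 * threshold name here is hypothetical):
 *	if (state->id_lso != NULL &&
 *	    state->id_lso->bkt_nfree >= lso_resume_thresh)
 *		mac_tx_update(state->id_mh);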
632 */ 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 634 ibd_state_t::id_lso)) 635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 638 639 /* 640 * id_scq_poll_lock 641 */ 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 643 ibd_state_t::id_scq_poll_busy)) 644 645 /* 646 * id_txpost_lock 647 */ 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 649 ibd_state_t::id_tx_head)) 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 651 ibd_state_t::id_tx_busy)) 652 653 /* 654 * id_acache_req_lock 655 */ 656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 657 ibd_state_t::id_acache_req_cv)) 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 659 ibd_state_t::id_req_list)) 660 _NOTE(SCHEME_PROTECTS_DATA("atomic", 661 ibd_acache_s::ac_ref)) 662 663 /* 664 * id_ac_mutex 665 * 666 * This mutex is actually supposed to protect id_ah_op as well, 667 * but this path of the code isn't clean (see update of id_ah_op 668 * in ibd_async_acache(), immediately after the call to 669 * ibd_async_mcache()). For now, we'll skip this check by 670 * declaring that id_ah_op is protected by some internal scheme 671 * that warlock isn't aware of. 672 */ 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 674 ibd_state_t::id_ah_active)) 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 676 ibd_state_t::id_ah_free)) 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 678 ibd_state_t::id_ah_addr)) 679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 680 ibd_state_t::id_ah_op)) 681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 682 ibd_state_t::id_ah_error)) 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 684 ibd_state_t::id_ac_hot_ace)) 685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 686 687 /* 688 * id_mc_mutex 689 */ 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 691 ibd_state_t::id_mc_full)) 692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 693 ibd_state_t::id_mc_non)) 694 695 /* 696 * id_trap_lock 697 */ 698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 699 ibd_state_t::id_trap_cv)) 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 701 ibd_state_t::id_trap_stop)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 703 ibd_state_t::id_trap_inprog)) 704 705 /* 706 * id_prom_op 707 */ 708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 709 ibd_state_t::id_prom_op)) 710 711 /* 712 * id_sched_lock 713 */ 714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 715 ibd_state_t::id_sched_needed)) 716 717 /* 718 * id_link_mutex 719 */ 720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 721 ibd_state_t::id_link_state)) 722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 724 ibd_state_t::id_link_speed)) 725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 726 727 /* 728 * id_tx_list.dl_mutex 729 */ 730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 731 ibd_state_t::id_tx_list.dl_head)) 732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 733 ibd_state_t::id_tx_list.dl_pending_sends)) 734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 735 ibd_state_t::id_tx_list.dl_cnt)) 736 737 /* 738 * id_rx_list.dl_mutex 739 */ 740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 741 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 743 ibd_state_t::id_rx_list.dl_cnt)) 744 745 /* 746 * rc_timeout_lock 747 */ 748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, 749 ibd_state_t::rc_timeout_start)) 750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, 751 ibd_state_t::rc_timeout)) 752 753 754 /* 755 * Items protected by atomic updates 756 */ 757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 758 ibd_state_s::id_brd_rcv 759 ibd_state_s::id_brd_xmt 760 ibd_state_s::id_multi_rcv 761 ibd_state_s::id_multi_xmt 762 ibd_state_s::id_num_intrs 763 ibd_state_s::id_rcv_bytes 764 ibd_state_s::id_rcv_pkt 765 ibd_state_s::id_rx_post_queue_index 766 ibd_state_s::id_tx_short 767 ibd_state_s::id_xmt_bytes 768 ibd_state_s::id_xmt_pkt 769 ibd_state_s::rc_rcv_trans_byte 770 ibd_state_s::rc_rcv_trans_pkt 771 ibd_state_s::rc_rcv_copy_byte 772 ibd_state_s::rc_rcv_copy_pkt 773 ibd_state_s::rc_xmt_bytes 774 ibd_state_s::rc_xmt_small_pkt 775 ibd_state_s::rc_xmt_fragmented_pkt 776 ibd_state_s::rc_xmt_map_fail_pkt 777 ibd_state_s::rc_xmt_map_succ_pkt 778 ibd_rc_chan_s::rcq_invoking)) 779 780 /* 781 * Non-mutex protection schemes for data elements. Almost all of 782 * these are non-shared items. 783 */ 784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 785 callb_cpr 786 ib_gid_s 787 ib_header_info 788 ibd_acache_rq 789 ibd_acache_s::ac_mce 790 ibd_acache_s::ac_chan 791 ibd_mcache::mc_fullreap 792 ibd_mcache::mc_jstate 793 ibd_mcache::mc_req 794 ibd_rwqe_s 795 ibd_swqe_s 796 ibd_wqe_s 797 ibt_wr_ds_s::ds_va 798 ibt_wr_lso_s 799 ipoib_mac::ipoib_qpn 800 mac_capab_lso_s 801 msgb::b_next 802 msgb::b_cont 803 msgb::b_rptr 804 msgb::b_wptr 805 ibd_state_s::id_bgroup_created 806 ibd_state_s::id_mac_state 807 ibd_state_s::id_mtu 808 ibd_state_s::id_ud_num_rwqe 809 ibd_state_s::id_ud_num_swqe 810 ibd_state_s::id_qpnum 811 ibd_state_s::id_rcq_hdl 812 ibd_state_s::id_rx_buf_sz 813 ibd_state_s::id_rx_bufs 814 ibd_state_s::id_rx_mr_hdl 815 ibd_state_s::id_rx_wqes 816 ibd_state_s::id_rxwcs 817 ibd_state_s::id_rxwcs_size 818 ibd_state_s::id_rx_nqueues 819 ibd_state_s::id_rx_queues 820 ibd_state_s::id_scope 821 ibd_state_s::id_scq_hdl 822 ibd_state_s::id_tx_buf_sz 823 ibd_state_s::id_tx_bufs 824 ibd_state_s::id_tx_mr_hdl 825 ibd_state_s::id_tx_rel_list.dl_cnt 826 ibd_state_s::id_tx_wqes 827 ibd_state_s::id_txwcs 828 ibd_state_s::id_txwcs_size 829 ibd_state_s::rc_listen_hdl 830 ibd_state_s::rc_listen_hdl_OFED_interop 831 ibd_state_s::rc_srq_size 832 ibd_state_s::rc_srq_rwqes 833 ibd_state_s::rc_srq_rx_bufs 834 ibd_state_s::rc_srq_rx_mr_hdl 835 ibd_state_s::rc_tx_largebuf_desc_base 836 ibd_state_s::rc_tx_mr_bufs 837 ibd_state_s::rc_tx_mr_hdl 838 ipha_s 839 icmph_s 840 ibt_path_info_s::pi_sid 841 ibd_rc_chan_s::ace 842 ibd_rc_chan_s::chan_hdl 843 ibd_rc_chan_s::state 844 ibd_rc_chan_s::chan_state 845 ibd_rc_chan_s::is_tx_chan 846 ibd_rc_chan_s::rcq_hdl 847 ibd_rc_chan_s::rcq_size 848 ibd_rc_chan_s::scq_hdl 849 ibd_rc_chan_s::scq_size 850 ibd_rc_chan_s::rx_bufs 851 ibd_rc_chan_s::rx_mr_hdl 852 ibd_rc_chan_s::rx_rwqes 853 ibd_rc_chan_s::tx_wqes 854 ibd_rc_chan_s::tx_mr_bufs 855 ibd_rc_chan_s::tx_mr_hdl 856 ibd_rc_chan_s::tx_rel_list.dl_cnt 857 ibd_rc_chan_s::is_used 858 ibd_rc_tx_largebuf_s::lb_buf 859 ibd_rc_msg_hello_s 860 ibt_cm_return_args_s)) 861 862 /* 863 * ibd_rc_chan_s::next is protected by two mutexes: 864 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 865 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
866 */ 867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes", 868 ibd_rc_chan_s::next)) 869 870 /* 871 * ibd_state_s.rc_tx_large_bufs_lock 872 */ 873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 874 ibd_state_s::rc_tx_largebuf_free_head)) 875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 876 ibd_state_s::rc_tx_largebuf_nfree)) 877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 878 ibd_rc_tx_largebuf_s::lb_next)) 879 880 /* 881 * ibd_acache_s.tx_too_big_mutex 882 */ 883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex, 884 ibd_acache_s::tx_too_big_ongoing)) 885 886 /* 887 * tx_wqe_list.dl_mutex 888 */ 889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 890 ibd_rc_chan_s::tx_wqe_list.dl_head)) 891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 892 ibd_rc_chan_s::tx_wqe_list.dl_pending_sends)) 893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 894 ibd_rc_chan_s::tx_wqe_list.dl_cnt)) 895 896 /* 897 * ibd_state_s.rc_ace_recycle_lock 898 */ 899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock, 900 ibd_state_s::rc_ace_recycle)) 901 902 /* 903 * rc_srq_rwqe_list.dl_mutex 904 */ 905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 906 ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding)) 907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 908 ibd_state_t::rc_srq_rwqe_list.dl_cnt)) 909 910 /* 911 * Non-mutex protection schemes for data elements. They are counters 912 * for problem diagnosis and do not need to be protected. 913 */ 914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 915 ibd_state_s::rc_rcv_alloc_fail 916 ibd_state_s::rc_rcq_err 917 ibd_state_s::rc_ace_not_found 918 ibd_state_s::rc_xmt_drop_too_long_pkt 919 ibd_state_s::rc_xmt_icmp_too_long_pkt 920 ibd_state_s::rc_xmt_reenter_too_long_pkt 921 ibd_state_s::rc_swqe_short 922 ibd_state_s::rc_swqe_mac_update 923 ibd_state_s::rc_xmt_buf_short 924 ibd_state_s::rc_xmt_buf_mac_update 925 ibd_state_s::rc_scq_no_swqe 926 ibd_state_s::rc_scq_no_largebuf 927 ibd_state_s::rc_conn_succ 928 ibd_state_s::rc_conn_fail 929 ibd_state_s::rc_null_conn 930 ibd_state_s::rc_no_estab_conn 931 ibd_state_s::rc_act_close 932 ibd_state_s::rc_pas_close 933 ibd_state_s::rc_delay_ace_recycle 934 ibd_state_s::rc_act_close_simultaneous 935 ibd_state_s::rc_act_close_not_clean 936 ibd_state_s::rc_pas_close_rcq_invoking 937 ibd_state_s::rc_reset_cnt 938 ibd_state_s::rc_timeout_act 939 ibd_state_s::rc_timeout_pas 940 ibd_state_s::rc_stop_connect)) 941 942 #ifdef DEBUG 943 /* 944 * Non-mutex protection schemes for data elements. They are counters 945 * for problem diagnosis and do not need to be protected.
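 * (Like the counters above, they are bumped with plain increments,
 * e.g. state->rc_conn_fail++ as done in ibd_async_acache() below; a
 * racy read at worst skews a diagnostic count.)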
946 */ 947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 948 ibd_state_s::rc_rwqe_short 949 ibd_rc_stat_s::rc_rcv_trans_byte 950 ibd_rc_stat_s::rc_rcv_trans_pkt 951 ibd_rc_stat_s::rc_rcv_copy_byte 952 ibd_rc_stat_s::rc_rcv_copy_pkt 953 ibd_rc_stat_s::rc_rcv_alloc_fail 954 ibd_rc_stat_s::rc_rcq_err 955 ibd_rc_stat_s::rc_rwqe_short 956 ibd_rc_stat_s::rc_xmt_bytes 957 ibd_rc_stat_s::rc_xmt_small_pkt 958 ibd_rc_stat_s::rc_xmt_fragmented_pkt 959 ibd_rc_stat_s::rc_xmt_map_fail_pkt 960 ibd_rc_stat_s::rc_xmt_map_succ_pkt 961 ibd_rc_stat_s::rc_ace_not_found 962 ibd_rc_stat_s::rc_scq_no_swqe 963 ibd_rc_stat_s::rc_scq_no_largebuf 964 ibd_rc_stat_s::rc_swqe_short 965 ibd_rc_stat_s::rc_swqe_mac_update 966 ibd_rc_stat_s::rc_xmt_buf_short 967 ibd_rc_stat_s::rc_xmt_buf_mac_update 968 ibd_rc_stat_s::rc_conn_succ 969 ibd_rc_stat_s::rc_conn_fail 970 ibd_rc_stat_s::rc_null_conn 971 ibd_rc_stat_s::rc_no_estab_conn 972 ibd_rc_stat_s::rc_act_close 973 ibd_rc_stat_s::rc_pas_close 974 ibd_rc_stat_s::rc_delay_ace_recycle 975 ibd_rc_stat_s::rc_act_close_simultaneous 976 ibd_rc_stat_s::rc_reset_cnt 977 ibd_rc_stat_s::rc_timeout_act 978 ibd_rc_stat_s::rc_timeout_pas)) 979 #endif 980 981 int 982 _init() 983 { 984 int status; 985 986 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 987 PAGESIZE), 0); 988 if (status != 0) { 989 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 990 return (status); 991 } 992 993 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); 994 995 mac_init_ops(&ibd_dev_ops, "ibp"); 996 status = mod_install(&ibd_modlinkage); 997 if (status != 0) { 998 DPRINT(10, "_init:failed in mod_install()"); 999 ddi_soft_state_fini(&ibd_list); 1000 mac_fini_ops(&ibd_dev_ops); 1001 return (status); 1002 } 1003 1004 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 1005 mutex_enter(&ibd_gstate.ig_mutex); 1006 ibd_gstate.ig_ibt_hdl = NULL; 1007 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 1008 ibd_gstate.ig_service_list = NULL; 1009 mutex_exit(&ibd_gstate.ig_mutex); 1010 1011 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, 1012 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { 1013 return (EIO); 1014 } 1015 1016 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); 1017 1018 #ifdef IBD_LOGGING 1019 ibd_log_init(); 1020 #endif 1021 return (0); 1022 } 1023 1024 int 1025 _info(struct modinfo *modinfop) 1026 { 1027 return (mod_info(&ibd_modlinkage, modinfop)); 1028 } 1029 1030 int 1031 _fini() 1032 { 1033 int status; 1034 1035 status = mod_remove(&ibd_modlinkage); 1036 if (status != 0) 1037 return (status); 1038 1039 ibt_unregister_part_attr_cb(); 1040 1041 mac_fini_ops(&ibd_dev_ops); 1042 mutex_destroy(&ibd_objlist_lock); 1043 ddi_soft_state_fini(&ibd_list); 1044 mutex_destroy(&ibd_gstate.ig_mutex); 1045 #ifdef IBD_LOGGING 1046 ibd_log_fini(); 1047 #endif 1048 return (0); 1049 } 1050 1051 /* 1052 * Convert the GID part of the mac address from network byte order 1053 * to host order. 1054 */ 1055 static void 1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 1057 { 1058 ib_sn_prefix_t nbopref; 1059 ib_guid_t nboguid; 1060 1061 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 1062 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 1063 dgid->gid_prefix = b2h64(nbopref); 1064 dgid->gid_guid = b2h64(nboguid); 1065 } 1066 1067 /* 1068 * Create the IPoIB address in network byte order from host order inputs. 
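 *
 * For example, ibd_get_allroutergroup() below composes the all-routers
 * address with a call of the form (these argument names are
 * illustrative):
 *	ibd_h2n_mac(rmac, IB_MC_QPN, prefix_with_pkey, router_guid);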
1069 */ 1070 static void 1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 1072 ib_guid_t guid) 1073 { 1074 ib_sn_prefix_t nbopref; 1075 ib_guid_t nboguid; 1076 1077 mac->ipoib_qpn = htonl(qpn); 1078 nbopref = h2b64(prefix); 1079 nboguid = h2b64(guid); 1080 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 1081 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 1082 } 1083 1084 /* 1085 * Send to the appropriate all-routers group when the IBA multicast group 1086 * does not exist, based on whether the target group is v4 or v6. 1087 */ 1088 static boolean_t 1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 1090 ipoib_mac_t *rmac) 1091 { 1092 boolean_t retval = B_TRUE; 1093 uint32_t adjscope = state->id_scope << 16; 1094 uint32_t topword; 1095 1096 /* 1097 * Copy the first 4 bytes in without assuming any alignment of 1098 * input mac address; this will have IPoIB signature, flags and 1099 * scope bits. 1100 */ 1101 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 1102 topword = ntohl(topword); 1103 1104 /* 1105 * Generate proper address for IPv4/v6, adding in the Pkey properly. 1106 */ 1107 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 1108 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 1109 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 1110 ((uint32_t)(state->id_pkey << 16))), 1111 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 1112 else 1113 /* 1114 * Does not have proper bits in the mgid address. 1115 */ 1116 retval = B_FALSE; 1117 1118 return (retval); 1119 } 1120 1121 /* 1122 * Membership states for different mcg's are tracked by two lists: 1123 * the "non" list is used for promiscuous mode, when all mcg traffic 1124 * needs to be inspected. This type of membership is never used for 1125 * transmission, so there can not be an AH in the active list 1126 * corresponding to a member in this list. This list does not need 1127 * any protection, since all operations are performed by the async 1128 * thread. 1129 * 1130 * "Full" and "SendOnly" membership is tracked using a single list, 1131 * the "full" list. This is because this single list can then be 1132 * searched during transmit to a multicast group (if an AH for the 1133 * mcg is not found in the active list), since at least one type 1134 * of membership must be present before initiating the transmit. 1135 * This list is also emptied during driver detach, since sendonly 1136 * membership acquired during transmit is dropped at detach time 1137 * along with ipv4 broadcast full membership. Insert/deletes to 1138 * this list are done only by the async thread, but it is also 1139 * searched in program context (see multicast disable case), thus 1140 * the id_mc_mutex protects the list. The driver detach path also 1141 * deconstructs the "full" list, but it ensures that the async 1142 * thread will not be accessing the list (by blocking out mcg 1143 * trap handling and making sure no more Tx reaping will happen). 1144 * 1145 * Currently, an IBA attach is done in the SendOnly case too, 1146 * although this is not required. 
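 *
 * A program-context search of the "full" list therefore takes
 * id_mc_mutex, roughly as in this sketch:
 *	mutex_enter(&state->id_mc_mutex);
 *	mce = IBD_MCACHE_FIND_FULL(state, mgid);
 *	mutex_exit(&state->id_mc_mutex);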
1147 */ 1148 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1149 list_insert_head(&state->id_mc_full, mce) 1150 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1151 list_insert_head(&state->id_mc_non, mce) 1152 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1153 ibd_mcache_find(mgid, &state->id_mc_full) 1154 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1155 ibd_mcache_find(mgid, &state->id_mc_non) 1156 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1157 list_remove(&state->id_mc_full, mce) 1158 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1159 list_remove(&state->id_mc_non, mce) 1160 1161 static void * 1162 list_get_head(list_t *list) 1163 { 1164 list_node_t *lhead = list_head(list); 1165 1166 if (lhead != NULL) 1167 list_remove(list, lhead); 1168 return (lhead); 1169 } 1170 1171 /* 1172 * This is always guaranteed to be able to queue the work. 1173 */ 1174 void 1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1176 { 1177 /* Initialize request */ 1178 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1179 ptr->rq_op = op; 1180 1181 /* 1182 * Queue provided slot onto request pool. 1183 */ 1184 mutex_enter(&state->id_acache_req_lock); 1185 list_insert_tail(&state->id_req_list, ptr); 1186 1187 /* Go, fetch, async thread */ 1188 cv_signal(&state->id_acache_req_cv); 1189 mutex_exit(&state->id_acache_req_lock); 1190 } 1191 1192 /* 1193 * Main body of the per interface async thread. 1194 */ 1195 static void 1196 ibd_async_work(ibd_state_t *state) 1197 { 1198 ibd_req_t *ptr; 1199 callb_cpr_t cprinfo; 1200 1201 mutex_enter(&state->id_acache_req_lock); 1202 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1203 callb_generic_cpr, "ibd_async_work"); 1204 1205 for (;;) { 1206 ptr = list_get_head(&state->id_req_list); 1207 if (ptr != NULL) { 1208 mutex_exit(&state->id_acache_req_lock); 1209 1210 /* 1211 * If we are in late hca initialization mode, do not 1212 * process any async request other than TRAP. TRAP 1213 * is used for indicating creation of a broadcast group, 1214 * in which case we need to join/create the group. 1215 */ 1216 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 1217 (ptr->rq_op != IBD_ASYNC_TRAP)) { 1218 goto free_req_and_continue; 1219 } 1220 1221 /* 1222 * Once we have done the operation, there is no 1223 * guarantee the request slot is going to be valid; 1224 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1225 * TRAP). 1226 * 1227 * Perform the request. 1228 */ 1229 switch (ptr->rq_op) { 1230 case IBD_ASYNC_GETAH: 1231 ibd_async_acache(state, &ptr->rq_mac); 1232 break; 1233 case IBD_ASYNC_JOIN: 1234 case IBD_ASYNC_LEAVE: 1235 ibd_async_multicast(state, 1236 ptr->rq_gid, ptr->rq_op); 1237 break; 1238 case IBD_ASYNC_PROMON: 1239 ibd_async_setprom(state); 1240 break; 1241 case IBD_ASYNC_PROMOFF: 1242 ibd_async_unsetprom(state); 1243 break; 1244 case IBD_ASYNC_REAP: 1245 ibd_async_reap_group(state, 1246 ptr->rq_ptr, ptr->rq_gid, 1247 IB_MC_JSTATE_FULL); 1248 /* 1249 * the req buf is contained in the mce 1250 * structure, so we do not need 1251 * to free it here.
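 * Clearing ptr below makes the
 * free_req_and_continue label skip
 * the kmem_cache_free() call.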
1252 */ 1253 ptr = NULL; 1254 break; 1255 case IBD_ASYNC_TRAP: 1256 ibd_async_trap(state, ptr); 1257 break; 1258 case IBD_ASYNC_SCHED: 1259 ibd_async_txsched(state); 1260 break; 1261 case IBD_ASYNC_LINK: 1262 ibd_async_link(state, ptr); 1263 break; 1264 case IBD_ASYNC_EXIT: 1265 mutex_enter(&state->id_acache_req_lock); 1266 #ifndef __lock_lint 1267 CALLB_CPR_EXIT(&cprinfo); 1268 #else 1269 mutex_exit(&state->id_acache_req_lock); 1270 #endif 1271 return; 1272 case IBD_ASYNC_RC_TOO_BIG: 1273 ibd_async_rc_process_too_big(state, 1274 ptr); 1275 break; 1276 case IBD_ASYNC_RC_CLOSE_ACT_CHAN: 1277 ibd_async_rc_close_act_chan(state, ptr); 1278 break; 1279 case IBD_ASYNC_RC_RECYCLE_ACE: 1280 ibd_async_rc_recycle_ace(state, ptr); 1281 break; 1282 case IBD_ASYNC_RC_CLOSE_PAS_CHAN: 1283 (void) ibd_rc_pas_close(ptr->rq_ptr, 1284 B_TRUE, B_TRUE); 1285 break; 1286 } 1287 free_req_and_continue: 1288 if (ptr != NULL) 1289 kmem_cache_free(state->id_req_kmc, ptr); 1290 1291 mutex_enter(&state->id_acache_req_lock); 1292 } else { 1293 #ifndef __lock_lint 1294 /* 1295 * Nothing to do: wait till a new request arrives. 1296 */ 1297 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1298 cv_wait(&state->id_acache_req_cv, 1299 &state->id_acache_req_lock); 1300 CALLB_CPR_SAFE_END(&cprinfo, 1301 &state->id_acache_req_lock); 1302 #endif 1303 } 1304 } 1305 1306 /*NOTREACHED*/ 1307 _NOTE(NOT_REACHED) 1308 } 1309 1310 /* 1311 * Return when it is safe to queue requests to the async daemon; primarily 1312 * for subnet trap and async event handling. Disallow requests before the 1313 * daemon is created, and when interface deinitialization starts. 1314 */ 1315 static boolean_t 1316 ibd_async_safe(ibd_state_t *state) 1317 { 1318 mutex_enter(&state->id_trap_lock); 1319 if (state->id_trap_stop) { 1320 mutex_exit(&state->id_trap_lock); 1321 return (B_FALSE); 1322 } 1323 state->id_trap_inprog++; 1324 mutex_exit(&state->id_trap_lock); 1325 return (B_TRUE); 1326 } 1327 1328 /* 1329 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1330 * trap or event handling to complete to kill the async thread and deconstruct 1331 * the mcg/ace list. 1332 */ 1333 static void 1334 ibd_async_done(ibd_state_t *state) 1335 { 1336 mutex_enter(&state->id_trap_lock); 1337 if (--state->id_trap_inprog == 0) 1338 cv_signal(&state->id_trap_cv); 1339 mutex_exit(&state->id_trap_lock); 1340 } 1341 1342 /* 1343 * Hash functions: 1344 * ibd_hash_by_id: Returns the qpn as the hash entry into the bucket. 1345 * ibd_hash_key_cmp: Compares two keys; returns 0 on a match, else 1. 1346 * These operate on mac addresses input into ibd_send, but there is no 1347 * guarantee on the alignment of the ipoib_mac_t structure. 1348 */ 1349 /*ARGSUSED*/ 1350 static uint_t 1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1352 { 1353 ulong_t ptraddr = (ulong_t)key; 1354 uint_t hval; 1355 1356 /* 1357 * If the input address is 4 byte aligned, we can just dereference 1358 * it. This is most common, since IP will send in a 4 byte aligned 1359 * IP header, which implies the 24 byte IPoIB pseudo header will be 1360 * 4 byte aligned too.
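 * In the unaligned case the qpn is bcopy()ed into a local variable
 * rather than dereferenced through a possibly misaligned pointer, as
 * the fallback below shows.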
1361 */ 1362 if ((ptraddr & 3) == 0) 1363 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1364 1365 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1366 return (hval); 1367 } 1368 1369 static int 1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1371 { 1372 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1373 return (0); 1374 else 1375 return (1); 1376 } 1377 1378 /* 1379 * Initialize all the per interface caches and lists; AH cache, 1380 * MCG list etc. 1381 */ 1382 static int 1383 ibd_acache_init(ibd_state_t *state) 1384 { 1385 ibd_ace_t *ce; 1386 int i; 1387 1388 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1389 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1390 mutex_enter(&state->id_ac_mutex); 1391 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1392 offsetof(ibd_ace_t, ac_list)); 1393 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1394 offsetof(ibd_ace_t, ac_list)); 1395 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1396 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, 1397 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1398 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1399 offsetof(ibd_mce_t, mc_list)); 1400 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1401 offsetof(ibd_mce_t, mc_list)); 1402 state->id_ac_hot_ace = NULL; 1403 1404 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1405 state->id_num_ah, KM_SLEEP); 1406 for (i = 0; i < state->id_num_ah; i++, ce++) { 1407 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1408 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1409 mutex_exit(&state->id_ac_mutex); 1410 ibd_acache_fini(state); 1411 return (DDI_FAILURE); 1412 } else { 1413 CLEAR_REFCYCLE(ce); 1414 ce->ac_mce = NULL; 1415 mutex_init(&ce->tx_too_big_mutex, NULL, 1416 MUTEX_DRIVER, NULL); 1417 IBD_ACACHE_INSERT_FREE(state, ce); 1418 } 1419 } 1420 mutex_exit(&state->id_ac_mutex); 1421 return (DDI_SUCCESS); 1422 } 1423 1424 static void 1425 ibd_acache_fini(ibd_state_t *state) 1426 { 1427 ibd_ace_t *ptr; 1428 1429 mutex_enter(&state->id_ac_mutex); 1430 1431 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1432 ASSERT(GET_REF(ptr) == 0); 1433 mutex_destroy(&ptr->tx_too_big_mutex); 1434 (void) ibt_free_ud_dest(ptr->ac_dest); 1435 } 1436 1437 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1438 ASSERT(GET_REF(ptr) == 0); 1439 mutex_destroy(&ptr->tx_too_big_mutex); 1440 (void) ibt_free_ud_dest(ptr->ac_dest); 1441 } 1442 1443 list_destroy(&state->id_ah_free); 1444 list_destroy(&state->id_ah_active); 1445 list_destroy(&state->id_mc_full); 1446 list_destroy(&state->id_mc_non); 1447 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); 1448 mutex_exit(&state->id_ac_mutex); 1449 mutex_destroy(&state->id_ac_mutex); 1450 mutex_destroy(&state->id_mc_mutex); 1451 } 1452 1453 /* 1454 * Search AH active hash list for a cached path to input destination. 1455 * If we are "just looking", hold == F. When we are in the Tx path, 1456 * we set hold == T to grab a reference on the AH so that it can not 1457 * be recycled to a new destination while the Tx request is posted. 1458 */ 1459 ibd_ace_t * 1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1461 { 1462 ibd_ace_t *ptr; 1463 1464 ASSERT(mutex_owned(&state->id_ac_mutex)); 1465 1466 /* 1467 * Do hash search. 
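 * mod_hash_find() returns 0 on a hit, with ptr then referring to the
 * cached ibd_ace_t.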
1468 */ 1469 if (mod_hash_find(state->id_ah_active_hash, 1470 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1471 if (hold) 1472 INC_REF(ptr, num); 1473 return (ptr); 1474 } 1475 return (NULL); 1476 } 1477 1478 /* 1479 * This is called by the tx side; if an initialized AH is found in 1480 * the active list, it is locked down and can be used; if no entry 1481 * is found, an async request is queued to do path resolution. 1482 */ 1483 static ibd_ace_t * 1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1485 { 1486 ibd_ace_t *ptr; 1487 ibd_req_t *req; 1488 1489 /* 1490 * Only attempt to print when we can; in the mdt pattr case, the 1491 * address is not aligned properly. 1492 */ 1493 if (((ulong_t)mac & 3) == 0) { 1494 DPRINT(4, 1495 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1496 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1497 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1498 htonl(mac->ipoib_gidsuff[1])); 1499 } 1500 1501 mutex_enter(&state->id_ac_mutex); 1502 1503 if (((ptr = state->id_ac_hot_ace) != NULL) && 1504 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1505 INC_REF(ptr, numwqe); 1506 mutex_exit(&state->id_ac_mutex); 1507 return (ptr); 1508 } 1509 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1510 state->id_ac_hot_ace = ptr; 1511 mutex_exit(&state->id_ac_mutex); 1512 return (ptr); 1513 } 1514 1515 /* 1516 * Implementation of a single outstanding async request; if 1517 * the operation is not started yet, queue a request and move 1518 * to ongoing state. Remember in id_ah_addr for which address 1519 * we are queueing the request, in case we need to flag an error; 1520 * any further requests, for the same or a different address, until 1521 * the operation completes, are sent back to GLDv3 to be retried. 1522 * The async thread will update id_ah_op with an error indication 1523 * or will set it to indicate the next lookup can start; either 1524 * way, it will mac_tx_update() so that all blocked requests come 1525 * back here. 1526 */ 1527 *err = EAGAIN; 1528 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1529 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1530 if (req != NULL) { 1531 /* 1532 * We did not even find the entry; queue a request 1533 * for it. 1534 */ 1535 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1536 state->id_ah_op = IBD_OP_ONGOING; 1537 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1538 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1539 } 1540 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1541 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1542 /* 1543 * Check the status of the pathrecord lookup request 1544 * we had queued before. 1545 */ 1546 if (state->id_ah_op == IBD_OP_ERRORED) { 1547 *err = EFAULT; 1548 state->id_ah_error++; 1549 } else { 1550 /* 1551 * IBD_OP_ROUTERED case: We need to send to the 1552 * all-router MCG. If we can find the AH for 1553 * the mcg, the Tx will be attempted. If we 1554 * do not find the AH, we return NORESOURCES 1555 * to retry. 1556 */ 1557 ipoib_mac_t routermac; 1558 1559 (void) ibd_get_allroutergroup(state, mac, &routermac); 1560 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1561 numwqe); 1562 } 1563 state->id_ah_op = IBD_OP_NOTSTARTED; 1564 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1565 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1566 /* 1567 * This case can happen when we get a higher band 1568 * packet.
The easiest way is to reset the state machine 1569 * to accommodate the higher priority packet. 1570 */ 1571 state->id_ah_op = IBD_OP_NOTSTARTED; 1572 } 1573 mutex_exit(&state->id_ac_mutex); 1574 1575 return (ptr); 1576 } 1577 1578 /* 1579 * Grab a not-currently-in-use AH/PathRecord from the active 1580 * list to recycle to a new destination. Only the async thread 1581 * executes this code. 1582 */ 1583 static ibd_ace_t * 1584 ibd_acache_get_unref(ibd_state_t *state) 1585 { 1586 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1587 boolean_t try_rc_chan_recycle = B_FALSE; 1588 1589 ASSERT(mutex_owned(&state->id_ac_mutex)); 1590 1591 /* 1592 * Do plain linear search. 1593 */ 1594 while (ptr != NULL) { 1595 /* 1596 * Note that it is possible that the "cycle" bit 1597 * is set on the AH w/o any reference count. The 1598 * mcg must have been deleted, and the tx cleanup 1599 * just decremented the reference count to 0, but 1600 * hasn't gotten around to grabbing the id_ac_mutex 1601 * to move the AH into the free list. 1602 */ 1603 if (GET_REF(ptr) == 0) { 1604 if (ptr->ac_chan != NULL) { 1605 ASSERT(state->id_enable_rc == B_TRUE); 1606 if (!try_rc_chan_recycle) { 1607 try_rc_chan_recycle = B_TRUE; 1608 ibd_rc_signal_ace_recycle(state, ptr); 1609 } 1610 } else { 1611 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1612 break; 1613 } 1614 } 1615 ptr = list_prev(&state->id_ah_active, ptr); 1616 } 1617 return (ptr); 1618 } 1619 1620 /* 1621 * Invoked to clean up AH from active list in case of multicast 1622 * disable and to handle sendonly memberships during mcg traps. 1623 * And for port up processing for multicast and unicast AHs. 1624 * Normally, the AH is taken off the active list, and put into 1625 * the free list to be recycled for a new destination. In case 1626 * Tx requests on the AH have not completed yet, the AH is marked 1627 * for reaping (which will put the AH on the free list) once the Tx's 1628 * complete; in this case, depending on the "force" input, we take 1629 * out the AH from the active list right now, or leave it also for 1630 * the reap operation. Returns TRUE if the AH is taken off the active 1631 * list (and either put into the free list right now, or arranged for 1632 * later), FALSE otherwise. 1633 */ 1634 boolean_t 1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1636 { 1637 ibd_ace_t *acactive; 1638 boolean_t ret = B_TRUE; 1639 1640 ASSERT(mutex_owned(&state->id_ac_mutex)); 1641 1642 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1643 1644 /* 1645 * Note that the AH might already have the cycle bit set 1646 * on it; this might happen if sequences of multicast 1647 * enables and disables are coming so fast, that posted 1648 * Tx's to the mcg have not completed yet, and the cycle 1649 * bit is set successively by each multicast disable. 1650 */ 1651 if (SET_CYCLE_IF_REF(acactive)) { 1652 if (!force) { 1653 /* 1654 * The ace is kept on the active list, further 1655 * Tx's can still grab a reference on it; the 1656 * ace is reaped when all pending Tx's 1657 * referencing the AH complete. 1658 */ 1659 ret = B_FALSE; 1660 } else { 1661 /* 1662 * In the mcg trap case, we always pull the 1663 * AH from the active list. And also the port 1664 * up multi/unicast case. 
1665 */ 1666 ASSERT(acactive->ac_chan == NULL); 1667 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1668 acactive->ac_mce = NULL; 1669 } 1670 } else { 1671 /* 1672 * Determined the ref count is 0, thus reclaim 1673 * immediately after pulling out the ace from 1674 * the active list. 1675 */ 1676 ASSERT(acactive->ac_chan == NULL); 1677 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1678 acactive->ac_mce = NULL; 1679 IBD_ACACHE_INSERT_FREE(state, acactive); 1680 } 1681 1682 } 1683 return (ret); 1684 } 1685 1686 /* 1687 * Helper function for async path record lookup. If we are trying to 1688 * Tx to a MCG, check our membership, possibly trying to join the 1689 * group if required. If that fails, try to send the packet to the 1690 * all router group (indicated by the redirect output), pointing 1691 * the input mac address to the router mcg address. 1692 */ 1693 static ibd_mce_t * 1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1695 { 1696 ib_gid_t mgid; 1697 ibd_mce_t *mce; 1698 ipoib_mac_t routermac; 1699 1700 *redirect = B_FALSE; 1701 ibd_n2h_gid(mac, &mgid); 1702 1703 /* 1704 * Check the FullMember+SendOnlyNonMember list. 1705 * Since we are the only one who manipulates the 1706 * id_mc_full list, no locks are needed. 1707 */ 1708 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1709 if (mce != NULL) { 1710 DPRINT(4, "ibd_async_mcache : already joined to group"); 1711 return (mce); 1712 } 1713 1714 /* 1715 * Not found; try to join(SendOnlyNonMember) and attach. 1716 */ 1717 DPRINT(4, "ibd_async_mcache : not joined to group"); 1718 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1719 NULL) { 1720 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1721 return (mce); 1722 } 1723 1724 /* 1725 * MCGroup not present; try to join the all-router group. If 1726 * any of the following steps succeed, we will be redirecting 1727 * to the all router group. 1728 */ 1729 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1730 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1731 return (NULL); 1732 *redirect = B_TRUE; 1733 ibd_n2h_gid(&routermac, &mgid); 1734 bcopy(&routermac, mac, IPOIB_ADDRL); 1735 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1736 mgid.gid_prefix, mgid.gid_guid); 1737 1738 /* 1739 * Are we already joined to the router group? 1740 */ 1741 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1742 DPRINT(4, "ibd_async_mcache : using already joined router" 1743 "group\n"); 1744 return (mce); 1745 } 1746 1747 /* 1748 * Can we join(SendOnlyNonMember) the router group? 1749 */ 1750 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1751 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1752 NULL) { 1753 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1754 return (mce); 1755 } 1756 1757 return (NULL); 1758 } 1759 1760 /* 1761 * Async path record lookup code. 1762 */ 1763 static void 1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1765 { 1766 ibd_ace_t *ce; 1767 ibd_mce_t *mce = NULL; 1768 ibt_path_attr_t path_attr; 1769 ibt_path_info_t path_info; 1770 ib_gid_t destgid; 1771 char ret = IBD_OP_NOTSTARTED; 1772 1773 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1774 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1775 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1776 htonl(mac->ipoib_gidsuff[1])); 1777 1778 /* 1779 * Check whether we are trying to transmit to a MCG. 1780 * In that case, we need to make sure we are a member of 1781 * the MCG. 
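 * A multicast destination is recognized by the reserved QPN in the
 * IPoIB pseudo header, i.e. the (mac->ipoib_qpn == htonl(IB_MC_QPN))
 * test below.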
/*
 * Helper function for async path record lookup. If we are trying to
 * Tx to a MCG, check our membership, possibly trying to join the
 * group if required. If that fails, try to send the packet to the
 * all router group (indicated by the redirect output), pointing
 * the input mac address to the router mcg address.
 */
static ibd_mce_t *
ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
{
    ib_gid_t mgid;
    ibd_mce_t *mce;
    ipoib_mac_t routermac;

    *redirect = B_FALSE;
    ibd_n2h_gid(mac, &mgid);

    /*
     * Check the FullMember+SendOnlyNonMember list.
     * Since we are the only one who manipulates the
     * id_mc_full list, no locks are needed.
     */
    mce = IBD_MCACHE_FIND_FULL(state, mgid);
    if (mce != NULL) {
        DPRINT(4, "ibd_async_mcache : already joined to group");
        return (mce);
    }

    /*
     * Not found; try to join(SendOnlyNonMember) and attach.
     */
    DPRINT(4, "ibd_async_mcache : not joined to group");
    if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
        NULL) {
        DPRINT(4, "ibd_async_mcache : nonmem joined to group");
        return (mce);
    }

    /*
     * MCGroup not present; try to join the all-router group. If
     * any of the following steps succeed, we will be redirecting
     * to the all router group.
     */
    DPRINT(4, "ibd_async_mcache : nonmem join failed");
    if (!ibd_get_allroutergroup(state, mac, &routermac))
        return (NULL);
    *redirect = B_TRUE;
    ibd_n2h_gid(&routermac, &mgid);
    bcopy(&routermac, mac, IPOIB_ADDRL);
    DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
        mgid.gid_prefix, mgid.gid_guid);

    /*
     * Are we already joined to the router group?
     */
    if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
        DPRINT(4, "ibd_async_mcache : using already joined router "
            "group\n");
        return (mce);
    }

    /*
     * Can we join(SendOnlyNonMember) the router group?
     */
    DPRINT(4, "ibd_async_mcache : attempting join to router grp");
    if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
        NULL) {
        DPRINT(4, "ibd_async_mcache : joined to router grp");
        return (mce);
    }

    return (NULL);
}
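/*
 * Note on the redirect path above: on redirect the caller's mac is
 * overwritten in place (the bcopy of routermac), so when the sender
 * comes back through the address cache lookup it resolves the
 * all-router group's address rather than the original destination.
 */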
/*
 * Async path record lookup code.
 */
static void
ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
{
    ibd_ace_t *ce;
    ibd_mce_t *mce = NULL;
    ibt_path_attr_t path_attr;
    ibt_path_info_t path_info;
    ib_gid_t destgid;
    char ret = IBD_OP_NOTSTARTED;

    DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
        htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
        htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
        htonl(mac->ipoib_gidsuff[1]));

    /*
     * Check whether we are trying to transmit to a MCG.
     * In that case, we need to make sure we are a member of
     * the MCG.
     */
    if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
        boolean_t redirected;

        /*
         * If we cannot find or join the group or even
         * redirect, error out.
         */
        if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
            NULL) {
            state->id_ah_op = IBD_OP_ERRORED;
            return;
        }

        /*
         * If we got redirected, we need to determine whether
         * the AH for the new mcg is in the cache already, and
         * if so, not pull it in again; otherwise proceed to get the
         * path for the new mcg. There is no guarantee that
         * if the AH is currently in the cache, it will still be
         * there when we look in ibd_acache_lookup(), but that's
         * okay, we will come back here.
         */
        if (redirected) {
            ret = IBD_OP_ROUTERED;
            DPRINT(4, "ibd_async_acache : redirected to "
                "%08X:%08X:%08X:%08X:%08X",
                htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
                htonl(mac->ipoib_gidpref[1]),
                htonl(mac->ipoib_gidsuff[0]),
                htonl(mac->ipoib_gidsuff[1]));

            mutex_enter(&state->id_ac_mutex);
            if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
                state->id_ah_op = IBD_OP_ROUTERED;
                mutex_exit(&state->id_ac_mutex);
                DPRINT(4, "ibd_async_acache : router AH found");
                return;
            }
            mutex_exit(&state->id_ac_mutex);
        }
    }

    /*
     * Get an AH from the free list.
     */
    mutex_enter(&state->id_ac_mutex);
    if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
        /*
         * No free ones; try to grab an unreferenced active
         * one. Maybe we need to make the active list LRU,
         * but that will create more work for Tx callbacks.
         * Is there a way of not having to pull out the
         * entry from the active list, but just indicate it
         * is being recycled? Yes, but that creates one more
         * check in the fast lookup path.
         */
        if ((ce = ibd_acache_get_unref(state)) == NULL) {
            /*
             * Pretty serious shortage now.
             */
            state->id_ah_op = IBD_OP_NOTSTARTED;
            mutex_exit(&state->id_ac_mutex);
            DPRINT(10, "ibd_async_acache : failed to find AH "
                "slot\n");
            return;
        }
        /*
         * We could check whether ac_mce points to a SendOnly
         * member and drop that membership now. Or do it lazily
         * at detach time.
         */
        ce->ac_mce = NULL;
    }
    mutex_exit(&state->id_ac_mutex);
    ASSERT(ce->ac_mce == NULL);

    /*
     * Update the entry.
     */
    bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);

    bzero(&path_info, sizeof (path_info));
    bzero(&path_attr, sizeof (ibt_path_attr_t));
    path_attr.pa_sgid = state->id_sgid;
    path_attr.pa_num_dgids = 1;
    ibd_n2h_gid(&ce->ac_mac, &destgid);
    path_attr.pa_dgids = &destgid;
    path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
    path_attr.pa_pkey = state->id_pkey;
    if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
        &path_info, NULL) != IBT_SUCCESS) {
        DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
        goto error;
    }
    if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
        ntohl(ce->ac_mac.ipoib_qpn),
        &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
        DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
        goto error;
    }

    /*
     * mce is set whenever an AH is being associated with a
     * MCG; this will come in handy when we leave the MCG. The
     * lock protects Tx fastpath from scanning the active list.
     */
    if (mce != NULL)
        ce->ac_mce = mce;

    /*
     * Initiate an RC mode connection for a unicast address.
     */
    if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
        (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
        ASSERT(ce->ac_chan == NULL);
        DPRINT(10, "ibd_async_acache: call "
            "ibd_rc_try_connect(ace=%p)", ce);
        ibd_rc_try_connect(state, ce, &path_info);
        if (ce->ac_chan == NULL) {
            DPRINT(10, "ibd_async_acache: fail to setup RC"
                " channel");
            state->rc_conn_fail++;
            goto error;
        }
    }

    mutex_enter(&state->id_ac_mutex);
    IBD_ACACHE_INSERT_ACTIVE(state, ce);
    state->id_ah_op = ret;
    mutex_exit(&state->id_ac_mutex);
    return;
error:
    /*
     * We might want to drop SendOnly membership here if we
     * joined above. The lock protects Tx callbacks inserting
     * into the free list.
     */
    mutex_enter(&state->id_ac_mutex);
    state->id_ah_op = IBD_OP_ERRORED;
    IBD_ACACHE_INSERT_FREE(state, ce);
    mutex_exit(&state->id_ac_mutex);
}
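/*
 * The id_ah_op values stored above drive the Tx fastpath's retry
 * behavior: IBD_OP_NOTSTARTED allows a subsequent lookup to restart
 * the async resolution, IBD_OP_ROUTERED records that the packet was
 * redirected to the all-router group, and IBD_OP_ERRORED leaves the
 * lookup to report failure. (A summary of the assignments made in
 * this function; the consuming state machine lives with the
 * ibd_acache_lookup() code.)
 */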
/*
 * While restoring port's presence on the subnet on a port up, it is
 * possible that the port goes down again.
 */
static void
ibd_async_link(ibd_state_t *state, ibd_req_t *req)
{
    ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
    link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
        LINK_STATE_UP;
    ibd_mce_t *mce, *pmce;
    ibd_ace_t *ace, *pace;

    DPRINT(10, "ibd_async_link(): %d", opcode);

    /*
     * On a link up, revalidate the link speed/width. No point doing
     * this on a link down, since we will be unable to do SA operations
     * and would default to the lowest speed. Also notice that we update
     * our notion of speed before calling mac_link_update(), which will
     * do the necessary higher level notifications for speed changes.
     */
    if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
        state->id_link_speed = ibd_get_portspeed(state);
        _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
    }

    /*
     * Do all the work required to establish our presence on
     * the subnet.
     */
    if (opcode == IBD_LINK_UP_ABSENT) {
        /*
         * If in promiscuous mode ...
         */
        if (state->id_prom_op == IBD_OP_COMPLETED) {
            /*
             * Drop all nonmembership.
             */
            ibd_async_unsetprom(state);

            /*
             * Then, try to regain nonmembership to all mcg's.
             */
            ibd_async_setprom(state);

        }

        /*
         * Drop all sendonly membership (which also gets rid of the
         * AHs); try to reacquire all full membership.
         */
        mce = list_head(&state->id_mc_full);
        while ((pmce = mce) != NULL) {
            mce = list_next(&state->id_mc_full, mce);
            if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
                ibd_leave_group(state,
                    pmce->mc_info.mc_adds_vect.av_dgid,
                    IB_MC_JSTATE_SEND_ONLY_NON);
            else
                ibd_reacquire_group(state, pmce);
        }

        /*
         * Recycle all active AHs to free list (and if there are
         * pending posts, make sure they will go into the free list
         * once the Tx's complete). Grab the lock to prevent
         * concurrent Tx's as well as Tx cleanups.
         */
        mutex_enter(&state->id_ac_mutex);
        ace = list_head(&state->id_ah_active);
        while ((pace = ace) != NULL) {
            boolean_t cycled = B_TRUE; /* RC path does no recycle */

            ace = list_next(&state->id_ah_active, ace);
            mce = pace->ac_mce;
            if (pace->ac_chan != NULL) {
                ASSERT(mce == NULL);
                ASSERT(state->id_enable_rc == B_TRUE);
                if (pace->ac_chan->chan_state ==
                    IBD_RC_STATE_ACT_ESTAB) {
                    INC_REF(pace, 1);
                    IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
                    pace->ac_chan->chan_state =
                        IBD_RC_STATE_ACT_CLOSING;
                    ibd_rc_signal_act_close(state, pace);
                } else {
                    state->rc_act_close_simultaneous++;
                    DPRINT(40, "ibd_async_link: other "
                        "thread is closing it, ace=%p, "
                        "ac_chan=%p, chan_state=%d",
                        pace, pace->ac_chan,
                        pace->ac_chan->chan_state);
                }
            } else {
                cycled = ibd_acache_recycle(state,
                    &pace->ac_mac, B_TRUE);
            }
            /*
             * If this is for an mcg, it must be for a fullmember,
             * since we got rid of send-only members above when
             * processing the mce list.
             */
            ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
                IB_MC_JSTATE_FULL)));

            /*
             * Check if the fullmember mce needs to be torn down,
             * i.e., whether the DLPI disable has already been done.
             * If so, do some of the work of tx_cleanup, namely
             * causing leave (which will fail), detach and
             * mce-freeing. tx_cleanup will put the AH into free
             * list. The reason to duplicate some of this
             * tx_cleanup work is because we want to delete the
             * AH right now instead of waiting for tx_cleanup, to
             * force subsequent Tx's to reacquire an AH.
             */
            if ((mce != NULL) && (mce->mc_fullreap))
                ibd_async_reap_group(state, mce,
                    mce->mc_info.mc_adds_vect.av_dgid,
                    mce->mc_jstate);
        }
        mutex_exit(&state->id_ac_mutex);
    }

    /*
     * mac handle is guaranteed to exist since driver does ibt_close_hca()
     * (which stops further events from being delivered) before
     * mac_unregister(). At this point, it is guaranteed that mac_register
     * has already been done.
     */
    mutex_enter(&state->id_link_mutex);
    state->id_link_state = lstate;
    mac_link_update(state->id_mh, lstate);
    mutex_exit(&state->id_link_mutex);

    ibd_async_done(state);
}

/*
 * Check the pkey table to see if we can find the pkey we're looking for.
 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
 * failure.
 */
static int
ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
    uint16_t *pkix)
{
    uint16_t ndx;

    ASSERT(pkix != NULL);

    for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
        if (pkey_tbl[ndx] == pkey) {
            *pkix = ndx;
            return (0);
        }
    }
    return (-1);
}
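/*
 * For illustration, the typical use of ibd_locate_pkey() (as in
 * ibd_link_mod() below) is to detect pkey relocation:
 *
 *     uint16_t pkix;
 *
 *     if (ibd_locate_pkey(port_infop->p_pkey_tbl,
 *         port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
 *         ... pkey still present, possibly at a new index pkix ...
 *     }
 */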
/*
 * Late HCA Initialization:
 * If plumb had succeeded without the availability of an active port or the
 * pkey, and either of their availability is now being indicated via PORT_UP
 * or PORT_CHANGE respectively, try a start of the interface.
 *
 * Normal Operation:
 * When the link is notified up, we need to do a few things, based
 * on the port's current p_init_type_reply claiming a reinit has been
 * done or not. The reinit steps are:
 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
 *    the old Pkey and GID0 are correct.
 * 2. Register for mcg traps (already done by ibmf).
 * 3. If PreservePresenceReply indicates the SM has restored port's presence
 *    in subnet, nothing more to do. Else go to next steps (on async daemon).
 * 4. Give up all sendonly memberships.
 * 5. Acquire all full memberships.
 * 6. In promiscuous mode, acquire all non memberships.
 * 7. Recycle all AHs to free list.
 */
static void
ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
{
    ibt_hca_portinfo_t *port_infop = NULL;
    ibt_status_t ibt_status;
    uint_t psize, port_infosz;
    ibd_link_op_t opcode;
    ibd_req_t *req;
    link_state_t new_link_state = LINK_STATE_UP;
    uint8_t itreply;
    uint16_t pkix;
    int ret;

    /*
     * Let's not race with a plumb or an unplumb; if we detect a
     * pkey relocation event later on here, we may have to restart.
     */
    ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);

    mutex_enter(&state->id_link_mutex);

    /*
     * If the link state is unknown, a plumb has not yet been attempted
     * on the interface. Nothing to do.
     */
    if (state->id_link_state == LINK_STATE_UNKNOWN) {
        mutex_exit(&state->id_link_mutex);
        goto link_mod_return;
    }

    /*
     * If the link state is down due to a plumb failure (we are not in
     * late HCA init and were never successfully started), nothing to do.
     */
    if ((state->id_link_state == LINK_STATE_DOWN) &&
        ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
        ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
        mutex_exit(&state->id_link_mutex);
        goto link_mod_return;
    }

    /*
     * If this routine was called in response to a port down event,
     * we just need to see whether this state change should be reported.
     */
    if (code == IBT_ERROR_PORT_DOWN) {
        new_link_state = LINK_STATE_DOWN;
        goto update_link_state;
    }

    /*
     * If it's not a port down event we've received, try to get the port
     * attributes first. If we fail here, the port is as good as down.
     * Otherwise, if the link went down by the time the handler gets
     * here, give up - we cannot even validate the pkey/gid since those
     * are not valid and this is as bad as a port down anyway.
     */
    ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
        &port_infop, &psize, &port_infosz);
    if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
        (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
        new_link_state = LINK_STATE_DOWN;
        goto update_link_state;
    }

    /*
     * If in the previous attempt, the pkey was not found either due to the
     * port state being down, or due to its absence in the pkey table,
     * look for it now and try to start the interface.
     */
    if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
        mutex_exit(&state->id_link_mutex);
        if ((ret = ibd_start(state)) != 0) {
            DPRINT(10, "ibd_linkmod: cannot start from late HCA "
                "init, ret=%d", ret);
        }
        ibt_free_portinfo(port_infop, port_infosz);
        goto link_mod_return;
    }

    /*
     * Check the SM InitTypeReply flags. If both NoLoadReply and
     * PreserveContentReply are 0, we don't know anything about the
     * data loaded into the port attributes, so we need to verify
     * if gid0 and pkey are still valid.
     */
    itreply = port_infop->p_init_type_reply;
    if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
        ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
        /*
         * Check to see if the subnet part of GID0 has changed. If
         * not, check the simple case first to see if the pkey
         * index is the same as before; finally check to see if the
         * pkey has been relocated to a different index in the table.
         */
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
        if (bcmp(port_infop->p_sgid_tbl,
            &state->id_sgid, sizeof (ib_gid_t)) != 0) {

            new_link_state = LINK_STATE_DOWN;

        } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
            state->id_pkey) {

            new_link_state = LINK_STATE_UP;

        } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
            port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {

            ibt_free_portinfo(port_infop, port_infosz);
            mutex_exit(&state->id_link_mutex);

            /*
             * Currently a restart is required if our pkey has moved
             * in the pkey table. If we get the ibt_recycle_ud() to
             * work as documented (expected), we may be able to
             * avoid a complete restart. Note that we've already
             * marked both the start and stop 'in-progress' flags,
             * so it is ok to go ahead and do this restart.
             */
            (void) ibd_undo_start(state, LINK_STATE_DOWN);
            if ((ret = ibd_start(state)) != 0) {
                DPRINT(10, "ibd_restart: cannot restart, "
                    "ret=%d", ret);
            }

            goto link_mod_return;
        } else {
            new_link_state = LINK_STATE_DOWN;
        }
        _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
    }

update_link_state:
    if (port_infop) {
        ibt_free_portinfo(port_infop, port_infosz);
    }

    /*
     * If we're reporting a link up, check InitTypeReply to see if
     * the SM has ensured that the port's presence in mcg, traps,
     * etc. is intact.
     */
    if (new_link_state == LINK_STATE_DOWN) {
        opcode = IBD_LINK_DOWN;
    } else {
        if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
            SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
            opcode = IBD_LINK_UP;
        } else {
            opcode = IBD_LINK_UP_ABSENT;
        }
    }

    /*
     * If the old state is the same as the new state, and the SM indicated
     * no change in the port parameters, nothing to do.
     */
    if ((state->id_link_state == new_link_state) && (opcode !=
        IBD_LINK_UP_ABSENT)) {
        mutex_exit(&state->id_link_mutex);
        goto link_mod_return;
    }

    /*
     * Ok, so there was a link state change; see if it's safe to ask
     * the async thread to do the work.
     */
    if (!ibd_async_safe(state)) {
        state->id_link_state = new_link_state;
        mutex_exit(&state->id_link_mutex);
        goto link_mod_return;
    }

    mutex_exit(&state->id_link_mutex);

    /*
     * Queue up a request for ibd_async_link() to handle this link
     * state change event.
     */
    req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
    req->rq_ptr = (void *)opcode;
    ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);

link_mod_return:
    ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
}
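/*
 * The opcode chosen above, restated as a table:
 *
 *     new link state    PreservePresenceReply    opcode
 *     --------------    ---------------------    ------
 *     LINK_STATE_DOWN   (any)                    IBD_LINK_DOWN
 *     LINK_STATE_UP     set                      IBD_LINK_UP
 *     LINK_STATE_UP     clear                    IBD_LINK_UP_ABSENT
 *
 * IBD_LINK_UP_ABSENT is the case where ibd_async_link() has to rebuild
 * the port's presence (memberships, AHs) on the subnet.
 */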
/*
 * For the port up/down events, IBTL guarantees there will not be concurrent
 * invocations of the handler. IBTL might coalesce link transition events,
 * and not invoke the handler for _each_ up/down transition, but it will
 * invoke the handler with the last known state.
 */
static void
ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{
    ibd_state_t *state = (ibd_state_t *)clnt_private;

    switch (code) {
    case IBT_ERROR_CATASTROPHIC_CHAN:
        ibd_print_warn(state, "catastrophic channel error");
        break;
    case IBT_ERROR_CQ:
        ibd_print_warn(state, "completion queue error");
        break;
    case IBT_PORT_CHANGE_EVENT:
        /*
         * Events will be delivered to all instances that have
         * done ibt_open_hca() but not yet done ibt_close_hca().
         * Only need to do work for our port; IBTF will deliver
         * events for other ports on the hca we have ibt_open_hca'ed
         * too. Note that id_port is initialized in ibd_attach()
         * before we do an ibt_open_hca() in ibd_attach().
         */
        ASSERT(state->id_hca_hdl == hca_hdl);
        if (state->id_port != event->ev_port)
            break;

        if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
            IBT_PORT_CHANGE_PKEY) {
            ibd_link_mod(state, code);
        }
        break;
    case IBT_ERROR_PORT_DOWN:
    case IBT_CLNT_REREG_EVENT:
    case IBT_EVENT_PORT_UP:
        /*
         * Events will be delivered to all instances that have
         * done ibt_open_hca() but not yet done ibt_close_hca().
         * Only need to do work for our port; IBTF will deliver
         * events for other ports on the hca we have ibt_open_hca'ed
         * too. Note that id_port is initialized in ibd_attach()
         * before we do an ibt_open_hca() in ibd_attach().
         */
        ASSERT(state->id_hca_hdl == hca_hdl);
        if (state->id_port != event->ev_port)
            break;

        ibd_link_mod(state, code);
        break;

    case IBT_HCA_ATTACH_EVENT:
    case IBT_HCA_DETACH_EVENT:
        /*
         * When a new card is plugged into the system, attach_event is
         * invoked. Additionally, a cfgadm needs to be run to make the
         * card known to the system, and an ifconfig needs to be run to
         * plumb up any ibd interfaces on the card. In the case of card
         * unplug, a cfgadm is run that will trigger any RCM scripts to
         * unplumb the ibd interfaces on the card; when the card is
         * actually unplugged, the detach_event is invoked;
         * additionally, if any ibd instances are still active on the
         * card (e.g., there were no associated RCM scripts), driver's
         * detach routine is invoked.
         */
        break;
    default:
        break;
    }
}
static int
ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
{
    mac_register_t *macp;
    int ret;

    if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
        DPRINT(10, "ibd_register_mac: mac_alloc() failed");
        return (DDI_FAILURE);
    }

    /*
     * Note that when we register with mac during attach, we don't
     * have the id_macaddr yet, so we'll simply be registering a
     * zero macaddr that we'll overwrite later during plumb (in
     * ibd_m_start()). Similar is the case with id_mtu - we'll
     * update the mac layer with the correct mtu during plumb.
     */
    macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
    macp->m_driver = state;
    macp->m_dip = dip;
    macp->m_src_addr = (uint8_t *)&state->id_macaddr;
    macp->m_callbacks = &ibd_m_callbacks;
    macp->m_min_sdu = 0;
    if (state->id_type == IBD_PORT_DRIVER) {
        macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
    } else if (state->id_enable_rc) {
        macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
    } else {
        macp->m_max_sdu = IBD_DEF_MAX_SDU;
    }
    macp->m_priv_props = ibd_priv_props;

    /*
     * Register ourselves with the GLDv3 interface
     */
    if ((ret = mac_register(macp, &state->id_mh)) != 0) {
        mac_free(macp);
        DPRINT(10,
            "ibd_register_mac: mac_register() failed, ret=%d", ret);
        return (DDI_FAILURE);
    }

    mac_free(macp);
    return (DDI_SUCCESS);
}
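/*
 * The m_max_sdu selection above, restated: a port driver instance
 * advertises IBD_DEF_RC_MAX_SDU, a partition object in RC mode
 * advertises the RC MTU minus the IPoIB header, and a partition
 * object in UD mode advertises IBD_DEF_MAX_SDU.
 */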
static int
ibd_record_capab(ibd_state_t *state)
{
    ibt_hca_attr_t hca_attrs;
    ibt_status_t ibt_status;

    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))

    /*
     * Query the HCA and fetch its attributes
     */
    ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
    ASSERT(ibt_status == IBT_SUCCESS);

    /*
     * 1. Set the Hardware Checksum capability. Currently we only consider
     *    full checksum offload.
     */
    if (state->id_enable_rc) {
        state->id_hwcksum_capab = 0;
    } else {
        if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
            == IBT_HCA_CKSUM_FULL) {
            state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
        }
    }

    /*
     * 2. Set LSO policy, capability and maximum length
     */
    if (state->id_enable_rc) {
        state->id_lso_capable = B_FALSE;
        state->id_lso_maxlen = 0;
    } else {
        if (hca_attrs.hca_max_lso_size > 0) {
            state->id_lso_capable = B_TRUE;
            if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
                state->id_lso_maxlen = IBD_LSO_MAXLEN;
            else
                state->id_lso_maxlen =
                    hca_attrs.hca_max_lso_size;
        } else {
            state->id_lso_capable = B_FALSE;
            state->id_lso_maxlen = 0;
        }
    }

    /*
     * 3. Set Reserved L_Key capability
     */
    if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
        state->id_hca_res_lkey_capab = 1;
        state->id_res_lkey = hca_attrs.hca_reserved_lkey;
        state->rc_enable_iov_map = B_TRUE;
    } else {
        /* If no reserved lkey, we will not use ibt_map_mem_iov */
        state->rc_enable_iov_map = B_FALSE;
    }

    /*
     * 4. Set maximum sqseg value after checking to see if extended sgl
     *    size information is provided by the hca
     */
    if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
        state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
        state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
    } else {
        state->id_max_sqseg = hca_attrs.hca_max_sgl;
        state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
    }
    if (state->id_max_sqseg > IBD_MAX_SQSEG) {
        state->id_max_sqseg = IBD_MAX_SQSEG;
    } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
        ibd_print_warn(state, "Set #sgl = %d instead of default %d",
            state->id_max_sqseg, IBD_MAX_SQSEG);
    }
    if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
        state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
    } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
        ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
            "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
    }

    /*
     * Translating the virtual address regions into physical regions
     * for using the Reserved LKey feature results in a wr sgl that
     * is a little longer. Since failing ibt_map_mem_iov() is costly,
     * we'll fix a high-water mark (65%) for when we should stop.
     */
    state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
    state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;

    /*
     * 5. Set number of recv and send wqes after checking hca maximum
     *    channel size. Store the max channel size in the state so that it
     *    can be referred to when the swqe/rwqe change is requested via
     *    dladm.
     */

    state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;

    if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
        state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;

    state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
        IBD_RWQE_MIN;

    if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
        state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;

    _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))

    return (DDI_SUCCESS);
}
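/*
 * A worked example for the 65% high-water mark above, using a
 * hypothetical IBD_MAX_SQSEG of 59 (the real value comes from ibd.h):
 * with id_max_sqseg = 59, id_max_sqseg_hiwm = (59 * 65) / 100 = 38,
 * so a send needing more than 38 sgl entries avoids the iov-mapping
 * path rather than risking an expensive ibt_map_mem_iov() failure
 * close to the hard limit.
 */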
static int
ibd_part_busy(ibd_state_t *state)
{
    if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
        DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
        return (DDI_FAILURE);
    }

    if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
        DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
        return (DDI_FAILURE);
    }

    /*
     * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
     * connecting to a remote IPoIB port. We can't remove this port.
     */
    if (state->id_ah_op == IBD_OP_ONGOING) {
        DPRINT(10, "ibd_part_busy: failed: connecting\n");
        return (DDI_FAILURE);
    }

    return (DDI_SUCCESS);
}


static void
ibd_part_unattach(ibd_state_t *state)
{
    uint32_t progress = state->id_mac_state;
    ibt_status_t ret;

    /* make sure rx resources are freed */
    ibd_free_rx_rsrcs(state);

    if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
        ASSERT(state->id_enable_rc);
        ibd_rc_fini_srq_list(state);
        state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
    }

    if (progress & IBD_DRV_MAC_REGISTERED) {
        (void) mac_unregister(state->id_mh);
        state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
    }

    if (progress & IBD_DRV_ASYNC_THR_CREATED) {
        /*
         * No new async requests will be posted since the device
         * link state has been marked as unknown; completion handlers
         * have been turned off, so Tx handler will not cause any
         * more IBD_ASYNC_REAP requests.
         *
         * Queue a request for the async thread to exit, which will
         * be serviced after any pending ones. This can take a while,
         * especially if the SM is unreachable, since IBMF will slowly
         * timeout each SM request issued by the async thread. Reap
         * the thread before continuing; we do not want it lingering
         * in modunloaded code.
         */
        ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
        thread_join(state->id_async_thrid);

        state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
    }

    if (progress & IBD_DRV_REQ_LIST_INITED) {
        list_destroy(&state->id_req_list);
        mutex_destroy(&state->id_acache_req_lock);
        cv_destroy(&state->id_acache_req_cv);
        state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
    }

    if (progress & IBD_DRV_PD_ALLOCD) {
        if ((ret = ibt_free_pd(state->id_hca_hdl,
            state->id_pd_hdl)) != IBT_SUCCESS) {
            ibd_print_warn(state, "failed to free "
                "protection domain, ret=%d", ret);
        }
        state->id_pd_hdl = NULL;
        state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
    }

    if (progress & IBD_DRV_HCA_OPENED) {
        if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
            IBT_SUCCESS) {
            ibd_print_warn(state, "failed to close "
                "HCA device, ret=%d", ret);
        }
        state->id_hca_hdl = NULL;
        state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
    }

    mutex_enter(&ibd_gstate.ig_mutex);
    if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
        if ((ret = ibt_detach(state->id_ibt_hdl)) !=
            IBT_SUCCESS) {
            ibd_print_warn(state,
                "ibt_detach() failed, ret=%d", ret);
        }
        state->id_ibt_hdl = NULL;
        state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
        ibd_gstate.ig_ibt_hdl_ref_cnt--;
    }
    if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
        (ibd_gstate.ig_ibt_hdl != NULL)) {
        if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
            IBT_SUCCESS) {
            ibd_print_warn(state, "ibt_detach(): global "
                "failed, ret=%d", ret);
        }
        ibd_gstate.ig_ibt_hdl = NULL;
    }
    mutex_exit(&ibd_gstate.ig_mutex);

    if (progress & IBD_DRV_TXINTR_ADDED) {
        ddi_remove_softintr(state->id_tx);
        state->id_tx = NULL;
        state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
    }

    if (progress & IBD_DRV_RXINTR_ADDED) {
        ddi_remove_softintr(state->id_rx);
        state->id_rx = NULL;
        state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
    }

#ifdef DEBUG
    if (progress & IBD_DRV_RC_PRIVATE_STATE) {
        kstat_delete(state->rc_ksp);
        state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
    }
#endif

    if (progress & IBD_DRV_STATE_INITIALIZED) {
        ibd_state_fini(state);
        state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
    }
}
int
ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
{
    ibt_status_t ret;
    int rv;
    kthread_t *kht;

    /*
     * Initialize mutexes and condition variables
     */
    if (ibd_state_init(state, dip) != DDI_SUCCESS) {
        DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
        return (DDI_FAILURE);
    }
    state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;

    /*
     * Allocate rx,tx softintr
     */
    if (ibd_rx_softintr == 1) {
        if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
            NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
            DPRINT(10, "ibd_part_attach: failed in "
                "ddi_add_softintr(id_rx), ret=%d", rv);
            return (DDI_FAILURE);
        }
        state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
    }
    if (ibd_tx_softintr == 1) {
        if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
            NULL, NULL, ibd_tx_recycle,
            (caddr_t)state)) != DDI_SUCCESS) {
            DPRINT(10, "ibd_part_attach: failed in "
                "ddi_add_softintr(id_tx), ret=%d", rv);
            return (DDI_FAILURE);
        }
        state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
    }

    /*
     * Attach to IBTL
     */
    mutex_enter(&ibd_gstate.ig_mutex);
    if (ibd_gstate.ig_ibt_hdl == NULL) {
        if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
            &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
            DPRINT(10, "ibd_part_attach: global: failed in "
                "ibt_attach(), ret=%d", ret);
            mutex_exit(&ibd_gstate.ig_mutex);
            return (DDI_FAILURE);
        }
    }
    if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
        &state->id_ibt_hdl)) != IBT_SUCCESS) {
        DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
            ret);
        mutex_exit(&ibd_gstate.ig_mutex);
        return (DDI_FAILURE);
    }
    ibd_gstate.ig_ibt_hdl_ref_cnt++;
    mutex_exit(&ibd_gstate.ig_mutex);
    state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;

    /*
     * Open the HCA
     */
    if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
        &state->id_hca_hdl)) != IBT_SUCCESS) {
        DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
            ret);
        return (DDI_FAILURE);
    }
    state->id_mac_state |= IBD_DRV_HCA_OPENED;

#ifdef DEBUG
    /* Initialize Driver Counters for Reliable Connected Mode */
    if (state->id_enable_rc) {
        if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
            DPRINT(10, "ibd_part_attach: failed in "
                "ibd_rc_init_stats");
            return (DDI_FAILURE);
        }
        state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
    }
#endif

    /*
     * Record capabilities
     */
    (void) ibd_record_capab(state);

    /*
     * Allocate a protection domain on the HCA
     */
    if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
        &state->id_pd_hdl)) != IBT_SUCCESS) {
        DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
            ret);
        return (DDI_FAILURE);
    }
    state->id_mac_state |= IBD_DRV_PD_ALLOCD;


    /*
     * We need to initialize the req_list that is required for the
     * operation of the async thread.
     */
    mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
    list_create(&state->id_req_list, sizeof (ibd_req_t),
        offsetof(ibd_req_t, rq_list));
    state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;

    /*
     * Create the async thread; thread_create never fails.
     */
    kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
        TS_RUN, minclsyspri);
    state->id_async_thrid = kht->t_did;
    state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;

    return (DDI_SUCCESS);
}
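/*
 * Note the attach/unattach symmetry: every IBD_DRV_* progress flag set
 * in ibd_part_attach() above has a matching teardown step keyed off the
 * same flag in ibd_part_unattach(), so a partially completed attach can
 * be unwound safely by simply calling the unattach routine.
 */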
/*
 * Attach device to the IO framework.
 */
static int
ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    int ret;

    switch (cmd) {
    case DDI_ATTACH:
        ret = ibd_port_attach(dip);
        break;
    default:
        ret = DDI_FAILURE;
        break;
    }
    return (ret);
}

/*
 * Detach device from the IO framework.
 */
static int
ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    ibd_state_t *state;
    int instance;

    /*
     * IBD doesn't support suspend/resume
     */
    if (cmd != DDI_DETACH)
        return (DDI_FAILURE);

    /*
     * Get the instance softstate
     */
    instance = ddi_get_instance(dip);
    state = ddi_get_soft_state(ibd_list, instance);

    /*
     * Release all resources we're still holding. Note that if we'd
     * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
     * so far, we should find all the flags we need in id_mac_state.
     */
    return (ibd_port_unattach(state, dip));
}
/*
 * Pre ibt_attach() driver initialization
 */
static int
ibd_state_init(ibd_state_t *state, dev_info_t *dip)
{
    char buf[64];

    mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
    state->id_link_state = LINK_STATE_UNKNOWN;

    mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
    state->id_trap_stop = B_TRUE;
    state->id_trap_inprog = 0;

    mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
    state->id_dip = dip;

    mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);

    mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
    state->id_tx_busy = 0;
    mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);

    state->id_rx_list.dl_bufs_outstanding = 0;
    state->id_rx_list.dl_cnt = 0;
    mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
    (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
        state->id_pkey, state->id_plinkid);
    state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
        0, NULL, NULL, NULL, NULL, NULL, 0);

    /* For Reliable Connected Mode */
    mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
        MUTEX_DRIVER, NULL);
    mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);

    /*
     * Make the default link mode RC. If this fails during connection
     * setup, the link mode is automatically transitioned to UD.
     * Also set the RC MTU.
     */
    state->id_enable_rc = IBD_DEF_LINK_MODE;
    state->rc_mtu = IBD_DEF_RC_MAX_MTU;
    state->id_mtu = IBD_DEF_MAX_MTU;

    /* Initialize all tunables to their defaults */
    state->id_lso_policy = IBD_DEF_LSO_POLICY;
    state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
    state->id_num_ah = IBD_DEF_NUM_AH;
    state->id_hash_size = IBD_DEF_HASH_SIZE;
    state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
    state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
    state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
    state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
    state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
    state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
    state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
    state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
    state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
    state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
    state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
    state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
    state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
    state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
    state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
    state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
    state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
    state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
    state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
    state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;

    return (DDI_SUCCESS);
}

/*
 * Post ibt_detach() driver deconstruction
 */
static void
ibd_state_fini(ibd_state_t *state)
{
    kmem_cache_destroy(state->id_req_kmc);

    mutex_destroy(&state->id_rx_list.dl_mutex);
    mutex_destroy(&state->id_rx_free_list.dl_mutex);

    mutex_destroy(&state->id_txpost_lock);
    mutex_destroy(&state->id_tx_list.dl_mutex);
    mutex_destroy(&state->id_tx_rel_list.dl_mutex);
    mutex_destroy(&state->id_lso_lock);

    mutex_destroy(&state->id_sched_lock);
    mutex_destroy(&state->id_scq_poll_lock);
    mutex_destroy(&state->id_rcq_poll_lock);

    cv_destroy(&state->id_trap_cv);
    mutex_destroy(&state->id_trap_lock);
    mutex_destroy(&state->id_link_mutex);

    /* For Reliable Connected Mode */
    mutex_destroy(&state->rc_timeout_lock);
    mutex_destroy(&state->rc_srq_free_list.dl_mutex);
    mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
    mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
    mutex_destroy(&state->rc_tx_large_bufs_lock);
    mutex_destroy(&state->rc_rx_lock);
}
/*
 * Fetch link speed from SA for snmp ifspeed reporting.
 */
static uint64_t
ibd_get_portspeed(ibd_state_t *state)
{
    int ret;
    ibt_path_info_t path;
    ibt_path_attr_t path_attr;
    uint8_t num_paths;
    uint64_t ifspeed;

    /*
     * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
     * translates to 2 Gbps data rate. Thus, 1X single data rate is
     * 2000000000. Start with that as default.
     */
    ifspeed = 2000000000;

    bzero(&path_attr, sizeof (path_attr));

    /*
     * Get the port speed from Loopback path information.
     */
    path_attr.pa_dgids = &state->id_sgid;
    path_attr.pa_num_dgids = 1;
    path_attr.pa_sgid = state->id_sgid;

    if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
        &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
        goto earlydone;

    if (num_paths < 1)
        goto earlydone;

    /*
     * In case SA does not return an expected value, report the default
     * speed as 1X.
     */
    ret = 1;
    switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
    case IBT_SRATE_2:   /* 1X SDR i.e 2.5 Gbps */
        ret = 1;
        break;
    case IBT_SRATE_10:  /* 4X SDR or 1X QDR i.e 10 Gbps */
        ret = 4;
        break;
    case IBT_SRATE_30:  /* 12X SDR i.e 30 Gbps */
        ret = 12;
        break;
    case IBT_SRATE_5:   /* 1X DDR i.e 5 Gbps */
        ret = 2;
        break;
    case IBT_SRATE_20:  /* 4X DDR or 8X SDR i.e 20 Gbps */
        ret = 8;
        break;
    case IBT_SRATE_40:  /* 8X DDR or 4X QDR i.e 40 Gbps */
        ret = 16;
        break;
    case IBT_SRATE_60:  /* 12X DDR i.e 60 Gbps */
        ret = 24;
        break;
    case IBT_SRATE_80:  /* 8X QDR i.e 80 Gbps */
        ret = 32;
        break;
    case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
        ret = 48;
        break;
    }

    ifspeed *= ret;

earlydone:
    return (ifspeed);
}
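/*
 * Worked example of the arithmetic above: a 4X DDR link reports
 * IBT_SRATE_20, so ret = 8 and ifspeed = 2000000000 * 8 =
 * 16000000000, i.e. 16 Gbps of data rate out of the 20 Gbps
 * signalling rate once the 8b/10b encoding overhead is removed.
 */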
/*
 * Search input mcg list (id_mc_full or id_mc_non) for an entry
 * representing the input mcg mgid.
 */
static ibd_mce_t *
ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
{
    ibd_mce_t *ptr = list_head(mlist);

    /*
     * Do plain linear search.
     */
    while (ptr != NULL) {
        if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
            sizeof (ib_gid_t)) == 0)
            return (ptr);
        ptr = list_next(mlist, ptr);
    }
    return (NULL);
}

/*
 * Execute IBA JOIN.
 */
static ibt_status_t
ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
{
    ibt_mcg_attr_t mcg_attr;

    bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
    mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
    mcg_attr.mc_mgid = mgid;
    mcg_attr.mc_join_state = mce->mc_jstate;
    mcg_attr.mc_scope = state->id_scope;
    mcg_attr.mc_pkey = state->id_pkey;
    mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
    mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
    mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
    return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
        NULL, NULL));
}

/*
 * This code JOINs the port in the proper way (depending on the join
 * state) so that IBA fabric will forward mcg packets to/from the port.
 * It also attaches the QPN to the mcg so it can receive those mcg
 * packets. This code makes sure not to attach the mcg to the QP if
 * that has been previously done due to the mcg being joined with a
 * different join state, even though this is not required by SWG_0216,
 * refid 3610.
 */
static ibd_mce_t *
ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
    ibt_status_t ibt_status;
    ibd_mce_t *mce, *tmce, *omce = NULL;
    boolean_t do_attach = B_TRUE;

    DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
        jstate, mgid.gid_prefix, mgid.gid_guid);

    /*
     * For enable_multicast Full member joins, we need to do some
     * extra work. If there is already an mce on the list that
     * indicates full membership, that means the membership has
     * not yet been dropped (since the disable_multicast was issued)
     * because there are pending Tx's to the mcg; in that case, just
     * mark the mce not to be reaped when the Tx completion queues
     * an async reap operation.
     *
     * If there is already an mce on the list indicating sendonly
     * membership, try to promote to full membership. Be careful
     * not to deallocate the old mce, since there might be an AH
     * pointing to it; instead, update the old mce with new data
     * that tracks the full membership.
     */
    if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
        IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
        if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
            ASSERT(omce->mc_fullreap);
            omce->mc_fullreap = B_FALSE;
            return (omce);
        } else {
            ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
        }
    }

    /*
     * Allocate the ibd_mce_t to track this JOIN.
     */
    mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
    mce->mc_fullreap = B_FALSE;
    mce->mc_jstate = jstate;

    if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
        DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
            ibt_status);
        kmem_free(mce, sizeof (ibd_mce_t));
        return (NULL);
    }

    /*
     * Is an IBA attach required? Not if the interface is already joined
     * to the mcg in a different appropriate join state.
     */
    if (jstate == IB_MC_JSTATE_NON) {
        tmce = IBD_MCACHE_FIND_FULL(state, mgid);
        if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
            do_attach = B_FALSE;
    } else if (jstate == IB_MC_JSTATE_FULL) {
        if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
            do_attach = B_FALSE;
    } else {    /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
        do_attach = B_FALSE;
    }

    if (do_attach) {
        /*
         * Do the IBA attach.
         */
        DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
        if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
            &mce->mc_info)) != IBT_SUCCESS) {
            DPRINT(10, "ibd_join_group : failed qp attachment "
                "%d\n", ibt_status);
            /*
             * NOTE that we should probably preserve the join info
             * in the list and later try to leave again at detach
             * time.
             */
            (void) ibt_leave_mcg(state->id_sgid, mgid,
                state->id_sgid, jstate);
            kmem_free(mce, sizeof (ibd_mce_t));
            return (NULL);
        }
    }

    /*
     * Insert the ibd_mce_t in the proper list.
     */
    if (jstate == IB_MC_JSTATE_NON) {
        IBD_MCACHE_INSERT_NON(state, mce);
    } else {
        /*
         * Set up the mc_req fields used for reaping the
         * mcg in case of delayed tx completion (see
         * ibd_tx_cleanup()). Also done for sendonly join in
         * case we are promoted to fullmembership later and
         * keep using the same mce.
         */
        mce->mc_req.rq_gid = mgid;
        mce->mc_req.rq_ptr = mce;
        /*
         * Check whether this is the case of trying to join
         * full member, and we were already joined send only.
         * We try to drop our SendOnly membership, but it is
         * possible that the mcg does not exist anymore (and
         * the subnet trap never reached us), so the leave
         * operation might fail.
         */
        if (omce != NULL) {
            (void) ibt_leave_mcg(state->id_sgid, mgid,
                state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
            omce->mc_jstate = IB_MC_JSTATE_FULL;
            bcopy(&mce->mc_info, &omce->mc_info,
                sizeof (ibt_mcg_info_t));
            kmem_free(mce, sizeof (ibd_mce_t));
            return (omce);
        }
        mutex_enter(&state->id_mc_mutex);
        IBD_MCACHE_INSERT_FULL(state, mce);
        mutex_exit(&state->id_mc_mutex);
    }

    return (mce);
}
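/*
 * The do_attach decision above, restated as a table (the QP only ever
 * needs one attach per mcg, whatever mix of join states is held):
 *
 *     new jstate        attach unless...
 *     ----------        ----------------
 *     NON               already attached via a FULL join
 *     FULL              already attached via a NON join
 *     SEND_ONLY_NON     never (send-only needs no Rx attach)
 */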
/*
 * Called during port up event handling to attempt to reacquire full
 * membership to an mcg. Stripped down version of ibd_join_group().
 * Note that it is possible that the mcg might have gone away, and
 * gets recreated at this point.
 */
static void
ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
{
    ib_gid_t mgid;

    /*
     * If the mc_fullreap flag is set, or this join fails, a subsequent
     * reap/leave is going to try to leave the group. We could prevent
     * that by adding a boolean flag into ibd_mce_t, if required.
     */
    if (mce->mc_fullreap)
        return;

    mgid = mce->mc_info.mc_adds_vect.av_dgid;

    DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
        mgid.gid_guid);

    /* While reacquiring, leave and then join the MCG */
    (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
        mce->mc_jstate);
    if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
        ibd_print_warn(state, "Failure on port up to rejoin "
            "multicast gid %016llx:%016llx",
            (u_longlong_t)mgid.gid_prefix,
            (u_longlong_t)mgid.gid_guid);
}
/*
 * This code handles delayed Tx completion cleanups for mcg's to which
 * disable_multicast has been issued, regular mcg related cleanups during
 * disable_multicast, disable_promiscuous and mcg traps, as well as
 * cleanups during driver detach time. Depending on the join state,
 * it deletes the mce from the appropriate list and issues the IBA
 * leave/detach; except in the disable_multicast case when the mce
 * is left on the active list for a subsequent Tx completion cleanup.
 */
static void
ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
    uint8_t jstate)
{
    ibd_mce_t *tmce;
    boolean_t do_detach = B_TRUE;

    /*
     * Before detaching, we must check whether the other list
     * contains the mcg; if we detach blindly, the consumer
     * who set up the other list will also stop receiving
     * traffic.
     */
    if (jstate == IB_MC_JSTATE_FULL) {
        /*
         * The following check is only relevant while coming
         * from the Tx completion path in the reap case.
         */
        if (!mce->mc_fullreap)
            return;
        mutex_enter(&state->id_mc_mutex);
        IBD_MCACHE_PULLOUT_FULL(state, mce);
        mutex_exit(&state->id_mc_mutex);
        if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
            do_detach = B_FALSE;
    } else if (jstate == IB_MC_JSTATE_NON) {
        IBD_MCACHE_PULLOUT_NON(state, mce);
        tmce = IBD_MCACHE_FIND_FULL(state, mgid);
        if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
            do_detach = B_FALSE;
    } else {    /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
        mutex_enter(&state->id_mc_mutex);
        IBD_MCACHE_PULLOUT_FULL(state, mce);
        mutex_exit(&state->id_mc_mutex);
        do_detach = B_FALSE;
    }

    /*
     * If we are reacting to a mcg trap and leaving our sendonly or
     * non membership, the mcg is possibly already gone, so attempting
     * to leave might fail. On the other hand, we must try to leave
     * anyway, since this might be a trap from long ago, and we could
     * have potentially sendonly joined to a recent incarnation of
     * the mcg and are about to lose track of this information.
     */
    if (do_detach) {
        DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
            "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
        (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
    }

    (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
    kmem_free(mce, sizeof (ibd_mce_t));
}
/*
 * Async code executed due to multicast and promiscuous disable requests
 * and mcg trap handling; also executed during driver detach. Mostly, a
 * leave and detach is done; except for the fullmember case when Tx
 * requests are pending, whence arrangements are made for subsequent
 * cleanup on Tx completion.
 */
static void
ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
    ipoib_mac_t mcmac;
    boolean_t recycled;
    ibd_mce_t *mce;

    DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
        jstate, mgid.gid_prefix, mgid.gid_guid);

    if (jstate == IB_MC_JSTATE_NON) {
        recycled = B_TRUE;
        mce = IBD_MCACHE_FIND_NON(state, mgid);
        /*
         * In case we are handling a mcg trap, we might not find
         * the mcg in the non list.
         */
        if (mce == NULL) {
            return;
        }
    } else {
        mce = IBD_MCACHE_FIND_FULL(state, mgid);

        /*
         * In case we are handling a mcg trap, make sure the trap
         * is not arriving late; if we have an mce that indicates
         * that we are already a fullmember, that would be a clear
         * indication that the trap arrived late (i.e., is for a
         * previous incarnation of the mcg).
         */
        if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
            if ((mce == NULL) || (mce->mc_jstate ==
                IB_MC_JSTATE_FULL)) {
                return;
            }
        } else {
            ASSERT(jstate == IB_MC_JSTATE_FULL);

            /*
             * If join group failed, mce will be NULL here.
             * This is because in a GLDv3 driver, the set
             * multicast entry point always returns success.
             */
            if (mce == NULL) {
                return;
            }

            mce->mc_fullreap = B_TRUE;
        }

        /*
         * If no pending Tx's remain that reference the AH
         * for the mcg, recycle it from active to free list.
         * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
         * so the last completing Tx will cause an async reap
         * operation to be invoked, at which time we will drop our
         * membership to the mcg so that the pending Tx's complete
         * successfully. Refer to comments on "AH and MCE active
         * list manipulation" at top of this file. The lock protects
         * against Tx fast path and Tx cleanup code.
         */
        mutex_enter(&state->id_ac_mutex);
        ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
        recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
            IB_MC_JSTATE_SEND_ONLY_NON));
        mutex_exit(&state->id_ac_mutex);
    }

    if (recycled) {
        DPRINT(2, "ibd_leave_group : leave_group reaping : "
            "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
        ibd_async_reap_group(state, mce, mgid, jstate);
    }
}
/*
 * Find the broadcast address as defined by IPoIB; implicitly
 * determines the IBA scope, mtu, tclass etc of the link the
 * interface is going to be a member of.
 */
static ibt_status_t
ibd_find_bgroup(ibd_state_t *state)
{
    ibt_mcg_attr_t mcg_attr;
    uint_t numg;
    uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
        IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
        IB_MC_SCOPE_GLOBAL };
    int i, mcgmtu;
    boolean_t found = B_FALSE;
    int ret;
    ibt_mcg_info_t mcg_info;

    state->id_bgroup_created = B_FALSE;
    state->id_bgroup_present = B_FALSE;

query_bcast_grp:
    bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
    mcg_attr.mc_pkey = state->id_pkey;
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
    state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
    _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))

    for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
        state->id_scope = mcg_attr.mc_scope = scopes[i];

        /*
         * Look for the IPoIB broadcast group.
         */
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
        state->id_mgid.gid_prefix =
            (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
            ((uint64_t)state->id_scope << 48) |
            ((uint32_t)(state->id_pkey << 16)));
        mcg_attr.mc_mgid = state->id_mgid;
        _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
        if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
            &state->id_mcinfo, &numg) == IBT_SUCCESS) {
            found = B_TRUE;
            break;
        }
    }

    if (!found) {
        if (state->id_create_broadcast_group) {
            /*
             * If we created the broadcast group, but failed to
             * find it, we can't do anything except leave the
             * one we created and return failure.
             */
            if (state->id_bgroup_created) {
                ibd_print_warn(state, "IPoIB broadcast group "
                    "absent. Unable to query after create.");
                goto find_bgroup_fail;
            }

            /*
             * Create the ipoib broadcast group if it didn't exist
             */
            bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
            mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
            mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
            mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
            mcg_attr.mc_pkey = state->id_pkey;
            mcg_attr.mc_flow = 0;
            mcg_attr.mc_sl = 0;
            mcg_attr.mc_tclass = 0;
            _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
            state->id_mgid.gid_prefix =
                (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
                ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
                ((uint32_t)(state->id_pkey << 16)));
            mcg_attr.mc_mgid = state->id_mgid;
            _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))

            if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
                &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
                ibd_print_warn(state, "IPoIB broadcast group "
                    "absent, create failed: ret = %d\n", ret);
                state->id_bgroup_created = B_FALSE;
                return (IBT_FAILURE);
            }
            state->id_bgroup_created = B_TRUE;
            goto query_bcast_grp;
        } else {
            ibd_print_warn(state, "IPoIB broadcast group absent");
            return (IBT_FAILURE);
        }
    }

    /*
     * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
     */
    mcgmtu = (128 << state->id_mcinfo->mc_mtu);
    if (state->id_mtu < mcgmtu) {
        ibd_print_warn(state, "IPoIB broadcast group MTU %d "
            "greater than port's maximum MTU %d", mcgmtu,
            state->id_mtu);
        ibt_free_mcg_info(state->id_mcinfo, 1);
        goto find_bgroup_fail;
    }
    state->id_mtu = mcgmtu;
    state->id_bgroup_present = B_TRUE;

    return (IBT_SUCCESS);

find_bgroup_fail:
    if (state->id_bgroup_created) {
        (void) ibt_leave_mcg(state->id_sgid,
            mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
            IB_MC_JSTATE_FULL);
    }

    return (IBT_FAILURE);
}
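/*
 * Worked example of the gid_prefix construction above, assuming the
 * conventional IB_MCGID_IPV4_PREFIX value of 0xff10401b: with
 * subnet-local scope (2) and the default full-membership pkey 0xffff,
 * the prefix becomes 0xff10401b00000000 | (2 << 48) | (0xffff << 16) =
 * 0xff12401bffff0000, and with IB_MGID_IPV4_LOWGRP_MASK in the guid
 * half this yields the well-known IPoIB broadcast GID
 * ff12:401b:ffff:0000:0000:0000:ffff:ffff.
 */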
*membase; 3566 uint8_t *memp; 3567 uint_t memsz; 3568 int i; 3569 3570 /* 3571 * Allocate the lso bucket 3572 */ 3573 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3574 3575 /* 3576 * Allocate the entire lso memory and register it 3577 */ 3578 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; 3579 membase = kmem_zalloc(memsz, KM_SLEEP); 3580 3581 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3582 mem_attr.mr_len = memsz; 3583 mem_attr.mr_as = NULL; 3584 mem_attr.mr_flags = IBT_MR_SLEEP; 3585 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3586 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3587 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3588 kmem_free(membase, memsz); 3589 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3590 return (DDI_FAILURE); 3591 } 3592 3593 mutex_enter(&state->id_lso_lock); 3594 3595 /* 3596 * Now allocate the buflist. Note that the elements in the buflist and 3597 * the buffers in the lso memory have a permanent 1-1 relation, so we 3598 * can always derive the address of a buflist entry from the address of 3599 * an lso buffer. 3600 */ 3601 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), 3602 KM_SLEEP); 3603 3604 /* 3605 * Set up the lso buf chain 3606 */ 3607 memp = membase; 3608 lbufp = buflist; 3609 for (i = 0; i < state->id_num_lso_bufs; i++) { 3610 lbufp->lb_isfree = 1; 3611 lbufp->lb_buf = memp; 3612 lbufp->lb_next = lbufp + 1; 3613 3614 tail = lbufp; 3615 3616 memp += IBD_LSO_BUFSZ; 3617 lbufp++; 3618 } 3619 tail->lb_next = NULL; 3620 3621 /* 3622 * Set up the LSO buffer information in ibd state 3623 */ 3624 bktp->bkt_bufl = buflist; 3625 bktp->bkt_free_head = buflist; 3626 bktp->bkt_mem = membase; 3627 bktp->bkt_nelem = state->id_num_lso_bufs; 3628 bktp->bkt_nfree = bktp->bkt_nelem; 3629 3630 state->id_lso = bktp; 3631 mutex_exit(&state->id_lso_lock); 3632 3633 return (DDI_SUCCESS); 3634 } 3635 3636 /* 3637 * Statically allocate Tx buffer list(s). 
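 *
 * The regular Tx copy buffers come from a single contiguous,
 * once-registered slab (see ibd_alloc_tx_copybufs() above), so the
 * buffer backing the i'th swqe is a fixed offset into that slab.
 * The setup loop below is equivalent to this sketch (illustrative
 * only):
 *
 *	bufaddr = state->id_tx_bufs + (size_t)i * state->id_tx_buf_sz;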
3638 */ 3639 static int 3640 ibd_init_txlist(ibd_state_t *state) 3641 { 3642 ibd_swqe_t *swqe; 3643 ibt_lkey_t lkey; 3644 int i; 3645 uint_t len; 3646 uint8_t *bufaddr; 3647 3648 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3649 return (DDI_FAILURE); 3650 3651 if (state->id_lso_policy && state->id_lso_capable) { 3652 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3653 state->id_lso_capable = B_FALSE; 3654 } 3655 3656 mutex_enter(&state->id_tx_list.dl_mutex); 3657 state->id_tx_list.dl_head = NULL; 3658 state->id_tx_list.dl_pending_sends = B_FALSE; 3659 state->id_tx_list.dl_cnt = 0; 3660 mutex_exit(&state->id_tx_list.dl_mutex); 3661 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3662 state->id_tx_rel_list.dl_head = NULL; 3663 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3664 state->id_tx_rel_list.dl_cnt = 0; 3665 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3666 3667 /* 3668 * Allocate and setup the swqe list 3669 */ 3670 lkey = state->id_tx_mr_desc.md_lkey; 3671 bufaddr = state->id_tx_bufs; 3672 len = state->id_tx_buf_sz; 3673 swqe = state->id_tx_wqes; 3674 mutex_enter(&state->id_tx_list.dl_mutex); 3675 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { 3676 swqe->swqe_next = NULL; 3677 swqe->swqe_im_mblk = NULL; 3678 3679 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3680 bufaddr; 3681 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3682 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3683 3684 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3685 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3686 swqe->w_swr.wr_trans = IBT_UD_SRV; 3687 3688 /* These are set in send */ 3689 swqe->w_swr.wr_nds = 0; 3690 swqe->w_swr.wr_sgl = NULL; 3691 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3692 3693 /* add to list */ 3694 state->id_tx_list.dl_cnt++; 3695 swqe->swqe_next = state->id_tx_list.dl_head; 3696 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3697 } 3698 mutex_exit(&state->id_tx_list.dl_mutex); 3699 3700 return (DDI_SUCCESS); 3701 } 3702 3703 static int 3704 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3705 uint32_t *nds_p) 3706 { 3707 ibd_lsobkt_t *bktp; 3708 ibd_lsobuf_t *lbufp; 3709 ibd_lsobuf_t *nextp; 3710 ibt_lkey_t lso_lkey; 3711 uint_t frag_sz; 3712 uint_t num_needed; 3713 int i; 3714 3715 ASSERT(sgl_p != NULL); 3716 ASSERT(nds_p != NULL); 3717 ASSERT(req_sz != 0); 3718 3719 /* 3720 * Determine how many bufs we'd need for the size requested 3721 */ 3722 num_needed = req_sz / IBD_LSO_BUFSZ; 3723 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3724 num_needed++; 3725 3726 mutex_enter(&state->id_lso_lock); 3727 3728 /* 3729 * If we don't have enough lso bufs, return failure 3730 */ 3731 ASSERT(state->id_lso != NULL); 3732 bktp = state->id_lso; 3733 if (bktp->bkt_nfree < num_needed) { 3734 mutex_exit(&state->id_lso_lock); 3735 return (-1); 3736 } 3737 3738 /* 3739 * Pick the first 'num_needed' bufs from the free list 3740 */ 3741 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3742 lbufp = bktp->bkt_free_head; 3743 for (i = 0; i < num_needed; i++) { 3744 ASSERT(lbufp->lb_isfree != 0); 3745 ASSERT(lbufp->lb_buf != NULL); 3746 3747 nextp = lbufp->lb_next; 3748 3749 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3750 sgl_p[i].ds_key = lso_lkey; 3751 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3752 3753 lbufp->lb_isfree = 0; 3754 lbufp->lb_next = NULL; 3755 3756 lbufp = nextp; 3757 } 3758 bktp->bkt_free_head = lbufp; 3759 3760 /* 3761 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3762 * to adjust the last sgl entry's length. 
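 * For example, assuming a hypothetical IBD_LSO_BUFSZ of 4096, a req_sz
 * of 10000 yields num_needed == 3, with the last entry's ds_len
 * trimmed to 10000 - 2 * 4096 == 1808.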
Since we know we need at least 3763 * one, the i-1 use below is ok. 3764 */ 3765 if (frag_sz) { 3766 sgl_p[i-1].ds_len = frag_sz; 3767 } 3768 3769 /* 3770 * Update nfree count and return 3771 */ 3772 bktp->bkt_nfree -= num_needed; 3773 3774 mutex_exit(&state->id_lso_lock); 3775 3776 *nds_p = num_needed; 3777 3778 return (0); 3779 } 3780 3781 static void 3782 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3783 { 3784 ibd_lsobkt_t *bktp; 3785 ibd_lsobuf_t *lbufp; 3786 uint8_t *lso_mem_end; 3787 uint_t ndx; 3788 int i; 3789 3790 mutex_enter(&state->id_lso_lock); 3791 3792 bktp = state->id_lso; 3793 ASSERT(bktp != NULL); 3794 3795 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3796 for (i = 0; i < nds; i++) { 3797 uint8_t *va; 3798 3799 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3800 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3801 3802 /* 3803 * Figure out the buflist element this sgl buffer corresponds 3804 * to and put it back at the head 3805 */ 3806 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3807 lbufp = bktp->bkt_bufl + ndx; 3808 3809 ASSERT(lbufp->lb_isfree == 0); 3810 ASSERT(lbufp->lb_buf == va); 3811 3812 lbufp->lb_isfree = 1; 3813 lbufp->lb_next = bktp->bkt_free_head; 3814 bktp->bkt_free_head = lbufp; 3815 } 3816 bktp->bkt_nfree += nds; 3817 3818 mutex_exit(&state->id_lso_lock); 3819 } 3820 3821 static void 3822 ibd_free_tx_copybufs(ibd_state_t *state) 3823 { 3824 /* 3825 * Unregister txbuf mr 3826 */ 3827 if (ibt_deregister_mr(state->id_hca_hdl, 3828 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3829 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3830 } 3831 state->id_tx_mr_hdl = NULL; 3832 3833 /* 3834 * Free txbuf memory 3835 */ 3836 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * 3837 sizeof (ibd_swqe_t)); 3838 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * 3839 state->id_tx_buf_sz); 3840 state->id_tx_wqes = NULL; 3841 state->id_tx_bufs = NULL; 3842 } 3843 3844 static void 3845 ibd_free_tx_lsobufs(ibd_state_t *state) 3846 { 3847 ibd_lsobkt_t *bktp; 3848 3849 mutex_enter(&state->id_lso_lock); 3850 3851 if ((bktp = state->id_lso) == NULL) { 3852 mutex_exit(&state->id_lso_lock); 3853 return; 3854 } 3855 3856 /* 3857 * First, free the buflist 3858 */ 3859 ASSERT(bktp->bkt_bufl != NULL); 3860 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3861 3862 /* 3863 * Unregister the LSO memory and free it 3864 */ 3865 ASSERT(bktp->bkt_mr_hdl != NULL); 3866 if (ibt_deregister_mr(state->id_hca_hdl, 3867 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3868 DPRINT(10, 3869 "ibd_free_tx_lsobufs: ibt_deregister_mr failed"); 3870 } 3871 ASSERT(bktp->bkt_mem); 3872 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3873 3874 /* 3875 * Finally free the bucket 3876 */ 3877 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3878 state->id_lso = NULL; 3879 3880 mutex_exit(&state->id_lso_lock); 3881 } 3882 3883 /* 3884 * Free the statically allocated Tx buffer list.
3885 */ 3886 static void 3887 ibd_fini_txlist(ibd_state_t *state) 3888 { 3889 /* 3890 * Free the allocated swqes 3891 */ 3892 mutex_enter(&state->id_tx_list.dl_mutex); 3893 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3894 state->id_tx_list.dl_head = NULL; 3895 state->id_tx_list.dl_pending_sends = B_FALSE; 3896 state->id_tx_list.dl_cnt = 0; 3897 state->id_tx_rel_list.dl_head = NULL; 3898 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3899 state->id_tx_rel_list.dl_cnt = 0; 3900 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3901 mutex_exit(&state->id_tx_list.dl_mutex); 3902 3903 ibd_free_tx_lsobufs(state); 3904 ibd_free_tx_copybufs(state); 3905 } 3906 3907 /* 3908 * Post a list of rwqes, NULL-terminated. 3909 */ 3910 static void 3911 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3912 { 3913 uint_t i; 3914 uint_t num_posted; 3915 ibt_status_t ibt_status; 3916 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3917 3918 while (rwqe) { 3919 /* Post up to IBD_RX_POST_CNT receive work requests */ 3920 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3921 wrs[i] = rwqe->w_rwr; 3922 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3923 if (rwqe == NULL) { 3924 i++; 3925 break; 3926 } 3927 } 3928 3929 /* 3930 * If posting fails for some reason, we'll never receive 3931 * completion intimation, so we'll need to clean up. But 3932 * we need to make sure we don't clean up nodes whose 3933 * wrs have been successfully posted. We assume that the 3934 * hca driver returns on the first failure to post and 3935 * therefore the first 'num_posted' entries don't need 3936 * cleanup here. 3937 */ 3938 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3939 3940 num_posted = 0; 3941 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3942 &num_posted); 3943 if (ibt_status != IBT_SUCCESS) { 3944 /* This cannot happen unless the device has an error. */ 3945 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3946 "posting multiple wrs failed: " 3947 "requested=%d, done=%d, ret=%d", 3948 i, num_posted, ibt_status); 3949 atomic_add_32(&state->id_rx_list.dl_cnt, 3950 num_posted - i); 3951 } 3952 } 3953 } 3954 3955 /* 3956 * Grab a list of rwqes from the array of lists, and post the list. 3957 */ 3958 static void 3959 ibd_post_recv_intr(ibd_state_t *state) 3960 { 3961 ibd_rx_queue_t *rxp; 3962 ibd_rwqe_t *list; 3963 3964 /* rotate through the rx_queue array, expecting an adequate number */ 3965 state->id_rx_post_queue_index = 3966 (state->id_rx_post_queue_index + 1) & 3967 (state->id_rx_nqueues - 1); 3968 3969 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3970 mutex_enter(&rxp->rx_post_lock); 3971 list = WQE_TO_RWQE(rxp->rx_head); 3972 rxp->rx_head = NULL; 3973 rxp->rx_cnt = 0; 3974 mutex_exit(&rxp->rx_post_lock); 3975 ibd_post_recv_list(state, list); 3976 } 3977 3978 /* macro explained below */ 3979 #define RX_QUEUE_HASH(rwqe) \ 3980 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3981 3982 /* 3983 * Add an rwqe to one of the Rx lists. If the list has grown large 3984 * enough (close to IBD_RX_POST_CNT), post the list to the hardware. 3985 * 3986 * Note: one of 2^N lists is chosen via a hash. This is done 3987 * because using one list is contentious. If the first list is busy 3988 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3989 * 3990 * The number 8 in RX_QUEUE_HASH is an arbitrary choice that provides 3991 * an even distribution of rwqes across the 2^N queues.
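 *
 * Worked example (illustrative; assumes 2^N == 8 queues, and a
 * hypothetical sizeof (ibd_rwqe_t) of 192): an rwqe at address
 * 0x12345600 hashes to ((0x12345600 >> 8) & 7) == 6. If that queue
 * is busy, the retry below rehashes "rwqe + 16", i.e. the address
 * 16 * 192 == 3072 bytes away, which selects a different queue:
 * ((0x12346200 >> 8) & 7) == 2.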
3992 */ 3993 static void 3994 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3995 { 3996 ibd_rx_queue_t *rxp; 3997 3998 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3999 4000 if (!mutex_tryenter(&rxp->rx_post_lock)) { 4001 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 4002 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 4003 mutex_enter(&rxp->rx_post_lock); 4004 } 4005 rwqe->rwqe_next = rxp->rx_head; 4006 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 4007 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 4008 4009 /* only call ibt_post_recv() every Nth time through here */ 4010 if ((active & (state->id_rx_nqueues - 1)) == 0) { 4011 rxp->rx_head = NULL; 4012 rxp->rx_cnt = 0; 4013 mutex_exit(&rxp->rx_post_lock); 4014 ibd_post_recv_list(state, rwqe); 4015 return; 4016 } 4017 } 4018 rxp->rx_head = RWQE_TO_WQE(rwqe); 4019 mutex_exit(&rxp->rx_post_lock); 4020 } 4021 4022 static int 4023 ibd_alloc_rx_copybufs(ibd_state_t *state) 4024 { 4025 ibt_mr_attr_t mem_attr; 4026 int i; 4027 4028 /* 4029 * Allocate one big chunk for all regular rx copy bufs 4030 */ 4031 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 4032 4033 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * 4034 state->id_rx_buf_sz, KM_SLEEP); 4035 4036 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * 4037 sizeof (ibd_rwqe_t), KM_SLEEP); 4038 4039 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 4040 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 4041 sizeof (ibd_rx_queue_t), KM_SLEEP); 4042 for (i = 0; i < state->id_rx_nqueues; i++) { 4043 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4044 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 4045 } 4046 4047 /* 4048 * Do one memory registration on the entire rxbuf area 4049 */ 4050 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 4051 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; 4052 mem_attr.mr_as = NULL; 4053 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4054 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 4055 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 4056 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 4057 kmem_free(state->id_rx_wqes, 4058 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); 4059 kmem_free(state->id_rx_bufs, 4060 state->id_ud_num_rwqe * state->id_rx_buf_sz); 4061 state->id_rx_bufs = NULL; 4062 state->id_rx_wqes = NULL; 4063 return (DDI_FAILURE); 4064 } 4065 4066 return (DDI_SUCCESS); 4067 } 4068 4069 /* 4070 * Allocate the statically allocated Rx buffer list. 4071 */ 4072 static int 4073 ibd_init_rxlist(ibd_state_t *state) 4074 { 4075 ibd_rwqe_t *rwqe, *next; 4076 ibd_wqe_t *list; 4077 ibt_lkey_t lkey; 4078 int i; 4079 uint_t len; 4080 uint8_t *bufaddr; 4081 4082 mutex_enter(&state->id_rx_free_list.dl_mutex); 4083 if (state->id_rx_free_list.dl_head != NULL) { 4084 /* rx rsrcs were never freed. 
Just repost them */ 4085 len = state->id_rx_buf_sz; 4086 list = state->id_rx_free_list.dl_head; 4087 state->id_rx_free_list.dl_head = NULL; 4088 state->id_rx_free_list.dl_cnt = 0; 4089 mutex_exit(&state->id_rx_free_list.dl_mutex); 4090 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4091 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4092 if ((rwqe->rwqe_im_mblk = desballoc( 4093 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 4094 &rwqe->w_freemsg_cb)) == NULL) { 4095 /* allow freemsg_cb to free the rwqes */ 4096 if (atomic_dec_32_nv(&state->id_running) != 0) { 4097 cmn_err(CE_WARN, "ibd_init_rxlist: " 4098 "id_running was not 1\n"); 4099 } 4100 DPRINT(10, "ibd_init_rxlist : " 4101 "failed in desballoc()"); 4102 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4103 rwqe = next) { 4104 next = WQE_TO_RWQE(rwqe->rwqe_next); 4105 if (rwqe->rwqe_im_mblk) { 4106 atomic_inc_32(&state-> 4107 id_rx_list. 4108 dl_bufs_outstanding); 4109 freemsg(rwqe->rwqe_im_mblk); 4110 } else 4111 ibd_free_rwqe(state, rwqe); 4112 } 4113 atomic_inc_32(&state->id_running); 4114 return (DDI_FAILURE); 4115 } 4116 } 4117 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4118 return (DDI_SUCCESS); 4119 } 4120 mutex_exit(&state->id_rx_free_list.dl_mutex); 4121 4122 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 4123 return (DDI_FAILURE); 4124 4125 /* 4126 * Allocate and setup the rwqe list 4127 */ 4128 len = state->id_rx_buf_sz; 4129 lkey = state->id_rx_mr_desc.md_lkey; 4130 rwqe = state->id_rx_wqes; 4131 bufaddr = state->id_rx_bufs; 4132 list = NULL; 4133 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { 4134 rwqe->w_state = state; 4135 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 4136 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 4137 4138 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 4139 4140 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 4141 &rwqe->w_freemsg_cb)) == NULL) { 4143 /* allow freemsg_cb to free the rwqes */ 4144 if (atomic_dec_32_nv(&state->id_running) != 0) { 4145 cmn_err(CE_WARN, "ibd_init_rxlist: " 4146 "id_running was not 1\n"); 4147 } 4148 DPRINT(10, "ibd_init_rxlist : " 4149 "failed in desballoc()"); 4150 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4151 rwqe = next) { 4152 next = WQE_TO_RWQE(rwqe->rwqe_next); 4153 freemsg(rwqe->rwqe_im_mblk); 4154 } 4155 atomic_inc_32(&state->id_running); 4156 4157 /* remove reference to free'd rwqes */ 4158 mutex_enter(&state->id_rx_free_list.dl_mutex); 4159 state->id_rx_free_list.dl_head = NULL; 4160 state->id_rx_free_list.dl_cnt = 0; 4161 mutex_exit(&state->id_rx_free_list.dl_mutex); 4162 4163 ibd_fini_rxlist(state); 4164 return (DDI_FAILURE); 4165 } 4166 4167 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 4168 rwqe->rwqe_copybuf.ic_sgl.ds_va = 4169 (ib_vaddr_t)(uintptr_t)bufaddr; 4170 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 4171 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 4172 rwqe->w_rwr.wr_nds = 1; 4173 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 4174 4175 rwqe->rwqe_next = list; 4176 list = RWQE_TO_WQE(rwqe); 4177 } 4178 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4179 4180 return (DDI_SUCCESS); 4181 } 4182 4183 static void 4184 ibd_free_rx_copybufs(ibd_state_t *state) 4185 { 4186 int i; 4187 4188 /* 4189 * Unregister rxbuf mr 4190 */ 4191 if (ibt_deregister_mr(state->id_hca_hdl, 4192 state->id_rx_mr_hdl) != IBT_SUCCESS) { 4193 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 4194 } 4195 state->id_rx_mr_hdl = NULL; 4196 4197 /* 4198 * Free rxbuf memory 4199 */ 4200 for (i = 0; i < 
state->id_rx_nqueues; i++) { 4201 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4202 mutex_destroy(&rxp->rx_post_lock); 4203 } 4204 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 4205 sizeof (ibd_rx_queue_t)); 4206 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * 4207 sizeof (ibd_rwqe_t)); 4208 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * 4209 state->id_rx_buf_sz); 4210 state->id_rx_queues = NULL; 4211 state->id_rx_wqes = NULL; 4212 state->id_rx_bufs = NULL; 4213 } 4214 4215 static void 4216 ibd_free_rx_rsrcs(ibd_state_t *state) 4217 { 4218 mutex_enter(&state->id_rx_free_list.dl_mutex); 4219 if (state->id_rx_free_list.dl_head == NULL) { 4220 /* already freed */ 4221 mutex_exit(&state->id_rx_free_list.dl_mutex); 4222 return; 4223 } 4224 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); 4225 ibd_free_rx_copybufs(state); 4226 state->id_rx_free_list.dl_cnt = 0; 4227 state->id_rx_free_list.dl_head = NULL; 4228 mutex_exit(&state->id_rx_free_list.dl_mutex); 4229 } 4230 4231 /* 4232 * Free the statically allocated Rx buffer list. 4233 */ 4234 static void 4235 ibd_fini_rxlist(ibd_state_t *state) 4236 { 4237 ibd_rwqe_t *rwqe; 4238 int i; 4239 4240 /* run through the rx_queue's, calling freemsg() */ 4241 for (i = 0; i < state->id_rx_nqueues; i++) { 4242 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4243 mutex_enter(&rxp->rx_post_lock); 4244 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4245 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4246 freemsg(rwqe->rwqe_im_mblk); 4247 rxp->rx_cnt--; 4248 } 4249 rxp->rx_head = NULL; 4250 mutex_exit(&rxp->rx_post_lock); 4251 } 4252 4253 /* cannot free rx resources unless gld returned everything */ 4254 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4255 ibd_free_rx_rsrcs(state); 4256 } 4257 4258 /* 4259 * Free an allocated recv wqe. 4260 */ 4261 /* ARGSUSED */ 4262 static void 4263 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4264 { 4265 /* 4266 * desballoc() failed (no memory). 4267 * 4268 * This rwqe is placed on a free list so that it 4269 * can be reinstated when memory is available. 4270 * 4271 * NOTE: no code currently exists to reinstate 4272 * these "lost" rwqes. 4273 */ 4274 mutex_enter(&state->id_rx_free_list.dl_mutex); 4275 state->id_rx_free_list.dl_cnt++; 4276 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4277 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4278 mutex_exit(&state->id_rx_free_list.dl_mutex); 4279 } 4280 4281 /* 4282 * IBA Rx completion queue handler. Guaranteed to be single 4283 * threaded and nonreentrant for this CQ. 4284 */ 4285 /* ARGSUSED */ 4286 static void 4287 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4288 { 4289 ibd_state_t *state = (ibd_state_t *)arg; 4290 4291 atomic_inc_64(&state->id_num_intrs); 4292 4293 if (ibd_rx_softintr == 1) { 4294 mutex_enter(&state->id_rcq_poll_lock); 4295 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4296 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4297 mutex_exit(&state->id_rcq_poll_lock); 4298 return; 4299 } else { 4300 mutex_exit(&state->id_rcq_poll_lock); 4301 ddi_trigger_softintr(state->id_rx); 4302 } 4303 } else 4304 (void) ibd_intr((caddr_t)state); 4305 } 4306 4307 /* 4308 * CQ handler for Tx completions, when the Tx CQ is in 4309 * interrupt driven mode. 
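 *
 * Both this handler and ibd_rcq_handler() above use the same hand-off
 * protocol with the polling code: if a poller already owns the CQ
 * (IBD_CQ_POLLING set), the handler only records IBD_REDO_CQ_POLLING
 * and returns; the poller is then expected to poll once more before
 * dropping the busy flag, so no completion is missed. Sketch of the
 * pattern (illustrative, not the literal code):
 *
 *	mutex_enter(&poll_lock);
 *	if (poll_busy & IBD_CQ_POLLING) {
 *		poll_busy |= IBD_REDO_CQ_POLLING; /* poller loops again */
 *		mutex_exit(&poll_lock);
 *		return;
 *	}
 *	mutex_exit(&poll_lock);
 *	ddi_trigger_softintr(softint_id); /* kick off a fresh poll */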
4310 */ 4311 /* ARGSUSED */ 4312 static void 4313 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4314 { 4315 ibd_state_t *state = (ibd_state_t *)arg; 4316 4317 atomic_inc_64(&state->id_num_intrs); 4318 4319 if (ibd_tx_softintr == 1) { 4320 mutex_enter(&state->id_scq_poll_lock); 4321 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4322 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4323 mutex_exit(&state->id_scq_poll_lock); 4324 return; 4325 } else { 4326 mutex_exit(&state->id_scq_poll_lock); 4327 ddi_trigger_softintr(state->id_tx); 4328 } 4329 } else 4330 (void) ibd_tx_recycle((caddr_t)state); 4331 } 4332 4333 /* 4334 * Multicast group create/delete trap handler. These will be delivered 4335 * on a kernel thread (handling can thus block) and can be invoked 4336 * concurrently. The handler can be invoked anytime after it is 4337 * registered and before ibt_detach(). 4338 */ 4339 /* ARGSUSED */ 4340 static void 4341 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4342 ibt_subnet_event_t *event) 4343 { 4344 ibd_state_t *state = (ibd_state_t *)arg; 4345 ibd_req_t *req; 4346 4347 /* 4348 * The trap handler will get invoked once for every event for 4349 * every port. The input "gid" is the GID0 of the port the 4350 * trap came in on; we just need to act on traps that came 4351 * to our port, meaning the port on which the ipoib interface 4352 * resides. Since ipoib uses GID0 of the port, we just match 4353 * the gids to check whether we need to handle the trap. 4354 */ 4355 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4356 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4357 return; 4358 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4359 4360 DPRINT(10, "ibd_notices_handler : %d\n", code); 4361 4362 switch (code) { 4363 case IBT_SM_EVENT_UNAVAILABLE: 4364 /* 4365 * If we are in promiscuous mode or have 4366 * sendnonmembers, we need to print a warning 4367 * message right now. Else, just store the 4368 * information, print when we enter promiscuous 4369 * mode or attempt nonmember send. We might 4370 * also want to stop caching sendnonmember. 4371 */ 4372 ibd_print_warn(state, "IBA multicast support " 4373 "degraded due to unavailability of multicast " 4374 "traps"); 4375 break; 4376 case IBT_SM_EVENT_AVAILABLE: 4377 /* 4378 * If we printed a warning message above or 4379 * while trying to nonmember send or get into 4380 * promiscuous mode, print an okay message. 4381 */ 4382 ibd_print_warn(state, "IBA multicast support " 4383 "restored due to availability of multicast " 4384 "traps"); 4385 break; 4386 case IBT_SM_EVENT_MCG_CREATED: 4387 case IBT_SM_EVENT_MCG_DELETED: 4388 /* 4389 * If it is a "deleted" event and we are in late hca 4390 * init, nothing to do. 4391 */ 4392 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4393 IBD_DRV_IN_LATE_HCA_INIT) && (code == 4394 IBT_SM_EVENT_MCG_DELETED)) { 4395 break; 4396 } 4397 /* 4398 * Common processing of creation/deletion traps. 4399 * First check if the instance is being 4400 * [de]initialized; back off then, without doing 4401 * anything more, since we are not sure if the 4402 * async thread is around, or whether we might 4403 * be racing with the detach code in ibd_m_stop() 4404 * that scans the mcg list. 
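 *
 * The queued request carries the trap's MGID. For reference, the
 * IPoIB multicast GID prefix encodes the scope and the PKEY, as
 * constructed in ibd_find_bgroup() and decoded in ibd_async_trap()
 * (sketch, using this file's own expressions):
 *
 *	prefix = ((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
 *	    ((uint64_t)scope << 48) | ((uint32_t)(pkey << 16));
 *	pkey = (prefix >> 16) & 0xffff;		/* the inverse */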
4405 */ 4406 if (!ibd_async_safe(state)) 4407 return; 4408 4409 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4410 req->rq_gid = event->sm_notice_gid; 4411 req->rq_ptr = (void *)code; 4412 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4413 break; 4414 } 4415 } 4416 4417 static void 4418 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4419 { 4420 ib_gid_t mgid = req->rq_gid; 4421 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4422 int ret; 4423 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; 4424 4425 DPRINT(10, "ibd_async_trap : %d\n", code); 4426 4427 /* 4428 * Check if we have already joined the IPoIB broadcast group for our 4429 * PKEY. If joined, perform the rest of the operation. 4430 * Else, the interface is not initialised. Do the initialisation here 4431 * by calling ibd_start() and return. 4432 */ 4433 4434 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4435 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && 4436 (code == IBT_SM_EVENT_MCG_CREATED)) { 4437 /* 4438 * If we are in late HCA init and a notification for the 4439 * creation of a MCG came in, check if it is the IPoIB MCG for 4440 * this pkey. If not, return. 4441 */ 4442 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != 4443 state->id_pkey)) { 4444 ibd_async_done(state); 4445 return; 4446 } 4447 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4448 /* 4449 * Check if there is still a necessity to start the interface. 4450 * It is possible that the user attempted unplumb at just about 4451 * the same time, and if unplumb succeeded, we have nothing to 4452 * do. 4453 */ 4454 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4455 IBD_DRV_IN_LATE_HCA_INIT) && 4456 ((ret = ibd_start(state)) != 0)) { 4457 DPRINT(10, "ibd_async_trap: cannot start from late HCA " 4458 "init, ret=%d", ret); 4459 } 4460 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4461 ibd_async_done(state); 4462 return; 4463 } 4464 4465 /* 4466 * Atomically search the nonmember and sendonlymember lists and 4467 * delete. 4468 */ 4469 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4470 4471 if (state->id_prom_op == IBD_OP_COMPLETED) { 4472 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4473 4474 /* 4475 * If in promiscuous mode, try to join/attach to the new 4476 * mcg. Given the unreliable out-of-order mode of trap 4477 * delivery, we can never be sure whether it is a problem 4478 * if the join fails. Thus, we warn the admin of a failure 4479 * if this was a creation trap. Note that the trap might 4480 * actually be reporting a long past event, and the mcg 4481 * might already have been deleted, thus we might be warning 4482 * in vain. 4483 */ 4484 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4485 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4486 ibd_print_warn(state, "IBA promiscuous mode missed " 4487 "new multicast gid %016llx:%016llx", 4488 (u_longlong_t)mgid.gid_prefix, 4489 (u_longlong_t)mgid.gid_guid); 4490 } 4491 4492 /* 4493 * Free the request slot allocated by the subnet event thread. 4494 */ 4495 ibd_async_done(state); 4496 } 4497 4498 /* 4499 * GLDv3 entry point to get capabilities. 
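 *
 * Returning B_FALSE tells the GLDv3 framework that the queried
 * capability is unsupported, in which case cap_data is ignored.
 * A minimal sketch of the framework's side of the exchange
 * (illustrative only, not the actual mac-layer code):
 *
 *	uint32_t txflags = 0;
 *	if (mc_getcapab(arg, MAC_CAPAB_HCKSUM, &txflags) == B_TRUE)
 *		/* hardware checksum may be requested per txflags */;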
4500 */ 4501 static boolean_t 4502 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4503 { 4504 ibd_state_t *state = arg; 4505 4506 if (state->id_type == IBD_PORT_DRIVER) 4507 return (B_FALSE); 4508 4509 switch (cap) { 4510 case MAC_CAPAB_HCKSUM: { 4511 uint32_t *txflags = cap_data; 4512 4513 /* 4514 * We either do full checksum or not do it at all 4515 */ 4516 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4517 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4518 else 4519 return (B_FALSE); 4520 break; 4521 } 4522 4523 case MAC_CAPAB_LSO: { 4524 mac_capab_lso_t *cap_lso = cap_data; 4525 4526 /* 4527 * In addition to the capability and policy, since LSO 4528 * relies on hw checksum, we'll not enable LSO if we 4529 * don't have hw checksum. Of course, if the HCA doesn't 4530 * provide the reserved lkey capability, enabling LSO will 4531 * actually affect performance adversely, so we'll disable 4532 * LSO even for that case. 4533 */ 4534 if (!state->id_lso_policy || !state->id_lso_capable) 4535 return (B_FALSE); 4536 4537 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4538 return (B_FALSE); 4539 4540 if (state->id_hca_res_lkey_capab == 0) { 4541 ibd_print_warn(state, "no reserved-lkey capability, " 4542 "disabling LSO"); 4543 return (B_FALSE); 4544 } 4545 4546 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4547 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4548 break; 4549 } 4550 4551 default: 4552 return (B_FALSE); 4553 } 4554 4555 return (B_TRUE); 4556 } 4557 4558 /* 4559 * callback function for set/get of properties 4560 */ 4561 static int 4562 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4563 uint_t pr_valsize, const void *pr_val) 4564 { 4565 ibd_state_t *state = arg; 4566 int err = 0; 4567 uint32_t link_mode; 4568 4569 /* Cannot set properties on a port driver */ 4570 if (state->id_type == IBD_PORT_DRIVER) { 4571 return (ENOTSUP); 4572 } 4573 4574 switch (pr_num) { 4575 case MAC_PROP_IB_LINKMODE: 4576 if (state->id_mac_state & IBD_DRV_STARTED) { 4577 err = EBUSY; 4578 break; 4579 } 4580 if (pr_val == NULL) { 4581 err = EINVAL; 4582 break; 4583 } 4584 bcopy(pr_val, &link_mode, sizeof (link_mode)); 4585 if (link_mode != IBD_LINK_MODE_UD && 4586 link_mode != IBD_LINK_MODE_RC) { 4587 err = EINVAL; 4588 } else { 4589 if (link_mode == IBD_LINK_MODE_RC) { 4590 if (state->id_enable_rc) { 4591 return (0); 4592 } 4593 state->id_enable_rc = 1; 4594 /* inform MAC framework of new MTU */ 4595 err = mac_maxsdu_update(state->id_mh, 4596 state->rc_mtu - IPOIB_HDRSIZE); 4597 } else { 4598 if (!state->id_enable_rc) { 4599 return (0); 4600 } 4601 state->id_enable_rc = 0; 4602 err = mac_maxsdu_update(state->id_mh, 4603 state->id_mtu - IPOIB_HDRSIZE); 4604 } 4605 (void) ibd_record_capab(state); 4606 mac_capab_update(state->id_mh); 4607 } 4608 break; 4609 case MAC_PROP_PRIVATE: 4610 err = ibd_set_priv_prop(state, pr_name, 4611 pr_valsize, pr_val); 4612 break; 4613 default: 4614 err = ENOTSUP; 4615 break; 4616 } 4617 return (err); 4618 } 4619 4620 static int 4621 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4622 uint_t pr_valsize, void *pr_val) 4623 { 4624 ibd_state_t *state = arg; 4625 int err = 0; 4626 4627 switch (pr_num) { 4628 case MAC_PROP_MTU: 4629 break; 4630 default: 4631 if (state->id_type == IBD_PORT_DRIVER) { 4632 return (ENOTSUP); 4633 } 4634 break; 4635 } 4636 4637 switch (pr_num) { 4638 case MAC_PROP_IB_LINKMODE: 4639 *(uint_t *)pr_val = state->id_enable_rc; 4640 break; 4641 case MAC_PROP_PRIVATE: 4642 err = 
ibd_get_priv_prop(state, pr_name, pr_valsize, 4643 pr_val); 4644 break; 4645 default: 4646 err = ENOTSUP; 4647 break; 4648 } 4649 return (err); 4650 } 4651 4652 static void 4653 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4654 mac_prop_info_handle_t prh) 4655 { 4656 ibd_state_t *state = arg; 4657 4658 switch (pr_num) { 4659 case MAC_PROP_IB_LINKMODE: { 4660 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); 4661 break; 4662 } 4663 case MAC_PROP_MTU: { 4664 uint32_t min, max; 4665 if (state->id_type == IBD_PORT_DRIVER) { 4666 min = 1500; 4667 max = IBD_DEF_RC_MAX_SDU; 4668 } else if (state->id_enable_rc) { 4669 min = max = IBD_DEF_RC_MAX_SDU; 4670 } else { 4671 min = max = state->id_mtu - IPOIB_HDRSIZE; 4672 } 4673 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4674 mac_prop_info_set_range_uint32(prh, min, max); 4675 break; 4676 } 4677 case MAC_PROP_PRIVATE: { 4678 char valstr[64]; 4679 int value; 4680 4681 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4682 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4683 return; 4684 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4685 value = IBD_DEF_COALESCE_COMPLETIONS; 4686 } else if (strcmp(pr_name, 4687 "_ibd_create_broadcast_group") == 0) { 4688 value = IBD_DEF_CREATE_BCAST_GROUP; 4689 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4690 value = IBD_DEF_HASH_SIZE; 4691 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4692 value = IBD_DEF_LSO_POLICY; 4693 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4694 value = IBD_DEF_NUM_AH; 4695 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4696 value = IBD_DEF_NUM_LSO_BUFS; 4697 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4698 value = IBD_DEF_RC_ENABLE_SRQ; 4699 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4700 value = IBD_DEF_RC_NUM_RWQE; 4701 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4702 value = IBD_DEF_RC_NUM_SRQ; 4703 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4704 value = IBD_DEF_RC_NUM_SWQE; 4705 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4706 value = IBD_DEF_RC_RX_COMP_COUNT; 4707 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4708 value = IBD_DEF_RC_RX_COMP_USEC; 4709 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4710 value = IBD_DEF_RC_RX_COPY_THRESH; 4711 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4712 value = IBD_DEF_RC_RX_RWQE_THRESH; 4713 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4714 value = IBD_DEF_RC_TX_COMP_COUNT; 4715 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4716 value = IBD_DEF_RC_TX_COMP_USEC; 4717 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4718 value = IBD_DEF_RC_TX_COPY_THRESH; 4719 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4720 value = IBD_DEF_UD_NUM_RWQE; 4721 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4722 value = IBD_DEF_UD_NUM_SWQE; 4723 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4724 value = IBD_DEF_UD_RX_COMP_COUNT; 4725 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4726 value = IBD_DEF_UD_RX_COMP_USEC; 4727 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4728 value = IBD_DEF_UD_TX_COMP_COUNT; 4729 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4730 value = IBD_DEF_UD_TX_COMP_USEC; 4731 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4732 value = IBD_DEF_UD_TX_COPY_THRESH; 4733 } else { 4734 return; 4735 } 4736 4737 (void) snprintf(valstr, sizeof 
(valstr), "%d", value); 4738 mac_prop_info_set_default_str(prh, valstr); 4739 break; 4740 } 4741 } /* switch (pr_num) */ 4742 } 4743 4744 /* ARGSUSED2 */ 4745 static int 4746 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, 4747 uint_t pr_valsize, const void *pr_val) 4748 { 4749 int err = 0; 4750 long result; 4751 4752 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4753 if (pr_val == NULL) { 4754 return (EINVAL); 4755 } 4756 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4757 if (result < 0 || result > 1) { 4758 err = EINVAL; 4759 } else { 4760 state->id_allow_coalesce_comp_tuning = (result == 1) ? 4761 B_TRUE: B_FALSE; 4762 } 4763 return (err); 4764 } 4765 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4766 if (state->id_mac_state & IBD_DRV_STARTED) { 4767 return (EBUSY); 4768 } 4769 if (pr_val == NULL) { 4770 return (EINVAL); 4771 } 4772 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4773 if (result < 0 || result > 1) { 4774 err = EINVAL; 4775 } else { 4776 state->id_create_broadcast_group = (result == 1) ? 4777 B_TRUE: B_FALSE; 4778 } 4779 return (err); 4780 } 4781 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4782 if (state->id_mac_state & IBD_DRV_STARTED) { 4783 return (EBUSY); 4784 } 4785 if (pr_val == NULL) { 4786 return (EINVAL); 4787 } 4788 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4789 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { 4790 err = EINVAL; 4791 } else { 4792 state->id_hash_size = (uint32_t)result; 4793 } 4794 return (err); 4795 } 4796 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4797 if (state->id_mac_state & IBD_DRV_STARTED) { 4798 return (EBUSY); 4799 } 4800 if (pr_val == NULL) { 4801 return (EINVAL); 4802 } 4803 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4804 if (result < 0 || result > 1) { 4805 err = EINVAL; 4806 } else { 4807 state->id_lso_policy = (result == 1) ? 4808 B_TRUE: B_FALSE; 4809 } 4810 mac_capab_update(state->id_mh); 4811 return (err); 4812 } 4813 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4814 if (state->id_mac_state & IBD_DRV_STARTED) { 4815 return (EBUSY); 4816 } 4817 if (pr_val == NULL) { 4818 return (EINVAL); 4819 } 4820 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4821 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { 4822 err = EINVAL; 4823 } else { 4824 state->id_num_ah = (uint32_t)result; 4825 } 4826 return (err); 4827 } 4828 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4829 if (state->id_mac_state & IBD_DRV_STARTED) { 4830 return (EBUSY); 4831 } 4832 if (!state->id_lso_policy || !state->id_lso_capable) { 4833 return (EINVAL); 4834 } 4835 if (pr_val == NULL) { 4836 return (EINVAL); 4837 } 4838 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4839 if (result < IBD_MIN_NUM_LSO_BUFS || 4840 result > IBD_MAX_NUM_LSO_BUFS) { 4841 err = EINVAL; 4842 } else { 4843 state->id_num_lso_bufs = (uint32_t)result; 4844 } 4845 return (err); 4846 } 4847 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4848 if (state->id_mac_state & IBD_DRV_STARTED) { 4849 return (EBUSY); 4850 } 4851 if (pr_val == NULL) { 4852 return (EINVAL); 4853 } 4854 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4855 if (result < 0 || result > 1) { 4856 err = EINVAL; 4857 } else { 4858 state->rc_enable_srq = (result == 1) ? 
4859 B_TRUE: B_FALSE; 4860 } 4861 if (!state->rc_enable_srq) { 4862 state->id_rc_num_srq = 0; 4863 } 4864 return (err); 4865 } 4866 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4867 if (state->id_mac_state & IBD_DRV_STARTED) { 4868 return (EBUSY); 4869 } 4870 if (pr_val == NULL) { 4871 return (EINVAL); 4872 } 4873 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4874 if (result < IBD_MIN_RC_NUM_RWQE || 4875 result > IBD_MAX_RC_NUM_RWQE) { 4876 err = EINVAL; 4877 } else { 4878 state->id_rc_num_rwqe = (uint32_t)result; 4879 if (state->id_allow_coalesce_comp_tuning && 4880 state->id_rc_rx_comp_count > state->id_rc_num_rwqe) 4881 state->id_rc_rx_comp_count = 4882 state->id_rc_num_rwqe; 4883 if (state->id_rc_num_srq > state->id_rc_num_rwqe) 4884 state->id_rc_num_srq = 4885 state->id_rc_num_rwqe - 1; 4886 /* 4887 * If rx_rwqe_threshold is greater than the number of 4888 * rwqes, pull it back to 25% of number of rwqes. 4889 */ 4890 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) 4891 state->id_rc_rx_rwqe_thresh = 4892 (state->id_rc_num_rwqe >> 2); 4893 4894 } 4895 return (err); 4896 } 4897 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4898 if (state->id_mac_state & IBD_DRV_STARTED) { 4899 return (EBUSY); 4900 } 4901 if (pr_val == NULL) { 4902 return (EINVAL); 4903 } 4904 if (!state->rc_enable_srq) 4905 return (EINVAL); 4906 4907 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4908 if (result < IBD_MIN_RC_NUM_SRQ || 4909 result >= state->id_rc_num_rwqe) { 4910 err = EINVAL; 4911 } else 4912 state->id_rc_num_srq = (uint32_t)result; 4913 return (err); 4914 } 4915 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4916 if (state->id_mac_state & IBD_DRV_STARTED) { 4917 return (EBUSY); 4918 } 4919 if (pr_val == NULL) { 4920 return (EINVAL); 4921 } 4922 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4923 if (result < IBD_MIN_RC_NUM_SWQE || 4924 result > IBD_MAX_RC_NUM_SWQE) { 4925 err = EINVAL; 4926 } else { 4927 state->id_rc_num_swqe = (uint32_t)result; 4928 if (state->id_allow_coalesce_comp_tuning && 4929 state->id_rc_tx_comp_count > state->id_rc_num_swqe) 4930 state->id_rc_tx_comp_count = 4931 state->id_rc_num_swqe; 4932 } 4933 return (err); 4934 } 4935 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4936 if (!state->id_allow_coalesce_comp_tuning) { 4937 return (ENOTSUP); 4938 } 4939 if (pr_val == NULL) { 4940 return (EINVAL); 4941 } 4942 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4943 if (result < 1 || result > state->id_rc_num_rwqe) { 4944 err = EINVAL; 4945 } else { 4946 state->id_rc_rx_comp_count = (uint32_t)result; 4947 } 4948 return (err); 4949 } 4950 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4951 if (!state->id_allow_coalesce_comp_tuning) { 4952 return (ENOTSUP); 4953 } 4954 if (pr_val == NULL) { 4955 return (EINVAL); 4956 } 4957 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4958 if (result < 1) { 4959 err = EINVAL; 4960 } else { 4961 state->id_rc_rx_comp_usec = (uint32_t)result; 4962 } 4963 return (err); 4964 } 4965 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4966 if (state->id_mac_state & IBD_DRV_STARTED) { 4967 return (EBUSY); 4968 } 4969 if (pr_val == NULL) { 4970 return (EINVAL); 4971 } 4972 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4973 if (result < IBD_MIN_RC_RX_COPY_THRESH || 4974 result > state->rc_mtu) { 4975 err = EINVAL; 4976 } else { 4977 state->id_rc_rx_copy_thresh = (uint32_t)result; 4978 } 4979 return (err); 4980 } 4981 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4982 if 
(state->id_mac_state & IBD_DRV_STARTED) { 4983 return (EBUSY); 4984 } 4985 if (pr_val == NULL) { 4986 return (EINVAL); 4987 } 4988 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4989 if (result < IBD_MIN_RC_RX_RWQE_THRESH || 4990 result >= state->id_rc_num_rwqe) { 4991 err = EINVAL; 4992 } else { 4993 state->id_rc_rx_rwqe_thresh = (uint32_t)result; 4994 } 4995 return (err); 4996 } 4997 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4998 if (!state->id_allow_coalesce_comp_tuning) { 4999 return (ENOTSUP); 5000 } 5001 if (pr_val == NULL) { 5002 return (EINVAL); 5003 } 5004 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5005 if (result < 1 || result > state->id_rc_num_swqe) { 5006 err = EINVAL; 5007 } else { 5008 state->id_rc_tx_comp_count = (uint32_t)result; 5009 } 5010 return (err); 5011 } 5012 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5013 if (!state->id_allow_coalesce_comp_tuning) { 5014 return (ENOTSUP); 5015 } 5016 if (pr_val == NULL) { 5017 return (EINVAL); 5018 } 5019 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5020 if (result < 1) 5021 err = EINVAL; 5022 else { 5023 state->id_rc_tx_comp_usec = (uint32_t)result; 5024 } 5025 return (err); 5026 } 5027 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5028 if (state->id_mac_state & IBD_DRV_STARTED) { 5029 return (EBUSY); 5030 } 5031 if (pr_val == NULL) { 5032 return (EINVAL); 5033 } 5034 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5035 if (result < IBD_MIN_RC_TX_COPY_THRESH || 5036 result > state->rc_mtu) { 5037 err = EINVAL; 5038 } else { 5039 state->id_rc_tx_copy_thresh = (uint32_t)result; 5040 } 5041 return (err); 5042 } 5043 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5044 if (state->id_mac_state & IBD_DRV_STARTED) { 5045 return (EBUSY); 5046 } 5047 if (pr_val == NULL) { 5048 return (EINVAL); 5049 } 5050 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5051 if (result < IBD_MIN_UD_NUM_RWQE || 5052 result > IBD_MAX_UD_NUM_RWQE) { 5053 err = EINVAL; 5054 } else { 5055 if (result > state->id_hca_max_chan_sz) { 5056 state->id_ud_num_rwqe = 5057 state->id_hca_max_chan_sz; 5058 } else { 5059 state->id_ud_num_rwqe = (uint32_t)result; 5060 } 5061 if (state->id_allow_coalesce_comp_tuning && 5062 state->id_ud_rx_comp_count > state->id_ud_num_rwqe) 5063 state->id_ud_rx_comp_count = 5064 state->id_ud_num_rwqe; 5065 } 5066 return (err); 5067 } 5068 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5069 if (state->id_mac_state & IBD_DRV_STARTED) { 5070 return (EBUSY); 5071 } 5072 if (pr_val == NULL) { 5073 return (EINVAL); 5074 } 5075 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5076 if (result < IBD_MIN_UD_NUM_SWQE || 5077 result > IBD_MAX_UD_NUM_SWQE) { 5078 err = EINVAL; 5079 } else { 5080 if (result > state->id_hca_max_chan_sz) { 5081 state->id_ud_num_swqe = 5082 state->id_hca_max_chan_sz; 5083 } else { 5084 state->id_ud_num_swqe = (uint32_t)result; 5085 } 5086 if (state->id_allow_coalesce_comp_tuning && 5087 state->id_ud_tx_comp_count > state->id_ud_num_swqe) 5088 state->id_ud_tx_comp_count = 5089 state->id_ud_num_swqe; 5090 } 5091 return (err); 5092 } 5093 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5094 if (!state->id_allow_coalesce_comp_tuning) { 5095 return (ENOTSUP); 5096 } 5097 if (pr_val == NULL) { 5098 return (EINVAL); 5099 } 5100 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5101 if (result < 1 || result > state->id_ud_num_rwqe) { 5102 err = EINVAL; 5103 } else { 5104 state->id_ud_rx_comp_count = (uint32_t)result; 5105 } 5106 return (err); 5107 } 
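	/*
	 * The *_comp_count/*_comp_usec pairs accepted above and below
	 * feed the CQ interrupt moderation applied in ibd_alloc_cqs(),
	 * roughly (sketch):
	 *
	 *	(void) ibt_modify_cq(cq_hdl, comp_count, comp_usec, 0);
	 *
	 * i.e. interrupt once comp_count completions have accumulated
	 * or comp_usec microseconds have elapsed, whichever comes first.
	 */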
5108 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5109 if (!state->id_allow_coalesce_comp_tuning) { 5110 return (ENOTSUP); 5111 } 5112 if (pr_val == NULL) { 5113 return (EINVAL); 5114 } 5115 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5116 if (result < 1) { 5117 err = EINVAL; 5118 } else { 5119 state->id_ud_rx_comp_usec = (uint32_t)result; 5120 } 5121 return (err); 5122 } 5123 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5124 if (!state->id_allow_coalesce_comp_tuning) { 5125 return (ENOTSUP); 5126 } 5127 if (pr_val == NULL) { 5128 return (EINVAL); 5129 } 5130 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5131 if (result < 1 || result > state->id_ud_num_swqe) { 5132 err = EINVAL; 5133 } else { 5134 state->id_ud_tx_comp_count = (uint32_t)result; 5135 } 5136 return (err); 5137 } 5138 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5139 if (!state->id_allow_coalesce_comp_tuning) { 5140 return (ENOTSUP); 5141 } 5142 if (pr_val == NULL) { 5143 return (EINVAL); 5144 } 5145 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5146 if (result < 1) { 5147 err = EINVAL; 5148 } else { 5149 state->id_ud_tx_comp_usec = (uint32_t)result; 5150 } 5151 return (err); 5152 } 5153 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5154 if (state->id_mac_state & IBD_DRV_STARTED) { 5155 return (EBUSY); 5156 } 5157 if (pr_val == NULL) { 5158 return (EINVAL); 5159 } 5160 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5161 if (result < IBD_MIN_UD_TX_COPY_THRESH || 5162 result > IBD_MAX_UD_TX_COPY_THRESH) { 5163 err = EINVAL; 5164 } else { 5165 state->id_ud_tx_copy_thresh = (uint32_t)result; 5166 } 5167 return (err); 5168 } 5169 return (ENOTSUP); 5170 } 5171 5172 static int 5173 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, 5174 void *pr_val) 5175 { 5176 int err = ENOTSUP; 5177 int value; 5178 5179 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 5180 value = state->id_bgroup_present; 5181 err = 0; 5182 goto done; 5183 } 5184 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 5185 value = state->id_allow_coalesce_comp_tuning; 5186 err = 0; 5187 goto done; 5188 } 5189 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 5190 value = state->id_create_broadcast_group; 5191 err = 0; 5192 goto done; 5193 } 5194 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 5195 value = state->id_hash_size; 5196 err = 0; 5197 goto done; 5198 } 5199 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 5200 value = state->id_lso_policy; 5201 err = 0; 5202 goto done; 5203 } 5204 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 5205 value = state->id_num_ah; 5206 err = 0; 5207 goto done; 5208 } 5209 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 5210 value = state->id_num_lso_bufs; 5211 err = 0; 5212 goto done; 5213 } 5214 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 5215 value = state->rc_enable_srq; 5216 err = 0; 5217 goto done; 5218 } 5219 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 5220 value = state->id_rc_num_rwqe; 5221 err = 0; 5222 goto done; 5223 } 5224 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 5225 value = state->id_rc_num_srq; 5226 err = 0; 5227 goto done; 5228 } 5229 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 5230 value = state->id_rc_num_swqe; 5231 err = 0; 5232 goto done; 5233 } 5234 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 5235 value = state->id_rc_rx_comp_count; 5236 err = 0; 5237 goto done; 5238 } 5239 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 5240 value = state->id_rc_rx_comp_usec; 5241 err = 0; 5242 
goto done; 5243 } 5244 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 5245 value = state->id_rc_rx_copy_thresh; 5246 err = 0; 5247 goto done; 5248 } 5249 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 5250 value = state->id_rc_rx_rwqe_thresh; 5251 err = 0; 5252 goto done; 5253 } 5254 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5255 value = state->id_rc_tx_comp_count; 5256 err = 0; 5257 goto done; 5258 } 5259 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5260 value = state->id_rc_tx_comp_usec; 5261 err = 0; 5262 goto done; 5263 } 5264 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5265 value = state->id_rc_tx_copy_thresh; 5266 err = 0; 5267 goto done; 5268 } 5269 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5270 value = state->id_ud_num_rwqe; 5271 err = 0; 5272 goto done; 5273 } 5274 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5275 value = state->id_ud_num_swqe; 5276 err = 0; 5277 goto done; 5278 } 5279 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5280 value = state->id_ud_rx_comp_count; 5281 err = 0; 5282 goto done; 5283 } 5284 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5285 value = state->id_ud_rx_comp_usec; 5286 err = 0; 5287 goto done; 5288 } 5289 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5290 value = state->id_ud_tx_comp_count; 5291 err = 0; 5292 goto done; 5293 } 5294 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5295 value = state->id_ud_tx_comp_usec; 5296 err = 0; 5297 goto done; 5298 } 5299 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5300 value = state->id_ud_tx_copy_thresh; 5301 err = 0; 5302 goto done; 5303 } 5304 done: 5305 if (err == 0) { 5306 (void) snprintf(pr_val, pr_valsize, "%d", value); 5307 } 5308 return (err); 5309 } 5310 5311 static int 5312 ibd_get_port_details(ibd_state_t *state) 5313 { 5314 ibt_hca_portinfo_t *port_infop; 5315 ibt_status_t ret; 5316 uint_t psize, port_infosz; 5317 5318 mutex_enter(&state->id_link_mutex); 5319 5320 /* 5321 * Query for port information 5322 */ 5323 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 5324 &port_infop, &psize, &port_infosz); 5325 if ((ret != IBT_SUCCESS) || (psize != 1)) { 5326 mutex_exit(&state->id_link_mutex); 5327 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 5328 "failed, ret=%d", ret); 5329 return (ENETDOWN); 5330 } 5331 5332 /* 5333 * If the link is active, verify the pkey 5334 */ 5335 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { 5336 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 5337 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 5338 state->id_link_state = LINK_STATE_DOWN; 5339 } else { 5340 state->id_link_state = LINK_STATE_UP; 5341 } 5342 state->id_mtu = (128 << port_infop->p_mtu); 5343 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5344 state->id_sgid = *port_infop->p_sgid_tbl; 5345 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5346 /* 5347 * Now that the port is active, record the port speed 5348 */ 5349 state->id_link_speed = ibd_get_portspeed(state); 5350 } else { 5351 /* Make sure that these are handled in PORT_UP/CHANGE */ 5352 state->id_mtu = 0; 5353 state->id_link_state = LINK_STATE_DOWN; 5354 state->id_link_speed = 0; 5355 } 5356 mutex_exit(&state->id_link_mutex); 5357 ibt_free_portinfo(port_infop, port_infosz); 5358 5359 return (0); 5360 } 5361 5362 static int 5363 ibd_alloc_cqs(ibd_state_t *state) 5364 { 5365 ibt_hca_attr_t hca_attrs; 5366 ibt_cq_attr_t cq_attr; 5367 ibt_status_t ret; 5368 uint32_t real_size; 5369 uint_t num_rwqe_change = 0; 5370 uint_t 
num_swqe_change = 0; 5371 5372 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 5373 ASSERT(ret == IBT_SUCCESS); 5374 5375 /* 5376 * Allocate Rx/combined CQ: 5377 * Theoretically, there is no point in having more cqe's than 5378 * #rwqe plus #swqe, except that the CQ will be signaled for 5379 * overflow if the last wqe completes while none of the previous 5380 * cqe's have been polled. Thus, we keep the number of wqe's one 5381 * less than the CQ size, so that such an overflow cannot occur. 5382 */ 5383 cq_attr.cq_sched = NULL; 5384 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 5385 5386 /* 5387 * Allocate Receive CQ. 5388 */ 5389 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { 5390 cq_attr.cq_size = state->id_ud_num_rwqe + 1; 5391 } else { 5392 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5393 num_rwqe_change = state->id_ud_num_rwqe; 5394 state->id_ud_num_rwqe = cq_attr.cq_size - 1; 5395 } 5396 5397 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5398 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 5399 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 5400 "failed, ret=%d\n", ret); 5401 return (DDI_FAILURE); 5402 } 5403 5404 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, 5405 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { 5406 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 5407 "moderation failed, ret=%d\n", ret); 5408 } 5409 5410 /* make the #rx wc's the same as max rx chain size */ 5411 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 5412 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 5413 state->id_rxwcs_size, KM_SLEEP); 5414 5415 /* 5416 * Allocate Send CQ. 5417 */ 5418 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { 5419 cq_attr.cq_size = state->id_ud_num_swqe + 1; 5420 } else { 5421 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5422 num_swqe_change = state->id_ud_num_swqe; 5423 state->id_ud_num_swqe = cq_attr.cq_size - 1; 5424 } 5425 5426 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5427 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 5428 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 5429 "failed, ret=%d\n", ret); 5430 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 5431 state->id_rxwcs_size); 5432 (void) ibt_free_cq(state->id_rcq_hdl); 5433 return (DDI_FAILURE); 5434 } 5435 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, 5436 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { 5437 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 5438 "moderation failed, ret=%d\n", ret); 5439 } 5440 5441 state->id_txwcs_size = IBD_TX_POLL_THRESH; 5442 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 5443 state->id_txwcs_size, KM_SLEEP); 5444 5445 /* 5446 * Print message in case we could not allocate as many wqe's 5447 * as were requested.
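 * For example (hypothetical numbers): with hca_max_cq_sz == 4096 and
 * 8000 rwqe's requested, the Rx CQ is allocated with 4096 entries and
 * id_ud_num_rwqe is pulled back to 4095, which is reported below.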
5448 */ 5449 if (num_rwqe_change) { 5450 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 5451 "%d", state->id_ud_num_rwqe, num_rwqe_change); 5452 } 5453 if (num_swqe_change) { 5454 ibd_print_warn(state, "Setting #swqe = %d instead of default " 5455 "%d", state->id_ud_num_swqe, num_swqe_change); 5456 } 5457 5458 return (DDI_SUCCESS); 5459 } 5460 5461 static int 5462 ibd_setup_ud_channel(ibd_state_t *state) 5463 { 5464 ibt_ud_chan_alloc_args_t ud_alloc_attr; 5465 ibt_ud_chan_query_attr_t ud_chan_attr; 5466 ibt_status_t ret; 5467 5468 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 5469 if (state->id_hca_res_lkey_capab) 5470 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 5471 if (state->id_lso_policy && state->id_lso_capable) 5472 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 5473 5474 ud_alloc_attr.ud_hca_port_num = state->id_port; 5475 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 5476 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 5477 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; 5478 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; 5479 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 5480 ud_alloc_attr.ud_scq = state->id_scq_hdl; 5481 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 5482 ud_alloc_attr.ud_pd = state->id_pd_hdl; 5483 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 5484 ud_alloc_attr.ud_clone_chan = NULL; 5485 5486 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 5487 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 5488 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 5489 "failed, ret=%d\n", ret); 5490 return (DDI_FAILURE); 5491 } 5492 5493 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 5494 &ud_chan_attr)) != IBT_SUCCESS) { 5495 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 5496 "failed, ret=%d\n", ret); 5497 (void) ibt_free_channel(state->id_chnl_hdl); 5498 return (DDI_FAILURE); 5499 } 5500 5501 state->id_qpnum = ud_chan_attr.ud_qpn; 5502 5503 return (DDI_SUCCESS); 5504 } 5505 5506 static int 5507 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 5508 { 5509 uint32_t progress = state->id_mac_state; 5510 uint_t attempts; 5511 ibt_status_t ret; 5512 ib_gid_t mgid; 5513 ibd_mce_t *mce; 5514 uint8_t jstate; 5515 timeout_id_t tid; 5516 5517 if (atomic_dec_32_nv(&state->id_running) != 0) 5518 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 5519 5520 /* 5521 * Before we try to stop/undo whatever we did in ibd_start(), 5522 * we need to mark the link state appropriately to prevent the 5523 * ip layer from using this instance for any new transfers. Note 5524 * that if the original state of the link was "up" when we're 5525 * here, we'll set the final link state to "unknown", to behave 5526 * in the same fashion as other ethernet drivers. 
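 *
 * Each teardown step below is keyed off an IBD_DRV_* progress bit
 * captured from id_mac_state, and the bit is cleared once the
 * corresponding resource has been undone. The recurring pattern is
 * (sketch; IBD_DRV_FOO is a placeholder, not a real flag):
 *
 *	if (progress & IBD_DRV_FOO) {
 *		undo_foo(state);
 *		state->id_mac_state &= (~IBD_DRV_FOO);
 *	}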
5527 */ 5528 mutex_enter(&state->id_link_mutex); 5529 if (cur_link_state == LINK_STATE_DOWN) { 5530 state->id_link_state = cur_link_state; 5531 } else { 5532 state->id_link_state = LINK_STATE_UNKNOWN; 5533 } 5534 mutex_exit(&state->id_link_mutex); 5535 bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); 5536 mac_link_update(state->id_mh, state->id_link_state); 5537 5538 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 5539 if (progress & IBD_DRV_STARTED) { 5540 state->id_mac_state &= (~IBD_DRV_STARTED); 5541 } 5542 5543 if (progress & IBD_DRV_IN_LATE_HCA_INIT) { 5544 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); 5545 } 5546 5547 /* Stop listen under Reliable Connected Mode */ 5548 if (progress & IBD_DRV_RC_LISTEN) { 5549 ASSERT(state->id_enable_rc); 5550 if (state->rc_listen_hdl != NULL) { 5551 ibd_rc_stop_listen(state); 5552 } 5553 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 5554 } 5555 5556 /* Stop timeout routine */ 5557 if (progress & IBD_DRV_RC_TIMEOUT) { 5558 ASSERT(state->id_enable_rc); 5559 mutex_enter(&state->rc_timeout_lock); 5560 state->rc_timeout_start = B_FALSE; 5561 tid = state->rc_timeout; 5562 state->rc_timeout = 0; 5563 mutex_exit(&state->rc_timeout_lock); 5564 if (tid != 0) 5565 (void) untimeout(tid); 5566 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT); 5567 } 5568 5569 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 5570 attempts = 100; 5571 while (state->id_ah_op == IBD_OP_ONGOING) { 5572 /* 5573 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB 5574 * port is connecting to a remote IPoIB port. Wait for 5575 * the end of this connecting operation. 5576 */ 5577 delay(drv_usectohz(100000)); 5578 if (--attempts == 0) { 5579 state->rc_stop_connect++; 5580 DPRINT(40, "ibd_undo_start: connecting"); 5581 break; 5582 } 5583 } 5584 mutex_enter(&state->id_sched_lock); 5585 state->id_sched_needed = 0; 5586 mutex_exit(&state->id_sched_lock); 5587 (void) ibd_rc_close_all_chan(state); 5588 } 5589 5590 /* 5591 * First, stop receive interrupts; this stops the driver from 5592 * handing up buffers to higher layers. Wait for receive buffers 5593 * to be returned and give up after 1 second. 5594 */ 5595 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 5596 attempts = 10; 5597 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 5598 0) > 0) { 5599 delay(drv_usectohz(100000)); 5600 if (--attempts == 0) { 5601 /* 5602 * There are pending bufs with the network 5603 * layer and we have no choice but to wait 5604 * for them to be done with. Reap all the 5605 * Tx/Rx completions that were posted since 5606 * we turned off the notification and 5607 * return failure. 5608 */ 5609 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 5610 DPRINT(2, "ibd_undo_start: " 5611 "reclaiming failed"); 5612 break; 5613 } 5614 } 5615 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 5616 } 5617 5618 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 5619 ibd_rc_fini_tx_largebuf_list(state); 5620 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 5621 } 5622 5623 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 5624 ASSERT(state->id_enable_rc); 5625 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 5626 if (state->id_ah_op == IBD_OP_ONGOING) { 5627 delay(drv_usectohz(10000)); 5628 if (state->id_ah_op == IBD_OP_ONGOING) { 5629 /* 5630 * "state->id_ah_op == IBD_OP_ONGOING" 5631 * means this IPoIB port is connecting 5632 * to a remote IPoIB port. We can't 5633 * delete SRQ here. 
5634 */ 5635 state->rc_stop_connect++; 5636 DPRINT(40, "ibd_undo_start: " 5637 "connecting"); 5638 } else { 5639 ibd_rc_fini_srq_list(state); 5640 state->id_mac_state &= 5641 (~IBD_DRV_RC_SRQ_ALLOCD); 5642 } 5643 } else { 5644 ibd_rc_fini_srq_list(state); 5645 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 5646 } 5647 } else { 5648 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n"); 5649 } 5650 } 5651 5652 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 5653 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 5654 5655 mutex_enter(&state->id_trap_lock); 5656 state->id_trap_stop = B_TRUE; 5657 while (state->id_trap_inprog > 0) 5658 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 5659 mutex_exit(&state->id_trap_lock); 5660 5661 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 5662 } 5663 5664 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 5665 /* 5666 * Flushing the channel ensures that all pending WQE's 5667 * are marked with flush_error and handed to the CQ. It 5668 * does not guarantee the invocation of the CQ handler. 5669 * This call is guaranteed to return successfully for 5670 * UD QPNs. 5671 */ 5672 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 5673 IBT_SUCCESS) { 5674 DPRINT(10, "ibd_undo_start: flush_channel " 5675 "failed, ret=%d", ret); 5676 } 5677 5678 /* 5679 * Give some time for the TX CQ handler to process the 5680 * completions. 5681 */ 5682 attempts = 10; 5683 mutex_enter(&state->id_tx_list.dl_mutex); 5684 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5685 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 5686 != state->id_ud_num_swqe) { 5687 if (--attempts == 0) 5688 break; 5689 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5690 mutex_exit(&state->id_tx_list.dl_mutex); 5691 delay(drv_usectohz(100000)); 5692 mutex_enter(&state->id_tx_list.dl_mutex); 5693 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5694 } 5695 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5696 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 5697 state->id_ud_num_swqe) { 5698 cmn_err(CE_WARN, "tx resources not freed\n"); 5699 } 5700 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5701 mutex_exit(&state->id_tx_list.dl_mutex); 5702 5703 attempts = 10; 5704 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5705 if (--attempts == 0) 5706 break; 5707 delay(drv_usectohz(100000)); 5708 } 5709 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 5710 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5711 cmn_err(CE_WARN, "rx resources not freed\n"); 5712 } 5713 5714 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 5715 } 5716 5717 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 5718 /* 5719 * Drop all residual full/non membership. This includes full 5720 * membership to the broadcast group, and any nonmembership 5721 * acquired during transmits. We do this after the Tx completion 5722 * handlers are done, since those might result in some late 5723 * leaves; this also eliminates a potential race with that 5724 * path wrt the mc full list insert/delete. Trap handling 5725 * has also been suppressed at this point. Thus, no locks 5726 * are required while traversing the mc full list. 
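 * Note the traversal idiom in the loop below: list_next() fetches the
 * next mce before ibd_leave_group() is invoked on the current one,
 * since the leave can free the element being visited.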
5727 */ 5728 DPRINT(2, "ibd_undo_start: clear full cache entries"); 5729 mce = list_head(&state->id_mc_full); 5730 while (mce != NULL) { 5731 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5732 jstate = mce->mc_jstate; 5733 mce = list_next(&state->id_mc_full, mce); 5734 ibd_leave_group(state, mgid, jstate); 5735 } 5736 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 5737 } 5738 5739 if (progress & IBD_DRV_RXLIST_ALLOCD) { 5740 ibd_fini_rxlist(state); 5741 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 5742 } 5743 5744 if (progress & IBD_DRV_TXLIST_ALLOCD) { 5745 ibd_fini_txlist(state); 5746 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 5747 } 5748 5749 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 5750 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 5751 IBT_SUCCESS) { 5752 DPRINT(10, "ibd_undo_start: free_channel " 5753 "failed, ret=%d", ret); 5754 } 5755 5756 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 5757 } 5758 5759 if (progress & IBD_DRV_CQS_ALLOCD) { 5760 kmem_free(state->id_txwcs, 5761 sizeof (ibt_wc_t) * state->id_txwcs_size); 5762 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 5763 IBT_SUCCESS) { 5764 DPRINT(10, "ibd_undo_start: free_cq(scq) " 5765 "failed, ret=%d", ret); 5766 } 5767 5768 kmem_free(state->id_rxwcs, 5769 sizeof (ibt_wc_t) * state->id_rxwcs_size); 5770 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 5771 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 5772 "ret=%d", ret); 5773 } 5774 5775 state->id_txwcs = NULL; 5776 state->id_rxwcs = NULL; 5777 state->id_scq_hdl = NULL; 5778 state->id_rcq_hdl = NULL; 5779 5780 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 5781 } 5782 5783 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 5784 mutex_enter(&state->id_ac_mutex); 5785 mod_hash_destroy_hash(state->id_ah_active_hash); 5786 mutex_exit(&state->id_ac_mutex); 5787 ibd_acache_fini(state); 5788 5789 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 5790 } 5791 5792 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 5793 /* 5794 * If we'd created the ipoib broadcast group and had 5795 * successfully joined it, leave it now 5796 */ 5797 if (state->id_bgroup_created) { 5798 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 5799 jstate = IB_MC_JSTATE_FULL; 5800 (void) ibt_leave_mcg(state->id_sgid, mgid, 5801 state->id_sgid, jstate); 5802 } 5803 ibt_free_mcg_info(state->id_mcinfo, 1); 5804 5805 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 5806 } 5807 5808 return (DDI_SUCCESS); 5809 } 5810 5811 /* 5812 * These pair of routines are used to set/clear the condition that 5813 * the caller is likely to do something to change the id_mac_state. 5814 * If there's already someone doing either a start or a stop (possibly 5815 * due to the async handler detecting a pkey relocation event, a plumb 5816 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 5817 * that's done. 5818 */ 5819 static void 5820 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 5821 { 5822 mutex_enter(&state->id_macst_lock); 5823 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 5824 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 5825 5826 state->id_mac_state |= flag; 5827 mutex_exit(&state->id_macst_lock); 5828 } 5829 5830 static void 5831 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 5832 { 5833 mutex_enter(&state->id_macst_lock); 5834 state->id_mac_state &= (~flag); 5835 cv_signal(&state->id_macst_cv); 5836 mutex_exit(&state->id_macst_lock); 5837 } 5838 5839 /* 5840 * GLDv3 entry point to start hardware. 
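 * A sketch of the serialization this entry point relies on, using the
 * ibd_set/clr_mac_progress() routines above (interleaving shown for
 * illustration only; IBD_DRV_RESTART_IN_PROGRESS is assumed to cover
 * both the start and stop bits, as its use there suggests):
 *
 *	thread A: ibd_set_mac_progress(s, IBD_DRV_START_IN_PROGRESS);
 *	thread B: ibd_set_mac_progress(s, IBD_DRV_STOP_IN_PROGRESS);
 *		  ... B cv_wait()s until A does ...
 *	thread A: ibd_clr_mac_progress(s, IBD_DRV_START_IN_PROGRESS);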
5841 */ 5842 /*ARGSUSED*/ 5843 static int 5844 ibd_m_start(void *arg) 5845 { 5846 ibd_state_t *state = arg; 5847 int ret; 5848 5849 if (state->id_type == IBD_PORT_DRIVER) 5850 return (EINVAL); 5851 5852 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5853 if (state->id_mac_state & IBD_DRV_IN_DELETION) { 5854 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5855 return (EIO); 5856 } 5857 5858 ret = ibd_start(state); 5859 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5860 return (ret); 5861 } 5862 5863 static int 5864 ibd_start(ibd_state_t *state) 5865 { 5866 int err; 5867 ibt_status_t ret; 5868 int late_hca_init = 0; 5869 5870 if (state->id_mac_state & IBD_DRV_STARTED) 5871 return (DDI_SUCCESS); 5872 5873 /* 5874 * We do not increment the running flag when calling ibd_start() as 5875 * a result of some event which moves the state away from late HCA 5876 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. 5877 */ 5878 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 5879 (atomic_inc_32_nv(&state->id_running) != 1)) { 5880 DPRINT(10, "ibd_start: id_running is non-zero"); 5881 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 5882 atomic_dec_32(&state->id_running); 5883 return (EINVAL); 5884 } 5885 5886 /* 5887 * Get port details; if we fail here, something bad happened. 5888 * Fail plumb. 5889 */ 5890 if ((err = ibd_get_port_details(state)) != 0) { 5891 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 5892 goto start_fail; 5893 } 5894 /* 5895 * If state->id_link_state is DOWN, it indicates that either the port 5896 * is down, or the pkey is not available. In both cases, resort to late 5897 * initialization. Register for subnet notices, and return success. 5898 */ 5899 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 5900 if (state->id_link_state == LINK_STATE_DOWN) { 5901 late_hca_init = 1; 5902 goto late_hca_init_return; 5903 } 5904 5905 /* 5906 * Find the IPoIB broadcast group 5907 */ 5908 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 5909 /* Resort to late initialization */ 5910 late_hca_init = 1; 5911 goto reg_snet_notices; 5912 } 5913 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 5914 5915 /* 5916 * Initialize per-interface caches and lists; if we fail here, 5917 * it is most likely due to a lack of resources 5918 */ 5919 if (ibd_acache_init(state) != DDI_SUCCESS) { 5920 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 5921 err = ENOMEM; 5922 goto start_fail; 5923 } 5924 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 5925 5926 /* 5927 * Allocate send and receive completion queues 5928 */ 5929 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 5930 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 5931 err = ENOMEM; 5932 goto start_fail; 5933 } 5934 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 5935 5936 /* 5937 * Setup a UD channel 5938 */ 5939 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 5940 err = ENOMEM; 5941 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 5942 goto start_fail; 5943 } 5944 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 5945 5946 /* 5947 * Allocate and initialize the tx buffer list 5948 */ 5949 if (ibd_init_txlist(state) != DDI_SUCCESS) { 5950 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 5951 err = ENOMEM; 5952 goto start_fail; 5953 } 5954 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 5955 5956 /* 5957 * Create the send cq handler here 5958 */ 5959 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 5960 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 5961 
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5962 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 5963 "failed, ret=%d", ret); 5964 err = EINVAL; 5965 goto start_fail; 5966 } 5967 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 5968 5969 /* 5970 * Allocate and initialize the rx buffer list 5971 */ 5972 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 5973 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 5974 err = ENOMEM; 5975 goto start_fail; 5976 } 5977 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 5978 5979 /* 5980 * Join IPoIB broadcast group 5981 */ 5982 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 5983 DPRINT(10, "ibd_start: ibd_join_group() failed"); 5984 err = ENOTACTIVE; 5985 goto start_fail; 5986 } 5987 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 5988 5989 /* 5990 * When we did mac_register() in ibd_attach(), we didn't register 5991 * the real macaddr and we didn't have the true port mtu. Now that 5992 * we're almost ready, set the local mac address and broadcast 5993 * addresses and update gldv3 about the real values of these 5994 * parameters. 5995 */ 5996 if (state->id_enable_rc) { 5997 ibd_h2n_mac(&state->id_macaddr, 5998 IBD_MAC_ADDR_RC + state->id_qpnum, 5999 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 6000 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 6001 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 6002 } else { 6003 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 6004 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 6005 } 6006 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 6007 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 6008 6009 if (!state->id_enable_rc) { 6010 (void) mac_maxsdu_update(state->id_mh, state->id_mtu 6011 - IPOIB_HDRSIZE); 6012 } 6013 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6014 6015 /* 6016 * Setup the receive cq handler 6017 */ 6018 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 6019 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 6020 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 6021 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 6022 "failed, ret=%d", ret); 6023 err = EINVAL; 6024 goto start_fail; 6025 } 6026 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 6027 6028 reg_snet_notices: 6029 /* 6030 * In case of normal initialization sequence, 6031 * Setup the subnet notices handler after we've initialized the acache/ 6032 * mcache and started the async thread, both of which are required for 6033 * the trap handler to function properly. 6034 * 6035 * Now that the async thread has been started (and we've already done 6036 * a mac_register() during attach so mac_tx_update() can be called 6037 * if necessary without any problem), we can enable the trap handler 6038 * to queue requests to the async thread. 6039 * 6040 * In case of late hca initialization, the subnet notices handler will 6041 * only handle MCG created/deleted event. The action performed as part 6042 * of handling these events is to start the interface. So, the 6043 * acache/mcache initialization is not a necessity in such cases for 6044 * registering the subnet notices handler. Also, if we are in 6045 * ibd_start() as a result of, say, some event handling after entering 6046 * late hca initialization phase no need to register again. 
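 * As an illustration of the late hca initialization case described
 * above, the rough event flow (a sketch of the surrounding code, not
 * a new interface) is:
 *
 *	SM creates the IPoIB MCG
 *	    -> ibd_snet_notices_handler() receives the MCG created event
 *	    -> a request is queued to the async thread
 *	    -> ibd_start() runs again and, with the pkey/mcg now
 *	       available, completes the initialization steps above.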
6047 */ 6048 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { 6049 ibt_register_subnet_notices(state->id_ibt_hdl, 6050 ibd_snet_notices_handler, state); 6051 mutex_enter(&state->id_trap_lock); 6052 state->id_trap_stop = B_FALSE; 6053 mutex_exit(&state->id_trap_lock); 6054 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 6055 } 6056 6057 late_hca_init_return: 6058 if (late_hca_init == 1) { 6059 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; 6060 /* 6061 * In case of late initialization, mark the link state as down, 6062 * immaterial of the actual link state as reported in the 6063 * port_info. 6064 */ 6065 state->id_link_state = LINK_STATE_DOWN; 6066 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6067 mac_link_update(state->id_mh, state->id_link_state); 6068 return (DDI_SUCCESS); 6069 } 6070 6071 if (state->id_enable_rc) { 6072 if (state->rc_enable_srq) { 6073 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 6074 if (ibd_rc_repost_srq_free_list(state) != 6075 IBT_SUCCESS) { 6076 err = ENOMEM; 6077 goto start_fail; 6078 } 6079 } else { 6080 /* Allocate SRQ resource */ 6081 if (ibd_rc_init_srq_list(state) != 6082 IBT_SUCCESS) { 6083 err = ENOMEM; 6084 goto start_fail; 6085 } 6086 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 6087 } 6088 } 6089 6090 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 6091 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 6092 "failed"); 6093 err = ENOMEM; 6094 goto start_fail; 6095 } 6096 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 6097 6098 /* RC: begin to listen only after everything is available */ 6099 if (ibd_rc_listen(state) != IBT_SUCCESS) { 6100 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 6101 err = EINVAL; 6102 goto start_fail; 6103 } 6104 state->id_mac_state |= IBD_DRV_RC_LISTEN; 6105 } 6106 6107 /* 6108 * Indicate link status to GLDv3 and higher layers. By default, 6109 * we assume we are in up state (which must have been true at 6110 * least at the time the broadcast mcg's were probed); if there 6111 * were any up/down transitions till the time we come here, the 6112 * async handler will have updated last known state, which we 6113 * use to tell GLDv3. The async handler will not send any 6114 * notifications to GLDv3 till we reach here in the initialization 6115 * sequence. 6116 */ 6117 mac_link_update(state->id_mh, state->id_link_state); 6118 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; 6119 state->id_mac_state |= IBD_DRV_STARTED; 6120 6121 /* Start timer after everything is ready */ 6122 if (state->id_enable_rc) { 6123 mutex_enter(&state->rc_timeout_lock); 6124 state->rc_timeout_start = B_TRUE; 6125 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state, 6126 SEC_TO_TICK(ibd_rc_conn_timeout)); 6127 mutex_exit(&state->rc_timeout_lock); 6128 state->id_mac_state |= IBD_DRV_RC_TIMEOUT; 6129 } 6130 6131 return (DDI_SUCCESS); 6132 6133 start_fail: 6134 /* 6135 * If we ran into a problem during ibd_start() and ran into 6136 * some other problem during undoing our partial work, we can't 6137 * do anything about it. Ignore any errors we might get from 6138 * ibd_undo_start() and just return the original error we got. 6139 */ 6140 (void) ibd_undo_start(state, LINK_STATE_DOWN); 6141 return (err); 6142 } 6143 6144 /* 6145 * GLDv3 entry point to stop hardware from receiving packets. 
6146  */
6147 /*ARGSUSED*/
6148 static void
6149 ibd_m_stop(void *arg)
6150 {
6151 	ibd_state_t *state = (ibd_state_t *)arg;
6152 
6153 	if (state->id_type == IBD_PORT_DRIVER)
6154 		return;
6155 
6156 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6157 
6158 	(void) ibd_undo_start(state, state->id_link_state);
6159 
6160 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6161 }
6162 
6163 /*
6164  * GLDv3 entry point to modify device's mac address. We do not
6165  * allow address modifications.
6166  */
6167 static int
6168 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6169 {
6170 	ibd_state_t *state = arg;
6171 
6172 	if (state->id_type == IBD_PORT_DRIVER)
6173 		return (EINVAL);
6174 
6175 	/*
6176 	 * Don't bother even comparing the macaddr if we haven't
6177 	 * completed ibd_m_start().
6178 	 */
6179 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6180 		return (0);
6181 
6182 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6183 		return (0);
6184 	else
6185 		return (EINVAL);
6186 }
6187 
6188 /*
6189  * The blocking part of the IBA join/leave operations is done out
6190  * of here on the async thread.
6191  */
6192 static void
6193 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6194 {
6195 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6196 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6197 
6198 	if (op == IBD_ASYNC_JOIN) {
6199 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6200 			ibd_print_warn(state, "Join multicast group failed :"
6201 			    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6202 		}
6203 	} else {
6204 		/*
6205 		 * Here, we must search for the proper mcg_info and
6206 		 * use that to leave the group.
6207 		 */
6208 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6209 	}
6210 }
6211 
6212 /*
6213  * GLDv3 entry point for multicast enable/disable requests.
6214  * This function queues the operation to the async thread and
6215  * returns success for a valid multicast address.
6216  */
6217 static int
6218 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6219 {
6220 	ibd_state_t *state = (ibd_state_t *)arg;
6221 	ipoib_mac_t maddr, *mcast;
6222 	ib_gid_t mgid;
6223 	ibd_req_t *req;
6224 
6225 	if (state->id_type == IBD_PORT_DRIVER)
6226 		return (EINVAL);
6227 
6228 	/*
6229 	 * If we haven't completed ibd_m_start(), the async thread wouldn't
6230 	 * have been started and id_bcaddr wouldn't be set, so there's
6231 	 * no point in continuing.
6232 	 */
6233 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6234 		return (0);
6235 
6236 	/*
6237 	 * The incoming multicast address might not be aligned properly
6238 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6239 	 * it to look like one though, to get the offsets of the mc gid,
6240 	 * since we know we are not going to dereference any values with
6241 	 * the ipoib_mac_t pointer.
6242 	 */
6243 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6244 	mcast = &maddr;
6245 
6246 	/*
6247 	 * Check validity of MCG address. We could additionally check
6248 	 * that an enable/disable is not being issued on the "broadcast"
6249 	 * mcg, but since this operation is only invokable by privileged
6250 	 * programs anyway, we allow the flexibility to those dlpi apps.
6251 	 * Note that we do not validate the "scope" of the IBA mcg.
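 * For reference, the layout this test relies on: an ipoib_mac_t is
 * the 4-byte QPN field followed by the 16-byte MGID, and a multicast
 * address carries the well-known multicast QPN, so the check below
 * is (ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN.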
6252  */
6253 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6254 		return (EINVAL);
6255 
6256 	/*
6257 	 * fill in multicast pkey and scope
6258 	 */
6259 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6260 
6261 	/*
6262 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6263 	 * nothing (i.e. we stay JOINed to the broadcast group done in
6264 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6265 	 * requires us to be joined to broadcast groups at all times.
6266 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6267 	 * depends on this.
6268 	 */
6269 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6270 		return (0);
6271 
6272 	ibd_n2h_gid(mcast, &mgid);
6273 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6274 	if (req == NULL)
6275 		return (ENOMEM);
6276 
6277 	req->rq_gid = mgid;
6278 
6279 	if (add) {
6280 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6281 		    mgid.gid_prefix, mgid.gid_guid);
6282 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6283 	} else {
6284 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
6285 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6286 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6287 	}
6288 	return (0);
6289 }
6290 
6291 /*
6292  * The blocking part of the IBA promiscuous operations is done
6293  * out of here on the async thread. This routine may be invoked
6294  * either due to a dlpi request or due to a port up/down
6295  * event.
6296  */
6297 static void
6298 ibd_async_unsetprom(ibd_state_t *state)
6299 {
6300 	ibd_mce_t *mce = list_head(&state->id_mc_non);
6301 	ib_gid_t mgid;
6302 
6303 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6304 
6305 	while (mce != NULL) {
6306 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
6307 		mce = list_next(&state->id_mc_non, mce);
6308 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6309 	}
6310 	state->id_prom_op = IBD_OP_NOTSTARTED;
6311 }
6312 
6313 /*
6314  * The blocking part of the IBA promiscuous operations is done
6315  * out of here on the async thread. This routine may be invoked
6316  * either due to a dlpi request or due to a port up/down
6317  * event.
6318  */
6319 static void
6320 ibd_async_setprom(ibd_state_t *state)
6321 {
6322 	ibt_mcg_attr_t mcg_attr;
6323 	ibt_mcg_info_t *mcg_info;
6324 	ib_gid_t mgid;
6325 	uint_t numg;
6326 	int i;
6327 	char ret = IBD_OP_COMPLETED;
6328 
6329 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
6330 
6331 	/*
6332 	 * Obtain all active MC groups on the IB fabric with
6333 	 * specified criteria (scope + Pkey + Qkey + mtu).
6334 	 */
6335 	bzero(&mcg_attr, sizeof (mcg_attr));
6336 	mcg_attr.mc_pkey = state->id_pkey;
6337 	mcg_attr.mc_scope = state->id_scope;
6338 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6339 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6340 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6341 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6342 	    IBT_SUCCESS) {
6343 		ibd_print_warn(state, "Could not get list of IBA multicast "
6344 		    "groups");
6345 		ret = IBD_OP_ERRORED;
6346 		goto done;
6347 	}
6348 
6349 	/*
6350 	 * Iterate over the returned mcg's and join as NonMember
6351 	 * to the IP mcg's.
6352 	 */
6353 	for (i = 0; i < numg; i++) {
6354 		/*
6355 		 * Do a NonMember JOIN on the MC group.
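		 * (A NonMember join asks the SM to deliver the group's
		 * traffic to this port without making us a full member;
		 * each group collected here is later released, one mcg
		 * at a time, by ibd_async_unsetprom() above.)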
6356 */ 6357 mgid = mcg_info[i].mc_adds_vect.av_dgid; 6358 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 6359 ibd_print_warn(state, "IBA promiscuous mode missed " 6360 "multicast gid %016llx:%016llx", 6361 (u_longlong_t)mgid.gid_prefix, 6362 (u_longlong_t)mgid.gid_guid); 6363 } 6364 6365 ibt_free_mcg_info(mcg_info, numg); 6366 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 6367 done: 6368 state->id_prom_op = ret; 6369 } 6370 6371 /* 6372 * GLDv3 entry point for multicast promiscuous enable/disable requests. 6373 * GLDv3 assumes phys state receives more packets than multi state, 6374 * which is not true for IPoIB. Thus, treat the multi and phys 6375 * promiscuous states the same way to work with GLDv3's assumption. 6376 */ 6377 static int 6378 ibd_m_promisc(void *arg, boolean_t on) 6379 { 6380 ibd_state_t *state = (ibd_state_t *)arg; 6381 ibd_req_t *req; 6382 6383 if (state->id_type == IBD_PORT_DRIVER) 6384 return (EINVAL); 6385 6386 /* 6387 * Async thread wouldn't have been started if we haven't 6388 * passed ibd_m_start() 6389 */ 6390 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6391 return (0); 6392 6393 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6394 if (req == NULL) 6395 return (ENOMEM); 6396 if (on) { 6397 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 6398 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 6399 } else { 6400 DPRINT(1, "ibd_m_promisc : unset_promisc"); 6401 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 6402 } 6403 6404 return (0); 6405 } 6406 6407 /* 6408 * GLDv3 entry point for gathering statistics. 6409 */ 6410 static int 6411 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 6412 { 6413 ibd_state_t *state = (ibd_state_t *)arg; 6414 6415 switch (stat) { 6416 case MAC_STAT_IFSPEED: 6417 *val = state->id_link_speed; 6418 break; 6419 case MAC_STAT_MULTIRCV: 6420 *val = state->id_multi_rcv; 6421 break; 6422 case MAC_STAT_BRDCSTRCV: 6423 *val = state->id_brd_rcv; 6424 break; 6425 case MAC_STAT_MULTIXMT: 6426 *val = state->id_multi_xmt; 6427 break; 6428 case MAC_STAT_BRDCSTXMT: 6429 *val = state->id_brd_xmt; 6430 break; 6431 case MAC_STAT_RBYTES: 6432 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 6433 + state->rc_rcv_copy_byte; 6434 break; 6435 case MAC_STAT_IPACKETS: 6436 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 6437 + state->rc_rcv_copy_pkt; 6438 break; 6439 case MAC_STAT_OBYTES: 6440 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 6441 break; 6442 case MAC_STAT_OPACKETS: 6443 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 6444 state->rc_xmt_fragmented_pkt + 6445 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 6446 break; 6447 case MAC_STAT_OERRORS: 6448 *val = state->id_ah_error; /* failed AH translation */ 6449 break; 6450 case MAC_STAT_IERRORS: 6451 *val = 0; 6452 break; 6453 case MAC_STAT_NOXMTBUF: 6454 *val = state->id_tx_short + state->rc_swqe_short + 6455 state->rc_xmt_buf_short; 6456 break; 6457 case MAC_STAT_NORCVBUF: 6458 default: 6459 return (ENOTSUP); 6460 } 6461 6462 return (0); 6463 } 6464 6465 static void 6466 ibd_async_txsched(ibd_state_t *state) 6467 { 6468 ibd_resume_transmission(state); 6469 } 6470 6471 static void 6472 ibd_resume_transmission(ibd_state_t *state) 6473 { 6474 int flag; 6475 int met_thresh = 0; 6476 int thresh = 0; 6477 int ret = -1; 6478 6479 mutex_enter(&state->id_sched_lock); 6480 if (state->id_sched_needed & IBD_RSRC_SWQE) { 6481 mutex_enter(&state->id_tx_list.dl_mutex); 6482 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6483 met_thresh = 
state->id_tx_list.dl_cnt +
6484 		    state->id_tx_rel_list.dl_cnt;
6485 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6486 		mutex_exit(&state->id_tx_list.dl_mutex);
6487 		thresh = IBD_FREE_SWQES_THRESH;
6488 		flag = IBD_RSRC_SWQE;
6489 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6490 		ASSERT(state->id_lso != NULL);
6491 		mutex_enter(&state->id_lso_lock);
6492 		met_thresh = state->id_lso->bkt_nfree;
6493 		thresh = IBD_FREE_LSOS_THRESH;
6494 		mutex_exit(&state->id_lso_lock);
6495 		flag = IBD_RSRC_LSOBUF;
6496 		if (met_thresh > thresh)
6497 			state->id_sched_lso_cnt++;
6498 	}
6499 	if (met_thresh > thresh) {
6500 		state->id_sched_needed &= ~flag;
6501 		state->id_sched_cnt++;
6502 		ret = 0;
6503 	}
6504 	mutex_exit(&state->id_sched_lock);
6505 
6506 	if (ret == 0)
6507 		mac_tx_update(state->id_mh);
6508 }
6509 
6510 /*
6511  * Release a chain of send wqe's back into the free list.
6512  */
6513 static void
6514 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6515 {
6516 	/*
6517 	 * Add back on Tx list for reuse.
6518 	 */
6519 	ASSERT(tail->swqe_next == NULL);
6520 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6521 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6522 	tail->swqe_next = state->id_tx_rel_list.dl_head;
6523 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6524 	state->id_tx_rel_list.dl_cnt += n;
6525 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
6526 }
6527 
6528 /*
6529  * Acquire a send wqe from the free list.
6530  * Returns the acquired swqe pointer, or NULL if none is available.
6531  */
6532 static ibd_swqe_t *
6533 ibd_acquire_swqe(ibd_state_t *state)
6534 {
6535 	ibd_swqe_t *wqe;
6536 
6537 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6538 	if (state->id_tx_rel_list.dl_head != NULL) {
6539 		/* transfer id_tx_rel_list to id_tx_list */
6540 		state->id_tx_list.dl_head =
6541 		    state->id_tx_rel_list.dl_head;
6542 		state->id_tx_list.dl_cnt =
6543 		    state->id_tx_rel_list.dl_cnt;
6544 		state->id_tx_list.dl_pending_sends = B_FALSE;
6545 
6546 		/* clear id_tx_rel_list */
6547 		state->id_tx_rel_list.dl_head = NULL;
6548 		state->id_tx_rel_list.dl_cnt = 0;
6549 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6550 
6551 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6552 		state->id_tx_list.dl_cnt -= 1;
6553 		state->id_tx_list.dl_head = wqe->swqe_next;
6554 	} else {	/* no free swqe */
6555 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6556 		state->id_tx_list.dl_pending_sends = B_TRUE;
6557 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6558 		state->id_tx_short++;
6559 		wqe = NULL;
6560 	}
6561 	return (wqe);
6562 }
6563 
6564 static int
6565 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6566     ibt_ud_dest_hdl_t ud_dest)
6567 {
6568 	mblk_t *nmp;
6569 	int iph_len, tcph_len;
6570 	ibt_wr_lso_t *lso;
6571 	uintptr_t ip_start, tcp_start;
6572 	uint8_t *dst;
6573 	uint_t pending, mblen;
6574 
6575 	/*
6576 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6577 	 * we need to adjust it here for lso.
6578 	 */
6579 	lso = &(node->w_swr.wr.ud_lso);
6580 	lso->lso_ud_dest = ud_dest;
6581 	lso->lso_mss = mss;
6582 
6583 	/*
6584 	 * Calculate the LSO header size and set it in the UD LSO structure.
6585 	 * Note that the only assumption we make is that each of the IPoIB,
6586 	 * IP and TCP headers will be contained in a single mblk fragment;
6587 	 * together, the headers may span multiple mblk fragments.
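	 * As a worked example: for an IPv4/TCP segment with no IP or TCP
	 * options, and assuming the usual 4-byte IPOIB_HDRSIZE, the code
	 * below computes
	 *
	 *	lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len
	 *		   = 4 + 20 + 20 = 44 bytes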
6588 */ 6589 nmp = mp; 6590 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 6591 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 6592 ip_start = (uintptr_t)nmp->b_cont->b_rptr 6593 + (ip_start - (uintptr_t)(nmp->b_wptr)); 6594 nmp = nmp->b_cont; 6595 6596 } 6597 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 6598 6599 tcp_start = ip_start + iph_len; 6600 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 6601 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 6602 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 6603 nmp = nmp->b_cont; 6604 } 6605 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 6606 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 6607 6608 /* 6609 * If the lso header fits entirely within a single mblk fragment, 6610 * we'll avoid an additional copy of the lso header here and just 6611 * pass the b_rptr of the mblk directly. 6612 * 6613 * If this isn't true, we'd have to allocate for it explicitly. 6614 */ 6615 if (lso->lso_hdr_sz <= MBLKL(mp)) { 6616 lso->lso_hdr = mp->b_rptr; 6617 } else { 6618 /* On work completion, remember to free this allocated hdr */ 6619 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 6620 if (lso->lso_hdr == NULL) { 6621 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 6622 "sz = %d", lso->lso_hdr_sz); 6623 lso->lso_hdr_sz = 0; 6624 lso->lso_mss = 0; 6625 return (-1); 6626 } 6627 } 6628 6629 /* 6630 * Copy in the lso header only if we need to 6631 */ 6632 if (lso->lso_hdr != mp->b_rptr) { 6633 dst = lso->lso_hdr; 6634 pending = lso->lso_hdr_sz; 6635 6636 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 6637 mblen = MBLKL(nmp); 6638 if (pending > mblen) { 6639 bcopy(nmp->b_rptr, dst, mblen); 6640 dst += mblen; 6641 pending -= mblen; 6642 } else { 6643 bcopy(nmp->b_rptr, dst, pending); 6644 break; 6645 } 6646 } 6647 } 6648 6649 return (0); 6650 } 6651 6652 static void 6653 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 6654 { 6655 ibt_wr_lso_t *lso; 6656 6657 if ((!node) || (!mp)) 6658 return; 6659 6660 /* 6661 * Free any header space that we might've allocated if we 6662 * did an LSO 6663 */ 6664 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 6665 lso = &(node->w_swr.wr.ud_lso); 6666 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 6667 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 6668 lso->lso_hdr = NULL; 6669 lso->lso_hdr_sz = 0; 6670 } 6671 } 6672 } 6673 6674 static void 6675 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 6676 { 6677 uint_t i; 6678 uint_t num_posted; 6679 uint_t n_wrs; 6680 ibt_status_t ibt_status; 6681 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 6682 ibd_swqe_t *tx_head, *elem; 6683 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 6684 6685 /* post the one request, then check for more */ 6686 ibt_status = ibt_post_send(state->id_chnl_hdl, 6687 &node->w_swr, 1, NULL); 6688 if (ibt_status != IBT_SUCCESS) { 6689 ibd_print_warn(state, "ibd_post_send: " 6690 "posting one wr failed: ret=%d", ibt_status); 6691 ibd_tx_cleanup(state, node); 6692 } 6693 6694 tx_head = NULL; 6695 for (;;) { 6696 if (tx_head == NULL) { 6697 mutex_enter(&state->id_txpost_lock); 6698 tx_head = state->id_tx_head; 6699 if (tx_head == NULL) { 6700 state->id_tx_busy = 0; 6701 mutex_exit(&state->id_txpost_lock); 6702 return; 6703 } 6704 state->id_tx_head = NULL; 6705 mutex_exit(&state->id_txpost_lock); 6706 } 6707 6708 /* 6709 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 6710 * at a time if possible, and keep posting them. 
6711 */ 6712 for (n_wrs = 0, elem = tx_head; 6713 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 6714 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 6715 nodes[n_wrs] = elem; 6716 wrs[n_wrs] = elem->w_swr; 6717 } 6718 tx_head = elem; 6719 6720 ASSERT(n_wrs != 0); 6721 6722 /* 6723 * If posting fails for some reason, we'll never receive 6724 * completion intimation, so we'll need to cleanup. But 6725 * we need to make sure we don't clean up nodes whose 6726 * wrs have been successfully posted. We assume that the 6727 * hca driver returns on the first failure to post and 6728 * therefore the first 'num_posted' entries don't need 6729 * cleanup here. 6730 */ 6731 num_posted = 0; 6732 ibt_status = ibt_post_send(state->id_chnl_hdl, 6733 wrs, n_wrs, &num_posted); 6734 if (ibt_status != IBT_SUCCESS) { 6735 ibd_print_warn(state, "ibd_post_send: " 6736 "posting multiple wrs failed: " 6737 "requested=%d, done=%d, ret=%d", 6738 n_wrs, num_posted, ibt_status); 6739 6740 for (i = num_posted; i < n_wrs; i++) 6741 ibd_tx_cleanup(state, nodes[i]); 6742 } 6743 } 6744 } 6745 6746 static int 6747 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 6748 uint_t lsohdr_sz) 6749 { 6750 ibt_wr_ds_t *sgl; 6751 ibt_status_t ibt_status; 6752 mblk_t *nmp; 6753 mblk_t *data_mp; 6754 uchar_t *bufp; 6755 size_t blksize; 6756 size_t skip; 6757 size_t avail; 6758 uint_t pktsize; 6759 uint_t frag_len; 6760 uint_t pending_hdr; 6761 int nmblks; 6762 int i; 6763 6764 /* 6765 * Let's skip ahead to the data if this is LSO 6766 */ 6767 data_mp = mp; 6768 pending_hdr = 0; 6769 if (lsohdr_sz) { 6770 pending_hdr = lsohdr_sz; 6771 for (nmp = mp; nmp; nmp = nmp->b_cont) { 6772 frag_len = nmp->b_wptr - nmp->b_rptr; 6773 if (frag_len > pending_hdr) 6774 break; 6775 pending_hdr -= frag_len; 6776 } 6777 data_mp = nmp; /* start of data past lso header */ 6778 ASSERT(data_mp != NULL); 6779 } 6780 6781 /* 6782 * Calculate the size of message data and number of msg blocks 6783 */ 6784 pktsize = 0; 6785 for (nmblks = 0, nmp = data_mp; nmp != NULL; 6786 nmp = nmp->b_cont, nmblks++) { 6787 pktsize += MBLKL(nmp); 6788 } 6789 pktsize -= pending_hdr; 6790 6791 /* 6792 * We only do ibt_map_mem_iov() if the pktsize is above the 6793 * "copy-threshold", and if the number of mp fragments is less than 6794 * the maximum acceptable. 
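	 * Summarizing the three tx buffer strategies below:
	 *
	 *	pktsize > id_ud_tx_copy_thresh, few enough fragments
	 *	    -> DMA-bind the mblks via ibt_map_mem_iov()
	 *	       (IBD_WQE_MAPPED)
	 *	pktsize <= id_tx_buf_sz
	 *	    -> copy into the swqe's pre-mapped buffer
	 *	       (IBD_WQE_TXBUF)
	 *	otherwise
	 *	    -> copy into pre-mapped LSO buffers (IBD_WQE_LSOBUF)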
6795 */ 6796 if ((state->id_hca_res_lkey_capab) && 6797 (pktsize > state->id_ud_tx_copy_thresh) && 6798 (nmblks < state->id_max_sqseg_hiwm)) { 6799 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6800 ibt_iov_attr_t iov_attr; 6801 6802 iov_attr.iov_as = NULL; 6803 iov_attr.iov = iov_arr; 6804 iov_attr.iov_buf = NULL; 6805 iov_attr.iov_list_len = nmblks; 6806 iov_attr.iov_wr_nds = state->id_max_sqseg; 6807 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 6808 iov_attr.iov_flags = IBT_IOV_SLEEP; 6809 6810 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 6811 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 6812 iov_arr[i].iov_len = MBLKL(nmp); 6813 if (i == 0) { 6814 iov_arr[i].iov_addr += pending_hdr; 6815 iov_arr[i].iov_len -= pending_hdr; 6816 } 6817 } 6818 6819 node->w_buftype = IBD_WQE_MAPPED; 6820 node->w_swr.wr_sgl = node->w_sgl; 6821 6822 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 6823 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 6824 if (ibt_status != IBT_SUCCESS) { 6825 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 6826 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 6827 goto ibd_copy_path; 6828 } 6829 6830 return (0); 6831 } 6832 6833 ibd_copy_path: 6834 if (pktsize <= state->id_tx_buf_sz) { 6835 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6836 node->w_swr.wr_nds = 1; 6837 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6838 node->w_buftype = IBD_WQE_TXBUF; 6839 6840 /* 6841 * Even though this is the copy path for transfers less than 6842 * id_tx_buf_sz, it could still be an LSO packet. If so, it 6843 * is possible the first data mblk fragment (data_mp) still 6844 * contains part of the LSO header that we need to skip. 6845 */ 6846 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6847 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 6848 blksize = MBLKL(nmp) - pending_hdr; 6849 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 6850 bufp += blksize; 6851 pending_hdr = 0; 6852 } 6853 6854 return (0); 6855 } 6856 6857 /* 6858 * Copy path for transfers greater than id_tx_buf_sz 6859 */ 6860 node->w_swr.wr_sgl = node->w_sgl; 6861 if (ibd_acquire_lsobufs(state, pktsize, 6862 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 6863 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 6864 return (-1); 6865 } 6866 node->w_buftype = IBD_WQE_LSOBUF; 6867 6868 /* 6869 * Copy the larger-than-id_tx_buf_sz packet into a set of 6870 * fixed-sized, pre-mapped LSO buffers. Note that we might 6871 * need to skip part of the LSO header in the first fragment 6872 * as before. 6873 */ 6874 nmp = data_mp; 6875 skip = pending_hdr; 6876 for (i = 0; i < node->w_swr.wr_nds; i++) { 6877 sgl = node->w_swr.wr_sgl + i; 6878 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 6879 avail = IBD_LSO_BUFSZ; 6880 while (nmp && avail) { 6881 blksize = MBLKL(nmp) - skip; 6882 if (blksize > avail) { 6883 bcopy(nmp->b_rptr + skip, bufp, avail); 6884 skip += avail; 6885 avail = 0; 6886 } else { 6887 bcopy(nmp->b_rptr + skip, bufp, blksize); 6888 skip = 0; 6889 avail -= blksize; 6890 bufp += blksize; 6891 nmp = nmp->b_cont; 6892 } 6893 } 6894 } 6895 6896 return (0); 6897 } 6898 6899 /* 6900 * Schedule a completion queue polling to reap the resource we're 6901 * short on. If we implement the change to reap tx completions 6902 * in a separate thread, we'll need to wake up that thread here. 
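 * Typical usage, as in ibd_send() further below: a failed swqe grab
 * does ibd_sched_poll(state, IBD_RSRC_SWQE, 0), while a failed lso
 * buffer grab does ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) so that
 * the async thread runs ibd_async_txsched() on our behalf.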
6903 */ 6904 static int 6905 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 6906 { 6907 ibd_req_t *req; 6908 6909 mutex_enter(&state->id_sched_lock); 6910 state->id_sched_needed |= resource_type; 6911 mutex_exit(&state->id_sched_lock); 6912 6913 /* 6914 * If we are asked to queue a work entry, we need to do it 6915 */ 6916 if (q_flag) { 6917 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6918 if (req == NULL) 6919 return (-1); 6920 6921 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 6922 } 6923 6924 return (0); 6925 } 6926 6927 /* 6928 * The passed in packet has this format: 6929 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 6930 */ 6931 static boolean_t 6932 ibd_send(ibd_state_t *state, mblk_t *mp) 6933 { 6934 ibd_ace_t *ace; 6935 ibd_swqe_t *node; 6936 ipoib_mac_t *dest; 6937 ib_header_info_t *ipibp; 6938 ip6_t *ip6h; 6939 uint_t pktsize; 6940 uint32_t mss; 6941 uint32_t hckflags; 6942 uint32_t lsoflags = 0; 6943 uint_t lsohdr_sz = 0; 6944 int ret, len; 6945 boolean_t dofree = B_FALSE; 6946 boolean_t rc; 6947 /* if (rc_chan == NULL) send by UD; else send by RC; */ 6948 ibd_rc_chan_t *rc_chan; 6949 int nmblks; 6950 mblk_t *nmp; 6951 6952 /* 6953 * If we aren't done with the device initialization and start, 6954 * we shouldn't be here. 6955 */ 6956 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6957 return (B_FALSE); 6958 6959 /* 6960 * Obtain an address handle for the destination. 6961 */ 6962 ipibp = (ib_header_info_t *)mp->b_rptr; 6963 dest = (ipoib_mac_t *)&ipibp->ib_dst; 6964 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6965 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 6966 6967 rc_chan = NULL; 6968 ace = ibd_acache_lookup(state, dest, &ret, 1); 6969 if (state->id_enable_rc && (ace != NULL) && 6970 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 6971 if (ace->ac_chan == NULL) { 6972 state->rc_null_conn++; 6973 } else { 6974 if (ace->ac_chan->chan_state == 6975 IBD_RC_STATE_ACT_ESTAB) { 6976 rc_chan = ace->ac_chan; 6977 rc_chan->is_used = B_TRUE; 6978 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 6979 node = WQE_TO_SWQE( 6980 rc_chan->tx_wqe_list.dl_head); 6981 if (node != NULL) { 6982 rc_chan->tx_wqe_list.dl_cnt -= 1; 6983 rc_chan->tx_wqe_list.dl_head = 6984 node->swqe_next; 6985 } else { 6986 node = ibd_rc_acquire_swqes(rc_chan); 6987 } 6988 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 6989 6990 if (node == NULL) { 6991 state->rc_swqe_short++; 6992 mutex_enter(&state->id_sched_lock); 6993 state->id_sched_needed |= 6994 IBD_RSRC_RC_SWQE; 6995 mutex_exit(&state->id_sched_lock); 6996 ibd_dec_ref_ace(state, ace); 6997 return (B_FALSE); 6998 } 6999 } else { 7000 state->rc_no_estab_conn++; 7001 } 7002 } 7003 } 7004 7005 if (rc_chan == NULL) { 7006 mutex_enter(&state->id_tx_list.dl_mutex); 7007 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 7008 if (node != NULL) { 7009 state->id_tx_list.dl_cnt -= 1; 7010 state->id_tx_list.dl_head = node->swqe_next; 7011 } else { 7012 node = ibd_acquire_swqe(state); 7013 } 7014 mutex_exit(&state->id_tx_list.dl_mutex); 7015 if (node == NULL) { 7016 /* 7017 * If we don't have an swqe available, schedule a 7018 * transmit completion queue cleanup and hold off on 7019 * sending more packets until we have some free swqes 7020 */ 7021 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 7022 if (ace != NULL) { 7023 ibd_dec_ref_ace(state, ace); 7024 } 7025 return (B_FALSE); 7026 } 7027 7028 /* 7029 * If a poll cannot be scheduled, we have no choice but 7030 * to drop this packet 7031 */ 7032 
ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7033 			if (ace != NULL) {
7034 				ibd_dec_ref_ace(state, ace);
7035 			}
7036 			return (B_TRUE);
7037 		}
7038 	}
7039 
7040 	/*
7041 	 * Initialize the commonly used fields in swqe to NULL to protect
7042 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
7043 	 * failure.
7044 	 */
7045 	node->swqe_im_mblk = NULL;
7046 	node->w_swr.wr_nds = 0;
7047 	node->w_swr.wr_sgl = NULL;
7048 	node->w_swr.wr_opcode = IBT_WRC_SEND;
7049 
7050 	/*
7051 	 * Calculate the size of message data and number of msg blocks
7052 	 */
7053 	pktsize = 0;
7054 	for (nmblks = 0, nmp = mp; nmp != NULL;
7055 	    nmp = nmp->b_cont, nmblks++) {
7056 		pktsize += MBLKL(nmp);
7057 	}
7058 
7059 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7060 		atomic_inc_64(&state->id_brd_xmt);
7061 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7062 		atomic_inc_64(&state->id_multi_xmt);
7063 
7064 	if (ace != NULL) {
7065 		node->w_ahandle = ace;
7066 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7067 	} else {
7068 		DPRINT(5,
7069 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7070 		    ((ret == EFAULT) ? "failed" : "queued"),
7071 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7072 		    htonl(dest->ipoib_gidpref[1]),
7073 		    htonl(dest->ipoib_gidsuff[0]),
7074 		    htonl(dest->ipoib_gidsuff[1]));
7075 		state->rc_ace_not_found++;
7076 		node->w_ahandle = NULL;
7077 
7078 		/*
7079 		 * Here, if ibd_acache_lookup() returns EFAULT, it means ibd
7080 		 * cannot find a path for the specified dest address. We
7081 		 * should drop this kind of packet. We should also drop
7082 		 * the packet if we cannot schedule a poll via the
7083 		 * async thread. In the normal case, ibd returns the
7084 		 * packet to the upper layer and waits for the AH to be
7085 		 * created.
7086 		 *
7087 		 * Note that we always queue a work slot entry for the async
7088 		 * thread when we fail AH lookup (even in intr mode); this is
7089 		 * due to the convoluted way the code currently looks for AH.
7090 		 */
7091 		if (ret == EFAULT) {
7092 			dofree = B_TRUE;
7093 			rc = B_TRUE;
7094 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7095 			dofree = B_TRUE;
7096 			rc = B_TRUE;
7097 		} else {
7098 			dofree = B_FALSE;
7099 			rc = B_FALSE;
7100 		}
7101 		goto ibd_send_fail;
7102 	}
7103 
7104 	/*
7105 	 * For ND6 packets, padding is at the front of the source lladdr.
	 * Insert the padding at the front.
7106 	 */
7107 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7108 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7109 			if (!pullupmsg(mp, IPV6_HDR_LEN +
7110 			    sizeof (ib_header_info_t))) {
7111 				DPRINT(10, "ibd_send: pullupmsg failure ");
7112 				dofree = B_TRUE;
7113 				rc = B_TRUE;
7114 				goto ibd_send_fail;
7115 			}
7116 			ipibp = (ib_header_info_t *)mp->b_rptr;
7117 		}
7118 		ip6h = (ip6_t *)((uchar_t *)ipibp +
7119 		    sizeof (ib_header_info_t));
7120 		len = ntohs(ip6h->ip6_plen);
7121 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7122 			mblk_t *pad;
7123 
7124 			pad = allocb(4, 0);
			if (pad == NULL) {
				/* can't pad the ND6 option; drop */
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
7125 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7126 			linkb(mp, pad);
7127 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
7128 			    IPV6_HDR_LEN + len + 4) {
7129 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7130 				    IPV6_HDR_LEN + len + 4)) {
7131 					DPRINT(10, "ibd_send: pullupmsg "
7132 					    "failure ");
7133 					dofree = B_TRUE;
7134 					rc = B_TRUE;
7135 					goto ibd_send_fail;
7136 				}
7137 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7138 				    sizeof (ib_header_info_t));
7139 			}
7140 
7141 			/* LINTED: E_CONSTANT_CONDITION */
7142 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7143 		}
7144 	}
7145 
7146 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7147 	mp->b_rptr += sizeof (ib_addrs_t);
7148 	pktsize -= sizeof (ib_addrs_t);
7149 
7150 	if (rc_chan) {	/* send in RC mode */
7151 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7152 		ibt_iov_attr_t iov_attr;
7153 		uint_t i;
7154 		size_t blksize;
7155 		uchar_t *bufp;
7156 		ibd_rc_tx_largebuf_t *lbufp;
7157 
7158 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
7159 
7160 		/*
7161 		 * The upper layer does the Tx checksum; we don't need to do
7162 		 * any checksum here.
7163 		 */
7164 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7165 
7166 		/*
7167 		 * We only do ibt_map_mem_iov() if the pktsize is above
7168 		 * the "copy-threshold", and if the number of mp
7169 		 * fragments is less than the maximum acceptable.
7170 		 */
7171 		if (pktsize <= state->id_rc_tx_copy_thresh) {
7172 			atomic_inc_64(&state->rc_xmt_small_pkt);
7173 			/*
7174 			 * Only process unicast packets in Reliable Connected
7175 			 * mode.
7176 */ 7177 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 7178 node->w_swr.wr_nds = 1; 7179 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 7180 node->w_buftype = IBD_WQE_TXBUF; 7181 7182 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 7183 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7184 blksize = MBLKL(nmp); 7185 bcopy(nmp->b_rptr, bufp, blksize); 7186 bufp += blksize; 7187 } 7188 freemsg(mp); 7189 ASSERT(node->swqe_im_mblk == NULL); 7190 } else { 7191 if ((state->rc_enable_iov_map) && 7192 (nmblks < state->rc_max_sqseg_hiwm)) { 7193 7194 /* do ibt_map_mem_iov() */ 7195 iov_attr.iov_as = NULL; 7196 iov_attr.iov = iov_arr; 7197 iov_attr.iov_buf = NULL; 7198 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 7199 iov_attr.iov_lso_hdr_sz = 0; 7200 iov_attr.iov_flags = IBT_IOV_SLEEP; 7201 7202 i = 0; 7203 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7204 iov_arr[i].iov_len = MBLKL(nmp); 7205 if (iov_arr[i].iov_len != 0) { 7206 iov_arr[i].iov_addr = (caddr_t) 7207 (void *)nmp->b_rptr; 7208 i++; 7209 } 7210 } 7211 iov_attr.iov_list_len = i; 7212 node->w_swr.wr_sgl = node->w_sgl; 7213 7214 ret = ibt_map_mem_iov(state->id_hca_hdl, 7215 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 7216 &node->w_mi_hdl); 7217 if (ret != IBT_SUCCESS) { 7218 atomic_inc_64( 7219 &state->rc_xmt_map_fail_pkt); 7220 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 7221 ") failed, nmblks=%d, real_nmblks" 7222 "=%d, ret=0x%x", nmblks, i, ret); 7223 goto ibd_rc_large_copy; 7224 } 7225 7226 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 7227 node->w_buftype = IBD_WQE_MAPPED; 7228 node->swqe_im_mblk = mp; 7229 } else { 7230 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 7231 ibd_rc_large_copy: 7232 mutex_enter(&state->rc_tx_large_bufs_lock); 7233 if (state->rc_tx_largebuf_nfree == 0) { 7234 state->rc_xmt_buf_short++; 7235 mutex_exit 7236 (&state->rc_tx_large_bufs_lock); 7237 mutex_enter(&state->id_sched_lock); 7238 state->id_sched_needed |= 7239 IBD_RSRC_RC_TX_LARGEBUF; 7240 mutex_exit(&state->id_sched_lock); 7241 dofree = B_FALSE; 7242 rc = B_FALSE; 7243 /* 7244 * If we don't have Tx large bufs, 7245 * return failure. 
node->w_buftype
7246 				 * should not be IBD_WQE_RC_COPYBUF,
7247 				 * otherwise it will cause a problem
7248 				 * in ibd_rc_tx_cleanup().
7249 				 */
7250 				node->w_buftype = IBD_WQE_TXBUF;
7251 				goto ibd_send_fail;
7252 			}
7253 
7254 			lbufp = state->rc_tx_largebuf_free_head;
7255 			ASSERT(lbufp->lb_buf != NULL);
7256 			state->rc_tx_largebuf_free_head =
7257 			    lbufp->lb_next;
7258 			lbufp->lb_next = NULL;
7259 			/* Update nfree count */
7260 			state->rc_tx_largebuf_nfree--;
7261 			mutex_exit(&state->rc_tx_large_bufs_lock);
7262 			bufp = lbufp->lb_buf;
7263 			node->w_sgl[0].ds_va =
7264 			    (ib_vaddr_t)(uintptr_t)bufp;
7265 			node->w_sgl[0].ds_key =
7266 			    state->rc_tx_mr_desc.md_lkey;
7267 			node->w_sgl[0].ds_len = pktsize;
7268 			node->w_swr.wr_sgl = node->w_sgl;
7269 			node->w_swr.wr_nds = 1;
7270 			node->w_buftype = IBD_WQE_RC_COPYBUF;
7271 			node->w_rc_tx_largebuf = lbufp;
7272 
7273 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7274 				blksize = MBLKL(nmp);
7275 				if (blksize != 0) {
7276 					bcopy(nmp->b_rptr, bufp,
7277 					    blksize);
7278 					bufp += blksize;
7279 				}
7280 			}
7281 			freemsg(mp);
7282 			ASSERT(node->swqe_im_mblk == NULL);
7283 		}
7284 	}
7285 
7286 	node->swqe_next = NULL;
7287 	mutex_enter(&rc_chan->tx_post_lock);
7288 	if (rc_chan->tx_busy) {
7289 		if (rc_chan->tx_head) {
7290 			rc_chan->tx_tail->swqe_next =
7291 			    SWQE_TO_WQE(node);
7292 		} else {
7293 			rc_chan->tx_head = node;
7294 		}
7295 		rc_chan->tx_tail = node;
7296 		mutex_exit(&rc_chan->tx_post_lock);
7297 	} else {
7298 		rc_chan->tx_busy = 1;
7299 		mutex_exit(&rc_chan->tx_post_lock);
7300 		ibd_rc_post_send(rc_chan, node);
7301 	}
7302 
7303 	return (B_TRUE);
7304 	} /* send by RC */
7305 
7306 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7307 		/*
7308 		 * Too long pktsize. The packet size from GLD should be <=
7309 		 * state->id_mtu + sizeof (ib_addrs_t)
7310 		 */
7311 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7312 			ibd_req_t *req;
7313 
7314 			mutex_enter(&ace->tx_too_big_mutex);
7315 			if (ace->tx_too_big_ongoing) {
7316 				mutex_exit(&ace->tx_too_big_mutex);
7317 				state->rc_xmt_reenter_too_long_pkt++;
7318 				dofree = B_TRUE;
7319 			} else {
7320 				ace->tx_too_big_ongoing = B_TRUE;
7321 				mutex_exit(&ace->tx_too_big_mutex);
7322 				state->rc_xmt_icmp_too_long_pkt++;
7323 
7324 				req = kmem_cache_alloc(state->id_req_kmc,
7325 				    KM_NOSLEEP);
7326 				if (req == NULL) {
7327 					ibd_print_warn(state, "ibd_send: alloc "
7328 					    "ibd_req_t fail");
7329 					/* Drop it. */
7330 					dofree = B_TRUE;
7331 				} else {
7332 					req->rq_ptr = mp;
7333 					req->rq_ptr2 = ace;
7334 					ibd_queue_work_slot(state, req,
7335 					    IBD_ASYNC_RC_TOO_BIG);
7336 					dofree = B_FALSE;
7337 				}
7338 			}
7339 		} else {
7340 			ibd_print_warn(state, "Reliable Connected mode is on. "
7341 			    "Multicast packet length %d is too long to "
7342 			    "send (> %d), drop it",
7343 			    pktsize, state->id_mtu);
7344 			state->rc_xmt_drop_too_long_pkt++;
7345 			/* Drop it. */
7346 			dofree = B_TRUE;
7347 		}
7348 		rc = B_TRUE;
7349 		goto ibd_send_fail;
7350 	}
7351 
7352 	atomic_add_64(&state->id_xmt_bytes, pktsize);
7353 	atomic_inc_64(&state->id_xmt_pkt);
7354 
7355 	/*
7356 	 * Do LSO and checksum related work here. For LSO send, adjust the
7357 	 * ud destination, the opcode and the LSO header information in the
7358 	 * work request.
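	 * Note that wr.ud and wr.ud_lso overlay the same union in the
	 * send work request; that is why ibd_setup_lso() has to
	 * re-supply the ud destination (lso_ud_dest) that was stored
	 * via wr.ud.udwr_dest earlier.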
7359 */ 7360 mac_lso_get(mp, &mss, &lsoflags); 7361 if ((lsoflags & HW_LSO) != HW_LSO) { 7362 node->w_swr.wr_opcode = IBT_WRC_SEND; 7363 lsohdr_sz = 0; 7364 } else { 7365 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 7366 /* 7367 * The routine can only fail if there's no memory; we 7368 * can only drop the packet if this happens 7369 */ 7370 ibd_print_warn(state, 7371 "ibd_send: no memory, lso posting failed"); 7372 dofree = B_TRUE; 7373 rc = B_TRUE; 7374 goto ibd_send_fail; 7375 } 7376 7377 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 7378 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 7379 } 7380 7381 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 7382 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 7383 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 7384 else 7385 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 7386 7387 /* 7388 * Prepare the sgl for posting; the routine can only fail if there's 7389 * no lso buf available for posting. If this is the case, we should 7390 * probably resched for lso bufs to become available and then try again. 7391 */ 7392 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 7393 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 7394 dofree = B_TRUE; 7395 rc = B_TRUE; 7396 } else { 7397 dofree = B_FALSE; 7398 rc = B_FALSE; 7399 } 7400 goto ibd_send_fail; 7401 } 7402 node->swqe_im_mblk = mp; 7403 7404 /* 7405 * Queue the wqe to hardware; since we can now simply queue a 7406 * post instead of doing it serially, we cannot assume anything 7407 * about the 'node' after ibd_post_send() returns. 7408 */ 7409 node->swqe_next = NULL; 7410 7411 mutex_enter(&state->id_txpost_lock); 7412 if (state->id_tx_busy) { 7413 if (state->id_tx_head) { 7414 state->id_tx_tail->swqe_next = 7415 SWQE_TO_WQE(node); 7416 } else { 7417 state->id_tx_head = node; 7418 } 7419 state->id_tx_tail = node; 7420 mutex_exit(&state->id_txpost_lock); 7421 } else { 7422 state->id_tx_busy = 1; 7423 mutex_exit(&state->id_txpost_lock); 7424 ibd_post_send(state, node); 7425 } 7426 7427 return (B_TRUE); 7428 7429 ibd_send_fail: 7430 if (node && mp) 7431 ibd_free_lsohdr(node, mp); 7432 7433 if (dofree) 7434 freemsg(mp); 7435 7436 if (node != NULL) { 7437 if (rc_chan) { 7438 ibd_rc_tx_cleanup(node); 7439 } else { 7440 ibd_tx_cleanup(state, node); 7441 } 7442 } 7443 7444 return (rc); 7445 } 7446 7447 /* 7448 * GLDv3 entry point for transmitting datagram. 7449 */ 7450 static mblk_t * 7451 ibd_m_tx(void *arg, mblk_t *mp) 7452 { 7453 ibd_state_t *state = (ibd_state_t *)arg; 7454 mblk_t *next; 7455 7456 if (state->id_type == IBD_PORT_DRIVER) { 7457 freemsgchain(mp); 7458 return (NULL); 7459 } 7460 7461 if ((state->id_link_state != LINK_STATE_UP) || 7462 !(state->id_mac_state & IBD_DRV_STARTED)) { 7463 freemsgchain(mp); 7464 mp = NULL; 7465 } 7466 7467 while (mp != NULL) { 7468 next = mp->b_next; 7469 mp->b_next = NULL; 7470 if (ibd_send(state, mp) == B_FALSE) { 7471 /* Send fail */ 7472 mp->b_next = next; 7473 break; 7474 } 7475 mp = next; 7476 } 7477 7478 return (mp); 7479 } 7480 7481 /* 7482 * this handles Tx and Rx completions. With separate CQs, this handles 7483 * only Rx completions. 
/*
 * GLDv3 entry point for transmitting a datagram.
 */
static mblk_t *
ibd_m_tx(void *arg, mblk_t *mp)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	mblk_t *next;

	if (state->id_type == IBD_PORT_DRIVER) {
		freemsgchain(mp);
		return (NULL);
	}

	if ((state->id_link_state != LINK_STATE_UP) ||
	    !(state->id_mac_state & IBD_DRV_STARTED)) {
		freemsgchain(mp);
		mp = NULL;
	}

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;
		if (ibd_send(state, mp) == B_FALSE) {
			/* Send fail */
			mp->b_next = next;
			break;
		}
		mp = next;
	}

	return (mp);
}

/*
 * This handles Tx and Rx completions. With separate CQs, this handles
 * only Rx completions.
 */
static uint_t
ibd_intr(caddr_t arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_poll_rcq(state, state->id_rcq_hdl);

	return (DDI_INTR_CLAIMED);
}

/*
 * Poll and fully drain the send cq
 */
static void
ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
	ibt_wc_t *wcs = state->id_txwcs;
	uint_t numwcs = state->id_txwcs_size;
	ibd_wqe_t *wqe;
	ibd_swqe_t *head, *tail;
	ibt_wc_t *wc;
	uint_t num_polled;
	int i;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
		head = tail = NULL;
		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				/*
				 * Channel being torn down.
				 */
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					DPRINT(5, "ibd_drain_scq: flush error");
					DPRINT(10, "ibd_drain_scq: Bad "
					    "status %d", wc->wc_status);
				} else {
					DPRINT(10, "ibd_drain_scq: "
					    "unexpected wc_status %d",
					    wc->wc_status);
				}
				/*
				 * Fallthrough to invoke the Tx handler to
				 * release held resources, e.g., AH refcount.
				 */
			}
			/*
			 * Add this swqe to the list to be cleaned up.
			 */
			if (head)
				tail->swqe_next = wqe;
			else
				head = WQE_TO_SWQE(wqe);
			tail = WQE_TO_SWQE(wqe);
		}
		tail->swqe_next = NULL;
		ibd_tx_cleanup_list(state, head, tail);

		/*
		 * Resume any blocked transmissions if possible
		 */
		ibd_resume_transmission(state);
	}
}

/*
 * Poll and fully drain the receive cq
 */
static void
ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
	ibt_wc_t *wcs = state->id_rxwcs;
	uint_t numwcs = state->id_rxwcs_size;
	ibd_rwqe_t *rwqe;
	ibt_wc_t *wc;
	uint_t num_polled;
	int i;
	mblk_t *head, *tail, *mp;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
		head = tail = NULL;
		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				/*
				 * Channel being torn down.
				 */
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					DPRINT(5, "ibd_drain_rcq: "
					    "expected flushed rwqe");
				} else {
					DPRINT(5, "ibd_drain_rcq: "
					    "unexpected wc_status %d",
					    wc->wc_status);
				}
				atomic_inc_32(
				    &state->id_rx_list.dl_bufs_outstanding);
				freemsg(rwqe->rwqe_im_mblk);
				continue;
			}
			mp = ibd_process_rx(state, rwqe, wc);
			if (mp == NULL)
				continue;

			/*
			 * Add this mp to the list to send to the nw layer.
			 */
			if (head)
				tail->b_next = mp;
			else
				head = mp;
			tail = mp;
		}
		if (head)
			mac_rx(state->id_mh, state->id_rh, head);

		/*
		 * Account for #rwqes polled.
		 * Post more here, if less than one fourth full.
		 */
		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
		    (state->id_ud_num_rwqe / 4))
			ibd_post_recv_intr(state);
	}
}
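/*
 * Rx batching note (a restatement of the code above, for clarity):
 * ibd_drain_rcq() hands completed packets to the stack in chains, one
 * mac_rx() call per poll burst, rather than one at a time, and it only
 * replenishes receive buffers once the count of still-posted rwqes falls
 * below a quarter of id_ud_num_rwqe. For example, with 4000 rwqes
 * configured, reposting kicks in once fewer than 1000 remain posted.
 */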
/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
	int flag, redo_flag;
	int redo = 1;

	flag = IBD_CQ_POLLING;
	redo_flag = IBD_REDO_CQ_POLLING;

	mutex_enter(&state->id_scq_poll_lock);
	if (state->id_scq_poll_busy & flag) {
		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
		state->id_scq_poll_busy |= redo_flag;
		mutex_exit(&state->id_scq_poll_lock);
		return;
	}
	state->id_scq_poll_busy |= flag;
	mutex_exit(&state->id_scq_poll_lock);

	/*
	 * In some cases (e.g., detaching), this code can be invoked on
	 * any cpu after disabling cq notification (thus no concurrency
	 * exists). Apart from that, the following applies normally:
	 * Transmit completion handling could be from any cpu if
	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
	 * is interrupt driven.
	 */

	/*
	 * Poll and drain the CQ
	 */
	ibd_drain_scq(state, cq_hdl);

	/*
	 * Enable CQ notifications and redrain the cq to catch any
	 * completions we might have missed after the ibd_drain_scq()
	 * above and before the ibt_enable_cq_notify() that follows.
	 * Finally, service any new requests to poll the cq that
	 * could've come in after the ibt_enable_cq_notify().
	 */
	do {
		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			DPRINT(10,
			    "ibd_poll_scq: ibt_enable_cq_notify() failed");
		}

		ibd_drain_scq(state, cq_hdl);

		mutex_enter(&state->id_scq_poll_lock);
		if (state->id_scq_poll_busy & redo_flag)
			state->id_scq_poll_busy &= ~redo_flag;
		else {
			state->id_scq_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&state->id_scq_poll_lock);

	} while (redo);
}

/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
{
	int flag, redo_flag;
	int redo = 1;

	flag = IBD_CQ_POLLING;
	redo_flag = IBD_REDO_CQ_POLLING;

	mutex_enter(&state->id_rcq_poll_lock);
	if (state->id_rcq_poll_busy & flag) {
		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
		state->id_rcq_poll_busy |= redo_flag;
		mutex_exit(&state->id_rcq_poll_lock);
		return;
	}
	state->id_rcq_poll_busy |= flag;
	mutex_exit(&state->id_rcq_poll_lock);

	/*
	 * Poll and drain the CQ
	 */
	ibd_drain_rcq(state, rcq);

	/*
	 * Enable CQ notifications and redrain the cq to catch any
	 * completions we might have missed after the ibd_drain_rcq()
	 * above and before the ibt_enable_cq_notify() that follows.
	 * Finally, service any new requests to poll the cq that
	 * could've come in after the ibt_enable_cq_notify().
	 */
	do {
		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			DPRINT(10,
			    "ibd_poll_rcq: ibt_enable_cq_notify() failed");
		}

		ibd_drain_rcq(state, rcq);

		mutex_enter(&state->id_rcq_poll_lock);
		if (state->id_rcq_poll_busy & redo_flag)
			state->id_rcq_poll_busy &= ~redo_flag;
		else {
			state->id_rcq_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&state->id_rcq_poll_lock);

	} while (redo);
}
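/*
 * Why the redo loop in ibd_poll_scq()/ibd_poll_rcq() above: a completion
 * can arrive in the window between ibt_poll_cq() returning empty and
 * ibt_enable_cq_notify() re-arming the CQ; with notifications still off,
 * no interrupt would ever announce it, so the CQ is drained once more
 * after re-arming. The IBD_REDO_CQ_POLLING flag handles a second race:
 * a thread that finds polling already in progress records the request
 * and leaves, and the active poller loops on its behalf, rather than
 * having two threads poll the same CQ concurrently.
 */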
/*
 * Unmap the memory area associated with a given swqe.
 */
void
ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibt_status_t stat;

	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);

	if (swqe->w_mi_hdl) {
		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
			DPRINT(10,
			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
		}
		swqe->w_mi_hdl = NULL;
	}
	swqe->w_swr.wr_nds = 0;
}

void
ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
{
	/*
	 * The recycling logic can be eliminated from here
	 * and put into the async thread if we create another
	 * list to hold ACEs for unjoined mcgs.
	 */
	if (DEC_REF_DO_CYCLE(ace)) {
		ibd_mce_t *mce;

		/*
		 * Check with the lock taken: we decremented the
		 * reference count without the lock, and some
		 * transmitter might already have bumped the
		 * reference count (possible in case of multicast
		 * disable when we leave the AH on the active
		 * list). If not still 0, get out, leaving the
		 * recycle bit intact.
		 *
		 * Atomically transition the AH from active
		 * to free list, and queue a work request to
		 * leave the group and destroy the mce. No
		 * transmitter can be looking at the AH or
		 * the MCE in between, since we have the
		 * ac_mutex lock. In the SendOnly reap case,
		 * it is not necessary to hold the ac_mutex
		 * and recheck the ref count (since the AH was
		 * taken off the active list), we just do it
		 * to have uniform processing with the Full
		 * reap case.
		 */
		mutex_enter(&state->id_ac_mutex);
		mce = ace->ac_mce;
		if (GET_REF_CYCLE(ace) == 0) {
			CLEAR_REFCYCLE(ace);
			/*
			 * Identify the case of fullmember reap as
			 * opposed to mcg trap reap. Also, port up
			 * might set ac_mce to NULL to indicate Tx
			 * cleanup should do no more than put the
			 * AH in the free list (see ibd_async_link).
			 */
			if (mce != NULL) {
				ace->ac_mce = NULL;
				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
				/*
				 * mc_req was initialized at mce
				 * creation time.
				 */
				ibd_queue_work_slot(state,
				    &mce->mc_req, IBD_ASYNC_REAP);
			}
			IBD_ACACHE_INSERT_FREE(state, ace);
		}
		mutex_exit(&state->id_ac_mutex);
	}
}
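/*
 * Background for ibd_dec_ref_ace() above (a summary drawn from the macro
 * names used there, not a new mechanism): each active destination is an
 * ibd_ace_t caching an IBTF address handle (AH). Transmitters hold a
 * reference for every posted send; the last dropper of a reference with
 * the recycle bit set moves the ACE from the active list back to the
 * free list (IBD_ACACHE_PULLOUT_ACTIVE / IBD_ACACHE_INSERT_FREE) and,
 * for a multicast destination, queues IBD_ASYNC_REAP to leave the group.
 */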
/*
 * Common code that deals with cleanups after a successful or
 * erroneous transmission attempt.
 */
static void
ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibd_ace_t *ace = swqe->w_ahandle;

	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);

	/*
	 * If this was a dynamic mapping in ibd_send(), we need to
	 * unmap here. If this was an lso buffer we'd used for sending,
	 * we need to release the lso buf to the pool, since the resource
	 * is scarce. However, if this was simply a normal send using
	 * the copybuf (present in each swqe), we don't need to release it.
	 */
	if (swqe->swqe_im_mblk != NULL) {
		if (swqe->w_buftype == IBD_WQE_MAPPED) {
			ibd_unmap_mem(state, swqe);
		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
			ibd_release_lsobufs(state,
			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
		}
		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
		freemsg(swqe->swqe_im_mblk);
		swqe->swqe_im_mblk = NULL;
	}

	/*
	 * Drop the reference count on the AH; it can be reused
	 * now for a different destination if there are no more
	 * posted sends that will use it. This can be eliminated
	 * if we can always associate each Tx buffer with an AH.
	 * The ace can be null if we are cleaning up from the
	 * ibd_send() error path.
	 */
	if (ace != NULL) {
		ibd_dec_ref_ace(state, ace);
	}

	/*
	 * Release the send wqe for reuse.
	 */
	swqe->swqe_next = NULL;
	ibd_release_swqe(state, swqe, swqe, 1);
}

static void
ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
{
	ibd_ace_t *ace;
	ibd_swqe_t *swqe;
	int n = 0;

	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);

	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {

		/*
		 * If this was a dynamic mapping in ibd_send(), we need to
		 * unmap here. If this was an lso buffer we'd used for sending,
		 * we need to release the lso buf to the pool, since the
		 * resource is scarce. However, if this was simply a normal
		 * send using the copybuf (present in each swqe), we don't need
		 * to release it.
		 */
		if (swqe->swqe_im_mblk != NULL) {
			if (swqe->w_buftype == IBD_WQE_MAPPED) {
				ibd_unmap_mem(state, swqe);
			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
				ibd_release_lsobufs(state,
				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
			}
			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
			freemsg(swqe->swqe_im_mblk);
			swqe->swqe_im_mblk = NULL;
		}

		/*
		 * Drop the reference count on the AH; it can be reused
		 * now for a different destination if there are no more
		 * posted sends that will use it. This can be eliminated
		 * if we can always associate each Tx buffer with an AH.
		 * The ace can be null if we are cleaning up from the
		 * ibd_send() error path.
		 */
		ace = swqe->w_ahandle;
		if (ace != NULL) {
			ibd_dec_ref_ace(state, ace);
		}
		n++;
	}

	/*
	 * Release the send wqes for reuse.
	 */
	ibd_release_swqe(state, head, tail, n);
}
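/*
 * Receive-buffer layout used by ibd_process_rx() below, as implied by the
 * offsets it computes (sizeof (ipoib_pgrh_t) and sizeof (ipoib_hdr_t));
 * sizes are those of the structures, noted here only as an aid:
 *
 *	| pseudo GRH (IPOIB_GRH_SIZE bytes,   | IPoIB header   | payload ...
 *	| contents valid only when            | 2b type, 2b 00 |
 *	| IBT_WC_GRH_PRESENT is set)          |                |
 *
 * i.e. the "2b sap :: 00 :: data" format mentioned in the comment below,
 * preceded by the (pseudo) GRH area that carries the ib_src/ib_dst
 * addressing handed up to GLDv3.
 */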
/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD. The received packet has this
 * format: 2b sap :: 00 :: data.
 */
static mblk_t *
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ib_header_info_t *phdr;
	mblk_t *mp;
	ipoib_hdr_t *ipibp;
	ipha_t *iphap;
	ip6_t *ip6h;
	int len;
	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
	uint32_t bufs;

	/*
	 * Track the number of buffers handed to the upper layer that need
	 * to be returned.
	 */
	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);

	/* Never run out of rwqes, use allocb when running low */
	if (bufs >= state->id_rx_bufs_outstanding_limit) {
		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
		atomic_inc_32(&state->id_rx_allocb);
		mp = allocb(pkt_len, BPRI_HI);
		if (mp) {
			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
			ibd_post_recv(state, rwqe);
		} else {	/* no memory */
			atomic_inc_32(&state->id_rx_allocb_failed);
			ibd_post_recv(state, rwqe);
			return (NULL);
		}
	} else {
		mp = rwqe->rwqe_im_mblk;
	}

	/*
	 * Adjust write pointer depending on how much data came in.
	 */
	mp->b_wptr = mp->b_rptr + pkt_len;

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * The IB link will deliver one of the IB link-layer headers,
	 * the Global Routing Header (GRH). The ibd driver uses the
	 * information in the GRH to build the header_info structure
	 * that is passed with the datagram up to GLDv3. If the GRH is
	 * not valid, indicate this to GLDv3 by setting the VerTcFlow
	 * field to 0.
	 */
	phdr = (ib_header_info_t *)mp->b_rptr;
	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);

		/* if it is a loopback packet, just drop it. */
		if (state->id_enable_rc) {
			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
			    &state->rc_macaddr_loopback,
			    IPOIB_ADDRL) == 0) {
				freemsg(mp);
				return (NULL);
			}
		} else {
			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
			    IPOIB_ADDRL) == 0) {
				freemsg(mp);
				return (NULL);
			}
		}

		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
		    sizeof (ipoib_mac_t));
		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
		} else {
			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
		}
	} else {
		/*
		 * It cannot be an IBA multicast packet; it must have been
		 * unicast to us. Just copy the interface address to dst.
		 */
		phdr->ib_grh.ipoib_vertcflow = 0;
		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
		    sizeof (ipoib_mac_t));
	}

	/*
	 * For ND6 packets, padding is at the front of the source/target
	 * lladdr. However the inet6 layer is not aware of it, hence remove
	 * the padding from such packets.
	 */
	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	/*
	 * Update statistics
	 */
	atomic_add_64(&state->id_rcv_bytes, pkt_len);
	atomic_inc_64(&state->id_rcv_pkt);
	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_rcv);
	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_rcv);

	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
	/*
	 * Set the receive checksum status in mp.
	 * Hardware checksumming can be considered valid only if:
	 * 1. CQE.IP_OK bit is set
	 * 2. CQE.CKSUM = 0xffff
	 * 3. IPv6 routing header is not present in the packet
	 * 4. There are no IP options in the IP header
	 */
	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
	    (wc->wc_cksum == 0xFFFF) &&
	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
	}

	return (mp);
}
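/*
 * Buffer-loaning note for the functions around here (a restatement of
 * the mechanism, using only names visible in this file): rx mblks are
 * built with desballoc() over the rwqe copybuf, so ibd_process_rx() can
 * loan the DMA buffer itself to the stack and get it back through the
 * ibd_freemsg_cb() callback below once the stack frees the mblk. Only
 * when too many buffers are already out on loan
 * (id_rx_bufs_outstanding_limit) does the driver fall back to
 * allocb()+bcopy so the rwqe can be reposted immediately.
 */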
/*
 * Callback code invoked from STREAMs when the receive data buffer is
 * free for recycling.
 */
static void
ibd_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);

	/*
	 * If the driver is stopped, just free the rwqe.
	 */
	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
		DPRINT(6, "ibd_freemsg: wqe being freed");
		rwqe->rwqe_im_mblk = NULL;
		ibd_free_rwqe(state, rwqe);
		return;
	}

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		ibd_free_rwqe(state, rwqe);
		DPRINT(6, "ibd_freemsg: desballoc failed");
		return;
	}

	ibd_post_recv(state, rwqe);
}

static uint_t
ibd_tx_recycle(caddr_t arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	/*
	 * Poll for completed entries
	 */
	ibd_poll_scq(state, state->id_scq_hdl);

	return (DDI_INTR_CLAIMED);
}

#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
	ibd_lbuf_ndx = 0;

	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
}

static void
ibd_log_fini(void)
{
	if (ibd_lbuf)
		kmem_free(ibd_lbuf, IBD_LOG_SZ);
	ibd_lbuf_ndx = 0;
	ibd_lbuf = NULL;

	mutex_destroy(&ibd_lbuf_lock);
}

static void
ibd_log(const char *fmt, ...)
{
	va_list ap;
	uint32_t off;
	uint32_t msglen;
	char tmpbuf[IBD_DMAX_LINE];

	if (ibd_lbuf == NULL)
		return;

	va_start(ap, fmt);
	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
	va_end(ap);

	if (msglen >= IBD_DMAX_LINE)
		msglen = IBD_DMAX_LINE - 1;

	mutex_enter(&ibd_lbuf_lock);

	off = ibd_lbuf_ndx;		/* current msg should go here */
	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';

	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */

	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		ibd_lbuf_ndx = 0;

	mutex_exit(&ibd_lbuf_lock);

	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
}
#endif
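/*
 * The two ioctl handlers below are reached from dladm(1M). An illustrative
 * session (link and pkey values are made up for the example; consult
 * dladm(1M) for the authoritative syntax):
 *
 *	# dladm create-part -l ibp0 -P 0xffff part0	-> ibd_create_partition()
 *	# dladm create-part -f -l ibp0 -P 0x8001 part1	-> ditto, forced
 *	# dladm delete-part part1			-> ibd_delete_partition()
 *
 * The forced form corresponds to cmd->ioc_force_create != 0 below, which
 * skips the check that the pkey is already present in the port's pkey
 * table.
 */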
/* ARGSUSED */
static int
ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	ibd_create_ioctl_t *cmd = karg;
	ibd_state_t *state, *port_state, *p;
	int i, err, rval = 0;
	mac_register_t *macp;
	ibt_hca_portinfo_t *pinfop = NULL;
	ibt_status_t ibt_status;
	uint_t psize, pinfosz;
	boolean_t force_create = B_FALSE;

	cmd->ibdioc.ioc_status = 0;

	if (cmd->ibdioc.ioc_port_inst < 0) {
		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
		return (EINVAL);
	}
	port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
	if (port_state == NULL) {
		DPRINT(10, "ibd_create_partition: failed to get state %d",
		    cmd->ibdioc.ioc_port_inst);
		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
		return (EINVAL);
	}

	/* Limited PKeys not supported */
	if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
		rval = EINVAL;
		goto part_create_return;
	}

	if (cmd->ioc_force_create == 0) {
		/*
		 * Check if the port pkey table contains the pkey for which
		 * this partition is being created.
		 */
		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
		    port_state->id_port, &pinfop, &psize, &pinfosz);

		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
			rval = EINVAL;
			goto part_create_return;
		}

		if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
			rval = ENETDOWN;
			cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
			goto part_create_return;
		}

		for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
			if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
				break;
			}
		}
		if (i == pinfop->p_pkey_tbl_sz) {
			rval = EINVAL;
			cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
			goto part_create_return;
		}
	} else {
		force_create = B_TRUE;
	}

	mutex_enter(&ibd_objlist_lock);
	for (p = ibd_objlist_head; p; p = p->id_next) {
		if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
		    (p->id_pkey == cmd->ioc_pkey) &&
		    (p->id_plinkid == cmd->ioc_partid)) {
			mutex_exit(&ibd_objlist_lock);
			rval = EEXIST;
			cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
			goto part_create_return;
		}
	}
	mutex_exit(&ibd_objlist_lock);

	state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);

	state->id_type = IBD_PARTITION_OBJ;

	state->id_plinkid = cmd->ioc_partid;
	state->id_dlinkid = cmd->ibdioc.ioc_linkid;
	state->id_port_inst = cmd->ibdioc.ioc_port_inst;

	state->id_dip = port_state->id_dip;
	state->id_port = port_state->id_port;
	state->id_pkey = cmd->ioc_pkey;
	state->id_hca_guid = port_state->id_hca_guid;
	state->id_port_guid = port_state->id_port_guid;
	state->id_force_create = force_create;

	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);

	if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
		rval = EIO;
		cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
		goto fail;
	}

	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
		rval = EAGAIN;
		goto fail;
	}

	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
	macp->m_dip = port_state->id_dip;
	macp->m_instance = (uint_t)-1;
	macp->m_driver = state;
	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
	macp->m_callbacks = &ibd_m_callbacks;
	macp->m_min_sdu = 0;
	if (state->id_enable_rc) {
		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
	} else {
		macp->m_max_sdu = IBD_DEF_MAX_SDU;
	}
	macp->m_priv_props = ibd_priv_props;

	err = mac_register(macp, &state->id_mh);
	mac_free(macp);

	if (err != 0) {
		DPRINT(10, "ibd_create_partition: mac_register() failed %d",
		    err);
		rval = err;
		goto fail;
	}

	err = dls_devnet_create(state->id_mh,
	    cmd->ioc_partid, crgetzoneid(credp));
	if (err != 0) {
		DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
		    "%d", err);
		rval = err;
		(void) mac_unregister(state->id_mh);
		goto fail;
	}

	/*
	 * Add the new partition state structure to the list
	 */
	mutex_enter(&ibd_objlist_lock);
	if (ibd_objlist_head)
		state->id_next = ibd_objlist_head;

	ibd_objlist_head = state;
	mutex_exit(&ibd_objlist_lock);

part_create_return:
	if (pinfop) {
		ibt_free_portinfo(pinfop, pinfosz);
	}
	return (rval);

fail:
	if (pinfop) {
		ibt_free_portinfo(pinfop, pinfosz);
	}
	ibd_part_unattach(state);
	kmem_free(state, sizeof (ibd_state_t));
	return (rval);
}
/* ARGSUSED */
static int
ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int err;
	datalink_id_t tmpid;
	ibd_state_t *node, *prev;
	ibd_delete_ioctl_t *cmd = karg;

	prev = NULL;

	mutex_enter(&ibd_objlist_lock);
	node = ibd_objlist_head;

	/* Find the ibd state structure corresponding to the partition */
	while (node != NULL) {
		if (node->id_plinkid == cmd->ioc_partid)
			break;
		prev = node;
		node = node->id_next;
	}

	if (node == NULL) {
		mutex_exit(&ibd_objlist_lock);
		return (ENOENT);
	}

	if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
		DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
		    "%d", err);
		mutex_exit(&ibd_objlist_lock);
		return (err);
	}

	/*
	 * Call ibd_part_unattach() only after making sure that the instance
	 * has not been started yet and is also not in late hca init mode.
	 */
	ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);

	err = 0;
	if ((node->id_mac_state & IBD_DRV_STARTED) ||
	    (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
	    (ibd_part_busy(node) != DDI_SUCCESS) ||
	    ((err = mac_disable(node->id_mh)) != 0)) {
		(void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
		    crgetzoneid(credp));
		ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
		mutex_exit(&ibd_objlist_lock);
		return (err != 0 ? err : EBUSY);
	}

	node->id_mac_state |= IBD_DRV_IN_DELETION;

	ibd_part_unattach(node);

	ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);

	/* Remove the partition state structure from the linked list */
	if (prev == NULL)
		ibd_objlist_head = node->id_next;
	else
		prev->id_next = node->id_next;
	mutex_exit(&ibd_objlist_lock);

	if ((err = mac_unregister(node->id_mh)) != 0) {
		DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
		    err);
	}

	cv_destroy(&node->id_macst_cv);
	mutex_destroy(&node->id_macst_lock);

	kmem_free(node, sizeof (ibd_state_t));

	return (0);
}
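/*
 * ibd_get_partition_info() below copies pkey tables out to user space and
 * therefore has to cope with a 32-bit dladm talking to a 64-bit kernel:
 * under _MULTI_DATAMODEL, ddi_model_convert_from(mode & FMODELS) selects
 * between ibport_ioctl32_t (ILP32 caller, with user pointers held in
 * 32-bit fields) and the native ibport_ioctl_t layout. The duplicated
 * copyin/copyout arms differ only in those struct sizes and in the
 * (uintptr_t) widening of ioc_pkeys.
 */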
/* ARGSUSED */
static int
ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
    int *rvalp)
{
	ibd_ioctl_t cmd;
	ibpart_ioctl_t partioc;
	ibport_ioctl_t portioc;
#ifdef _MULTI_DATAMODEL
	ibport_ioctl32_t portioc32;
#endif
	ibd_state_t *state, *port_state;
	int size;
	ibt_hca_portinfo_t *pinfop = NULL;
	ibt_status_t ibt_status;
	uint_t psize, pinfosz;
	int rval = 0;

	size = sizeof (ibd_ioctl_t);
	if (ddi_copyin((void *)arg, &cmd, size, mode)) {
		return (EFAULT);
	}
	cmd.ioc_status = 0;
	switch (cmd.ioc_info_cmd) {
	case IBD_INFO_CMD_IBPART:
		size = sizeof (ibpart_ioctl_t);
		if (ddi_copyin((void *)arg, &partioc, size, mode)) {
			return (EFAULT);
		}

		mutex_enter(&ibd_objlist_lock);
		/*
		 * Find the ibd state structure corresponding to the
		 * partition.
		 */
		for (state = ibd_objlist_head; state; state = state->id_next) {
			if (state->id_plinkid == cmd.ioc_linkid) {
				break;
			}
		}

		if (state == NULL) {
			mutex_exit(&ibd_objlist_lock);
			return (ENOENT);
		}

		partioc.ibdioc.ioc_linkid = state->id_dlinkid;
		partioc.ibdioc.ioc_port_inst = state->id_port_inst;
		partioc.ibdioc.ioc_portnum = state->id_port;
		partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
		partioc.ibdioc.ioc_portguid = state->id_port_guid;
		partioc.ibdioc.ioc_status = 0;
		partioc.ioc_partid = state->id_plinkid;
		partioc.ioc_pkey = state->id_pkey;
		partioc.ioc_force_create = state->id_force_create;
		if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
			mutex_exit(&ibd_objlist_lock);
			return (EFAULT);
		}
		mutex_exit(&ibd_objlist_lock);

		break;

	case IBD_INFO_CMD_IBPORT:
		if ((cmd.ioc_port_inst < 0) || ((port_state =
		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
			DPRINT(10, "ibd_get_partition_info: failed to get"
			    " state %d", cmd.ioc_port_inst);
			size = sizeof (ibd_ioctl_t);
			cmd.ioc_status = IBD_INVALID_PORT_INST;
			if (ddi_copyout((void *)&cmd, (void *)arg, size,
			    mode)) {
				return (EFAULT);
			}
			return (EINVAL);
		}
		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
		    port_state->id_port, &pinfop, &psize, &pinfosz);
		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
			rval = EINVAL;
			goto fail;
		}
#ifdef _MULTI_DATAMODEL
		switch (ddi_model_convert_from(mode & FMODELS)) {
		case DDI_MODEL_ILP32: {
			size = sizeof (ibport_ioctl32_t);
			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc32.ibdioc.ioc_status = 0;
			portioc32.ibdioc.ioc_portnum = port_state->id_port;
			portioc32.ibdioc.ioc_hcaguid =
			    port_state->id_hca_guid;
			portioc32.ibdioc.ioc_portguid =
			    port_state->id_port_guid;
			if (portioc32.ioc_pkey_tbl_sz !=
			    pinfop->p_pkey_tbl_sz) {
				rval = EINVAL;
				size = sizeof (ibd_ioctl_t);
				portioc32.ibdioc.ioc_status =
				    IBD_INVALID_PKEY_TBL_SIZE;
				if (ddi_copyout((void *)&portioc32.ibdioc,
				    (void *)arg, size, mode)) {
					rval = EFAULT;
					goto fail;
				}
				goto fail;
			}
			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
			    (void *)(uintptr_t)portioc32.ioc_pkeys, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			size = sizeof (ibport_ioctl32_t);
			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
		case DDI_MODEL_NONE:
			size = sizeof (ibport_ioctl_t);
			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc.ibdioc.ioc_status = 0;
			portioc.ibdioc.ioc_portnum = port_state->id_port;
			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
			if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
				rval = EINVAL;
				size = sizeof (ibd_ioctl_t);
				portioc.ibdioc.ioc_status =
				    IBD_INVALID_PKEY_TBL_SIZE;
				if (ddi_copyout((void *)&portioc.ibdioc,
				    (void *)arg, size, mode)) {
					rval = EFAULT;
					goto fail;
				}
				goto fail;
			}
			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
			    (void *)(portioc.ioc_pkeys), size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			size = sizeof (ibport_ioctl_t);
			if (ddi_copyout((void *)&portioc, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
#else	/* ! _MULTI_DATAMODEL */
		size = sizeof (ibport_ioctl_t);
		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
			rval = EFAULT;
			goto fail;
		}
		portioc.ibdioc.ioc_status = 0;
		portioc.ibdioc.ioc_portnum = port_state->id_port;
		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
		if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
			rval = EINVAL;
			size = sizeof (ibd_ioctl_t);
			portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
			if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
			    size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			goto fail;
		}
		size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
		if (ddi_copyout((void *)pinfop->p_pkey_tbl,
		    (void *)(portioc.ioc_pkeys), size, mode)) {
			rval = EFAULT;
			goto fail;
		}
		size = sizeof (ibport_ioctl_t);
		if (ddi_copyout((void *)&portioc, (void *)arg, size,
		    mode)) {
			rval = EFAULT;
			goto fail;
		}
#endif /* _MULTI_DATAMODEL */

		break;

	case IBD_INFO_CMD_PKEYTBLSZ:
		if ((cmd.ioc_port_inst < 0) || ((port_state =
		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
			DPRINT(10, "ibd_get_partition_info: failed to get"
			    " state %d", cmd.ioc_port_inst);
			size = sizeof (ibd_ioctl_t);
			cmd.ioc_status = IBD_INVALID_PORT_INST;
			if (ddi_copyout((void *)&cmd, (void *)arg, size,
			    mode)) {
				return (EFAULT);
			}
			return (EINVAL);
		}
		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
		    port_state->id_port, &pinfop, &psize, &pinfosz);
		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
			rval = EINVAL;
			goto fail;
		}
#ifdef _MULTI_DATAMODEL
		switch (ddi_model_convert_from(mode & FMODELS)) {
		case DDI_MODEL_ILP32: {
			size = sizeof (ibport_ioctl32_t);
			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc32.ibdioc.ioc_status = 0;
			portioc32.ibdioc.ioc_portnum = port_state->id_port;
			portioc32.ibdioc.ioc_hcaguid =
			    port_state->id_hca_guid;
			portioc32.ibdioc.ioc_portguid =
			    port_state->id_port_guid;
			portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
		case DDI_MODEL_NONE:
			size = sizeof (ibport_ioctl_t);
			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc.ibdioc.ioc_status = 0;
			portioc.ibdioc.ioc_portnum = port_state->id_port;
			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
			portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
			if (ddi_copyout((void *)&portioc, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
#else	/* ! _MULTI_DATAMODEL */
		size = sizeof (ibport_ioctl_t);
		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
			rval = EFAULT;
			goto fail;
		}
		portioc.ibdioc.ioc_status = 0;
		portioc.ibdioc.ioc_portnum = port_state->id_port;
		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
		portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
		if (ddi_copyout((void *)&portioc, (void *)arg, size,
		    mode)) {
			rval = EFAULT;
			goto fail;
		}
#endif /* _MULTI_DATAMODEL */
		break;

	default:
		return (EINVAL);

	}	/* switch (cmd.ioc_info_cmd) */
fail:
	if (pinfop) {
		ibt_free_portinfo(pinfop, pinfosz);
	}
	return (rval);
}

/* ARGSUSED */
static void
ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	link_state_t lstate;

	switch (code) {
	case IBT_EVENT_PORT_UP:
	case IBT_ERROR_PORT_DOWN:
		if (ibd_get_port_state(state, &lstate) != 0)
			break;

		if (state->id_link_state != lstate) {
			state->id_link_state = lstate;
			mac_link_update(state->id_mh, lstate);
		}
		break;
	default:
		break;
	}
}

static int
ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
{
	ibt_hca_portinfo_t *port_infop;
	uint_t psize, port_infosz;
	ibt_status_t ret;

	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &port_infop, &psize, &port_infosz);
	if ((ret != IBT_SUCCESS) || (psize != 1))
		return (-1);

	state->id_sgid = *port_infop->p_sgid_tbl;
	state->id_link_speed = ibd_get_portspeed(state);

	if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
		*lstate = LINK_STATE_UP;
	else
		*lstate = LINK_STATE_DOWN;

	ibt_free_portinfo(port_infop, port_infosz);
	return (0);
}
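/*
 * ibd_port_attach() below reads three per-instance properties -
 * "port-number", "hca-guid" and "port-guid" - that its parent nexus is
 * expected to have created; a zero value for any of them fails the
 * attach. Note that a port instance only tracks and reports link state:
 * it registers with the mac layer, but ibd_m_tx() discards any packet
 * chain handed to an IBD_PORT_DRIVER instance.
 */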
static int
ibd_port_attach(dev_info_t *dip)
{
	ibd_state_t *state;
	link_state_t lstate;
	int instance;
	ibt_status_t ret;

	/*
	 * Allocate softstate structure
	 */
	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
		DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
		return (DDI_FAILURE);
	}

	state = ddi_get_soft_state(ibd_list, instance);

	state->id_dip = dip;
	state->id_type = IBD_PORT_DRIVER;

	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
	    "port-number", 0)) == 0) {
		DPRINT(10, "ibd_port_attach: invalid port number (%d)",
		    state->id_port);
		goto done;
	}
	if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
	    "hca-guid", 0)) == 0) {
		DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
		    state->id_hca_guid);
		goto done;
	}
	if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
	    "port-guid", 0)) == 0) {
		DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
		    state->id_port_guid);
		goto done;
	}

	/*
	 * Attach to IBTL
	 */
	if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
		    ret);
		goto done;
	}

	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;

	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
	    &state->id_hca_hdl)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
		    ret);
		goto done;
	}
	state->id_mac_state |= IBD_DRV_HCA_OPENED;

	/* Update link status */
	if (ibd_get_port_state(state, &lstate) != 0) {
		DPRINT(10, "ibd_port_attach: ibd_get_port_state() failed");
		goto done;
	}
	state->id_link_state = lstate;

	/*
	 * Register ibd interfaces with the Nemo framework
	 */
	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
		DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
		goto done;
	}
	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;

	mac_link_update(state->id_mh, lstate);

	return (DDI_SUCCESS);
done:
	(void) ibd_port_unattach(state, dip);
	return (DDI_FAILURE);
}

static int
ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
{
	int instance;
	uint32_t progress = state->id_mac_state;
	ibt_status_t ret;

	if (progress & IBD_DRV_MAC_REGISTERED) {
		(void) mac_unregister(state->id_mh);
		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
	}

	if (progress & IBD_DRV_HCA_OPENED) {
		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
		    IBT_SUCCESS) {
			ibd_print_warn(state, "failed to close "
			    "HCA device, ret=%d", ret);
		}
		state->id_hca_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
	}

	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
			ibd_print_warn(state,
			    "ibt_detach() failed, ret=%d", ret);
		}
		state->id_ibt_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
	}
	instance = ddi_get_instance(dip);
	ddi_soft_state_free(ibd_list, instance);

	return (DDI_SUCCESS);
}

ibt_status_t
ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
{
	ibd_state_t *state;

	mutex_enter(&ibd_objlist_lock);

	/* Find the ibd state structure corresponding to the partition */
	for (state = ibd_objlist_head; state; state = state->id_next) {
		if (state->id_plinkid == linkid) {
			break;
		}
	}

	if (state == NULL) {
		mutex_exit(&ibd_objlist_lock);
		return (IBT_NO_SUCH_OBJECT);
	}

	attr->pa_dlinkid = state->id_dlinkid;
	attr->pa_plinkid = state->id_plinkid;
	attr->pa_port = state->id_port;
	attr->pa_hca_guid = state->id_hca_guid;
	attr->pa_port_guid = state->id_port_guid;
	attr->pa_pkey = state->id_pkey;

	mutex_exit(&ibd_objlist_lock);

	return (IBT_SUCCESS);
}
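/*
 * ibd_get_all_part_attr() below allocates the attribute array with
 * KM_SLEEP, so callers own the memory and must release it. A minimal
 * caller sketch (illustrative; not a consumer that exists in this file):
 *
 *	ibt_part_attr_t *attrs;
 *	int nparts;
 *
 *	if (ibd_get_all_part_attr(&attrs, &nparts) == IBT_SUCCESS) {
 *		-- use attrs[0 .. nparts - 1] --
 *		if (nparts > 0)
 *			kmem_free(attrs, sizeof (ibt_part_attr_t) * nparts);
 *	}
 */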
ibt_status_t
ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
{
	ibd_state_t *state;
	int n = 0;
	ibt_part_attr_t *attr;

	mutex_enter(&ibd_objlist_lock);

	for (state = ibd_objlist_head; state; state = state->id_next)
		n++;

	*nparts = n;
	if (n == 0) {
		*attr_list = NULL;
		mutex_exit(&ibd_objlist_lock);
		return (IBT_SUCCESS);
	}

	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
	attr = *attr_list;
	for (state = ibd_objlist_head; state; state = state->id_next) {
#ifdef DEBUG
		ASSERT(n > 0);
		n--;
#endif
		attr->pa_dlinkid = state->id_dlinkid;
		attr->pa_plinkid = state->id_plinkid;
		attr->pa_port = state->id_port;
		attr->pa_hca_guid = state->id_hca_guid;
		attr->pa_port_guid = state->id_port_guid;
		attr->pa_pkey = state->id_pkey;
		attr++;
	}

	mutex_exit(&ibd_objlist_lock);
	return (IBT_SUCCESS);
}