1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip6.h> /* for ip6_t */ 54 #include <inet/tcp.h> /* for tcph_t */ 55 #include <netinet/icmp6.h> /* for icmp6_t */ 56 #include <sys/callb.h> 57 #include <sys/modhash.h> 58 59 #include <sys/ib/clients/ibd/ibd.h> 60 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 61 #include <sys/note.h> 62 #include <sys/multidata.h> 63 64 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 65 66 /* 67 * Per-interface tunables 68 * 69 * ibd_tx_copy_thresh 70 * This sets the threshold at which ibd will attempt to do a bcopy of the 71 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 72 * is restricted by various parameters, so setting of this value must be 73 * made after careful considerations only. For instance, IB HCAs currently 74 * impose a relatively small limit (when compared to ethernet NICs) on the 75 * length of the SGL for transmit. On the other hand, the ip stack could 76 * send down mp chains that are quite long when LSO is enabled. 77 * 78 * ibd_num_swqe 79 * Number of "send WQE" elements that will be allocated and used by ibd. 80 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 81 * buffer in each of these send wqes must be taken into account. This 82 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 83 * currently set to the same value of ibd_tx_copy_thresh, but may be 84 * changed independently if needed). 85 * 86 * ibd_num_rwqe 87 * Number of "receive WQE" elements that will be allocated and used by 88 * ibd. This parameter is limited by the maximum channel size of the HCA. 89 * Each buffer in the receive wqe will be of MTU size. 
90 * 91 * ibd_num_lso_bufs 92 * Number of "larger-than-MTU" copy buffers to use for cases when the 93 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 94 * and too large to be used with regular MTU-sized copy buffers. It is 95 * not recommended to tune this variable without understanding the 96 * application environment and/or memory resources. The size of each of 97 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 98 * 99 * ibd_num_ah 100 * Number of AH cache entries to allocate 101 * 102 * ibd_hash_size 103 * Hash table size for the active AH list 104 * 105 * ibd_separate_cqs 106 * ibd_txcomp_poll 107 * These boolean variables (1 or 0) may be used to tune the behavior of 108 * ibd in managing the send and receive completion queues and in deciding 109 * whether or not transmit completions should be polled or interrupt 110 * driven (when the completion queues are separate). If both the completion 111 * queues are interrupt driven, it may not be possible for the handlers to 112 * be invoked concurrently, depending on how the interrupts are tied on 113 * the PCI intr line. Note that some combination of these two parameters 114 * may not be meaningful (and therefore not allowed). 115 * 116 * ibd_tx_softintr 117 * ibd_rx_softintr 118 * The softintr mechanism allows ibd to avoid event queue overflows if 119 * the receive/completion handlers are to be expensive. These are enabled 120 * by default. 121 * 122 * ibd_log_sz 123 * This specifies the size of the ibd log buffer in bytes. The buffer is 124 * allocated and logging is enabled only when IBD_LOGGING is defined. 125 * 126 */ 127 uint_t ibd_tx_copy_thresh = 0x1000; 128 uint_t ibd_num_swqe = 4000; 129 uint_t ibd_num_rwqe = 4000; 130 uint_t ibd_num_lso_bufs = 0x400; 131 uint_t ibd_num_ah = 64; 132 uint_t ibd_hash_size = 32; 133 uint_t ibd_separate_cqs = 1; 134 uint_t ibd_txcomp_poll = 0; 135 uint_t ibd_rx_softintr = 1; 136 uint_t ibd_tx_softintr = 1; 137 uint_t ibd_create_broadcast_group = 1; 138 uint_t ibd_force_lso_disable = 1; 139 #ifdef IBD_LOGGING 140 uint_t ibd_log_sz = 0x20000; 141 #endif 142 143 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 144 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 145 #define IBD_NUM_SWQE ibd_num_swqe 146 #define IBD_NUM_RWQE ibd_num_rwqe 147 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 148 #define IBD_NUM_AH ibd_num_ah 149 #define IBD_HASH_SIZE ibd_hash_size 150 #ifdef IBD_LOGGING 151 #define IBD_LOG_SZ ibd_log_sz 152 #endif 153 154 /* 155 * Receive CQ moderation parameters: NOT tunables 156 */ 157 static uint_t ibd_rxcomp_count = 4; 158 static uint_t ibd_rxcomp_usec = 10; 159 160 /* 161 * Send CQ moderation parameters: NOT tunables 162 */ 163 #define IBD_TXCOMP_COUNT 10 164 #define IBD_TXCOMP_USEC 300 165 166 /* 167 * Thresholds 168 * 169 * When waiting for resources (swqes or lso buffers) to become available, 170 * the first two thresholds below determine how long to wait before informing 171 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 172 * determines how low the available swqes should go before we start polling 173 * the completion queue. 174 */ 175 #define IBD_FREE_LSOS_THRESH 8 176 #define IBD_FREE_SWQES_THRESH 20 177 #define IBD_TX_POLL_THRESH 80 178 179 /* 180 * When doing multiple-send-wr or multiple-recv-wr posts, this value 181 * determines how many to do at a time (in a single ibt_post_send/recv). 
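 *
 * As a rough sketch only (not the driver's actual post loop; "ud_chan",
 * "wrs" and "npending" are hypothetical names for the UD channel handle,
 * the prepared ibt_send_wr_t array and the number of queued sends), a
 * batched post with the IBTF multi-WR interface looks like:
 *
 *	n = MIN(npending, IBD_MAX_POST_MULTIPLE);
 *	if (ibt_post_send(ud_chan, wrs, n, &posted) != IBT_SUCCESS)
 *		only "posted" WRs were accepted, so the rest are requeued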
182 */ 183 #define IBD_MAX_POST_MULTIPLE 4 184 185 /* 186 * Maximum length for returning chained mps back to crossbow 187 */ 188 #define IBD_MAX_RX_MP_LEN 16 189 190 /* 191 * LSO parameters 192 */ 193 #define IBD_LSO_MAXLEN 65536 194 #define IBD_LSO_BUFSZ 8192 195 #define IBD_PROP_LSO_POLICY "lso-policy" 196 197 /* 198 * Completion queue polling control 199 */ 200 #define IBD_RX_CQ_POLLING 0x1 201 #define IBD_TX_CQ_POLLING 0x2 202 #define IBD_REDO_RX_CQ_POLLING 0x4 203 #define IBD_REDO_TX_CQ_POLLING 0x8 204 205 /* 206 * Flag bits for resources to reap 207 */ 208 #define IBD_RSRC_SWQE 0x1 209 #define IBD_RSRC_LSOBUF 0x2 210 211 /* 212 * Async operation types 213 */ 214 #define IBD_ASYNC_GETAH 1 215 #define IBD_ASYNC_JOIN 2 216 #define IBD_ASYNC_LEAVE 3 217 #define IBD_ASYNC_PROMON 4 218 #define IBD_ASYNC_PROMOFF 5 219 #define IBD_ASYNC_REAP 6 220 #define IBD_ASYNC_TRAP 7 221 #define IBD_ASYNC_SCHED 8 222 #define IBD_ASYNC_LINK 9 223 #define IBD_ASYNC_EXIT 10 224 225 /* 226 * Async operation states 227 */ 228 #define IBD_OP_NOTSTARTED 0 229 #define IBD_OP_ONGOING 1 230 #define IBD_OP_COMPLETED 2 231 #define IBD_OP_ERRORED 3 232 #define IBD_OP_ROUTERED 4 233 234 /* 235 * State of IBD driver initialization during attach/m_start 236 */ 237 #define IBD_DRV_STATE_INITIALIZED 0x00001 238 #define IBD_DRV_RXINTR_ADDED 0x00002 239 #define IBD_DRV_TXINTR_ADDED 0x00004 240 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 241 #define IBD_DRV_HCA_OPENED 0x00010 242 #define IBD_DRV_PD_ALLOCD 0x00020 243 #define IBD_DRV_MAC_REGISTERED 0x00040 244 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 245 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 246 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 247 #define IBD_DRV_CQS_ALLOCD 0x00400 248 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 249 #define IBD_DRV_TXLIST_ALLOCD 0x01000 250 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 251 #define IBD_DRV_RXLIST_ALLOCD 0x04000 252 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 253 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 254 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 255 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 256 #define IBD_DRV_STARTED 0x80000 257 258 /* 259 * Start/stop in-progress flags; note that restart must always remain 260 * the OR of start and stop flag values. 
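 *
 * In other words, the values below keep IBD_DRV_RESTART_IN_PROGRESS equal
 * to (IBD_DRV_START_IN_PROGRESS | IBD_DRV_STOP_IN_PROGRESS), i.e.
 * 0x10000000 | 0x20000000 == 0x30000000, so code that tests for either the
 * start or the stop flag also sees a restart in progress.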
261 */ 262 #define IBD_DRV_START_IN_PROGRESS 0x10000000 263 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 264 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 265 266 /* 267 * Miscellaneous constants 268 */ 269 #define IBD_SEND 0 270 #define IBD_RECV 1 271 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 272 #define IBD_DEF_MAX_SDU 2044 273 #define IBD_DEFAULT_QKEY 0xB1B 274 #ifdef IBD_LOGGING 275 #define IBD_DMAX_LINE 100 276 #endif 277 278 /* 279 * Enumerations for link states 280 */ 281 typedef enum { 282 IBD_LINK_DOWN, 283 IBD_LINK_UP, 284 IBD_LINK_UP_ABSENT 285 } ibd_link_op_t; 286 287 /* 288 * Driver State Pointer 289 */ 290 void *ibd_list; 291 292 /* 293 * Logging 294 */ 295 #ifdef IBD_LOGGING 296 kmutex_t ibd_lbuf_lock; 297 uint8_t *ibd_lbuf; 298 uint32_t ibd_lbuf_ndx; 299 #endif 300 301 /* 302 * Required system entry points 303 */ 304 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 305 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 306 307 /* 308 * Required driver entry points for GLDv3 309 */ 310 static int ibd_m_stat(void *, uint_t, uint64_t *); 311 static int ibd_m_start(void *); 312 static void ibd_m_stop(void *); 313 static int ibd_m_promisc(void *, boolean_t); 314 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 315 static int ibd_m_unicst(void *, const uint8_t *); 316 static mblk_t *ibd_m_tx(void *, mblk_t *); 317 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 318 319 /* 320 * Private driver entry points for GLDv3 321 */ 322 323 /* 324 * Initialization 325 */ 326 static int ibd_state_init(ibd_state_t *, dev_info_t *); 327 static int ibd_init_txlist(ibd_state_t *); 328 static int ibd_init_rxlist(ibd_state_t *); 329 static int ibd_acache_init(ibd_state_t *); 330 #ifdef IBD_LOGGING 331 static void ibd_log_init(void); 332 #endif 333 334 /* 335 * Termination/cleanup 336 */ 337 static void ibd_state_fini(ibd_state_t *); 338 static void ibd_fini_txlist(ibd_state_t *); 339 static void ibd_fini_rxlist(ibd_state_t *); 340 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 341 static void ibd_acache_fini(ibd_state_t *); 342 #ifdef IBD_LOGGING 343 static void ibd_log_fini(void); 344 #endif 345 346 /* 347 * Allocation/acquire/map routines 348 */ 349 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 350 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 351 static int ibd_alloc_tx_copybufs(ibd_state_t *); 352 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 353 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 354 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 355 uint32_t *); 356 357 /* 358 * Free/release/unmap routines 359 */ 360 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 361 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 362 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 363 static void ibd_free_tx_copybufs(ibd_state_t *); 364 static void ibd_free_tx_lsobufs(ibd_state_t *); 365 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 366 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 367 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 368 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 369 370 /* 371 * Handlers/callback routines 372 */ 373 static uint_t ibd_intr(char *); 374 static uint_t ibd_tx_recycle(char *); 375 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 376 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 377 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 378 static uint_t 
ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 379 static void ibd_freemsg_cb(char *); 380 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 381 ibt_async_event_t *); 382 static void ibd_snet_notices_handler(void *, ib_gid_t, 383 ibt_subnet_event_code_t, ibt_subnet_event_t *); 384 385 /* 386 * Send/receive routines 387 */ 388 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 389 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 390 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); 391 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 392 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 393 394 /* 395 * Threads 396 */ 397 static void ibd_async_work(ibd_state_t *); 398 399 /* 400 * Async tasks 401 */ 402 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 403 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 404 static void ibd_async_setprom(ibd_state_t *); 405 static void ibd_async_unsetprom(ibd_state_t *); 406 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 407 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 408 static void ibd_async_txsched(ibd_state_t *); 409 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 410 411 /* 412 * Async task helpers 413 */ 414 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 415 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 416 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 417 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 418 ipoib_mac_t *, ipoib_mac_t *); 419 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 420 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 421 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 422 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 423 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 424 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 425 static uint64_t ibd_get_portspeed(ibd_state_t *); 426 static boolean_t ibd_async_safe(ibd_state_t *); 427 static void ibd_async_done(ibd_state_t *); 428 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 429 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 430 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 431 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 432 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 433 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 434 435 /* 436 * Helpers for attach/start routines 437 */ 438 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 439 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 440 static int ibd_unattach(ibd_state_t *, dev_info_t *); 441 static int ibd_get_port_details(ibd_state_t *); 442 static int ibd_alloc_cqs(ibd_state_t *); 443 static int ibd_setup_ud_channel(ibd_state_t *); 444 static int ibd_start(ibd_state_t *); 445 static int ibd_undo_start(ibd_state_t *, link_state_t); 446 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 447 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 448 449 450 /* 451 * Miscellaneous helpers 452 */ 453 static int ibd_sched_poll(ibd_state_t *, int, int); 454 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 455 static int ibd_resume_transmission(ibd_state_t *); 456 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, 
ibt_ud_dest_hdl_t); 457 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 458 static void *list_get_head(list_t *); 459 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 460 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 461 static void ibd_print_warn(ibd_state_t *, char *, ...); 462 #ifdef IBD_LOGGING 463 static void ibd_log(const char *, ...); 464 #endif 465 466 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 467 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 468 469 /* Module Driver Info */ 470 static struct modldrv ibd_modldrv = { 471 &mod_driverops, /* This one is a driver */ 472 "InfiniBand GLDv3 Driver", /* short description */ 473 &ibd_dev_ops /* driver specific ops */ 474 }; 475 476 /* Module Linkage */ 477 static struct modlinkage ibd_modlinkage = { 478 MODREV_1, (void *)&ibd_modldrv, NULL 479 }; 480 481 /* 482 * Module (static) info passed to IBTL during ibt_attach 483 */ 484 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 485 IBTI_V_CURR, 486 IBT_NETWORK, 487 ibd_async_handler, 488 NULL, 489 "IPIB" 490 }; 491 492 /* 493 * GLDv3 entry points 494 */ 495 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 496 static mac_callbacks_t ibd_m_callbacks = { 497 IBD_M_CALLBACK_FLAGS, 498 ibd_m_stat, 499 ibd_m_start, 500 ibd_m_stop, 501 ibd_m_promisc, 502 ibd_m_multicst, 503 ibd_m_unicst, 504 ibd_m_tx, 505 NULL, 506 ibd_m_getcapab 507 }; 508 509 /* 510 * Fill/clear <scope> and <p_key> in multicast/broadcast address 511 */ 512 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 513 { \ 514 *(uint32_t *)((char *)(maddr) + 4) |= \ 515 htonl((uint32_t)(scope) << 16); \ 516 *(uint32_t *)((char *)(maddr) + 8) |= \ 517 htonl((uint32_t)(pkey) << 16); \ 518 } 519 520 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 521 { \ 522 *(uint32_t *)((char *)(maddr) + 4) &= \ 523 htonl(~((uint32_t)0xF << 16)); \ 524 *(uint32_t *)((char *)(maddr) + 8) &= \ 525 htonl(~((uint32_t)0xFFFF << 16)); \ 526 } 527 528 /* 529 * Rudimentary debugging support 530 */ 531 #ifdef DEBUG 532 int ibd_debuglevel = 100; 533 static void 534 debug_print(int l, char *fmt, ...) 535 { 536 va_list ap; 537 538 if (l < ibd_debuglevel) 539 return; 540 va_start(ap, fmt); 541 vcmn_err(CE_CONT, fmt, ap); 542 va_end(ap); 543 } 544 #define DPRINT debug_print 545 #else 546 #define DPRINT 547 #endif 548 549 /* 550 * Common routine to print warning messages; adds in hca guid, port number 551 * and pkey to be able to identify the IBA interface. 552 */ 553 static void 554 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 555 { 556 ib_guid_t hca_guid; 557 char ibd_print_buf[256]; 558 int len; 559 va_list ap; 560 561 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 562 0, "hca-guid", 0); 563 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 564 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 565 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 566 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 567 va_start(ap, fmt); 568 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 569 fmt, ap); 570 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 571 va_end(ap); 572 } 573 574 /* 575 * Warlock directives 576 */ 577 578 /* 579 * id_lso_lock 580 * 581 * state->id_lso->bkt_nfree may be accessed without a lock to 582 * determine the threshold at which we have to ask the nw layer 583 * to resume transmission (see ibd_resume_transmission()). 
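 *
 * A sketch of the kind of lock-free check this permits (the real logic
 * lives in ibd_resume_transmission(); id_mh is assumed here to be the
 * mac handle registered with GLDv3):
 *
 *	if (state->id_lso != NULL &&
 *	    state->id_lso->bkt_nfree >= IBD_FREE_LSOS_THRESH)
 *		mac_tx_update(state->id_mh);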
584 */ 585 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 586 ibd_state_t::id_lso)) 587 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 588 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 589 590 /* 591 * id_cq_poll_lock 592 */ 593 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 594 ibd_state_t::id_cq_poll_busy)) 595 596 /* 597 * id_txpost_lock 598 */ 599 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 600 ibd_state_t::id_tx_head)) 601 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 602 ibd_state_t::id_tx_busy)) 603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 604 ibd_state_t::id_tx_tailp)) 605 606 /* 607 * id_rxpost_lock 608 */ 609 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 610 ibd_state_t::id_rx_head)) 611 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 612 ibd_state_t::id_rx_busy)) 613 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 614 ibd_state_t::id_rx_tailp)) 615 616 /* 617 * id_acache_req_lock 618 */ 619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 620 ibd_state_t::id_acache_req_cv)) 621 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 622 ibd_state_t::id_req_list)) 623 624 /* 625 * id_ac_mutex 626 * 627 * This mutex is actually supposed to protect id_ah_op as well, 628 * but this path of the code isn't clean (see update of id_ah_op 629 * in ibd_async_acache(), immediately after the call to 630 * ibd_async_mcache()). For now, we'll skip this check by 631 * declaring that id_ah_op is protected by some internal scheme 632 * that warlock isn't aware of. 633 */ 634 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 635 ibd_state_t::id_ah_active)) 636 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 637 ibd_state_t::id_ah_free)) 638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 639 ibd_state_t::id_ah_addr)) 640 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 641 ibd_state_t::id_ah_op)) 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 643 ibd_state_t::id_ah_error)) 644 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 645 646 /* 647 * id_mc_mutex 648 */ 649 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 650 ibd_state_t::id_mc_full)) 651 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 652 ibd_state_t::id_mc_non)) 653 654 /* 655 * id_trap_lock 656 */ 657 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 658 ibd_state_t::id_trap_cv)) 659 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 660 ibd_state_t::id_trap_stop)) 661 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 662 ibd_state_t::id_trap_inprog)) 663 664 /* 665 * id_prom_op 666 */ 667 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 668 ibd_state_t::id_prom_op)) 669 670 /* 671 * id_sched_lock 672 */ 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 674 ibd_state_t::id_sched_needed)) 675 676 /* 677 * id_link_mutex 678 */ 679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 680 ibd_state_t::id_link_state)) 681 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 682 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 683 ibd_state_t::id_link_speed)) 684 685 /* 686 * id_tx_list.dl_mutex 687 */ 688 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 689 ibd_state_t::id_tx_list.dl_head)) 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 691 ibd_state_t::id_tx_list.dl_tail)) 692 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 693 ibd_state_t::id_tx_list.dl_pending_sends)) 694 _NOTE(SCHEME_PROTECTS_DATA("atomic 
or dl mutex or single thr", 695 ibd_state_t::id_tx_list.dl_cnt)) 696 697 /* 698 * id_rx_list.dl_mutex 699 */ 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 701 ibd_state_t::id_rx_list.dl_head)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 703 ibd_state_t::id_rx_list.dl_tail)) 704 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 705 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 706 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 707 ibd_state_t::id_rx_list.dl_cnt)) 708 709 710 /* 711 * Items protected by atomic updates 712 */ 713 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 714 ibd_state_s::id_brd_rcv 715 ibd_state_s::id_brd_xmt 716 ibd_state_s::id_multi_rcv 717 ibd_state_s::id_multi_xmt 718 ibd_state_s::id_num_intrs 719 ibd_state_s::id_rcv_bytes 720 ibd_state_s::id_rcv_pkt 721 ibd_state_s::id_tx_short 722 ibd_state_s::id_xmt_bytes 723 ibd_state_s::id_xmt_pkt)) 724 725 /* 726 * Non-mutex protection schemes for data elements. Almost all of 727 * these are non-shared items. 728 */ 729 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 730 callb_cpr 731 ib_gid_s 732 ib_header_info 733 ibd_acache_rq 734 ibd_acache_s::ac_mce 735 ibd_mcache::mc_fullreap 736 ibd_mcache::mc_jstate 737 ibd_mcache::mc_req 738 ibd_rwqe_s 739 ibd_swqe_s 740 ibd_wqe_s 741 ibt_wr_ds_s::ds_va 742 ibt_wr_lso_s 743 ipoib_mac::ipoib_qpn 744 mac_capab_lso_s 745 msgb::b_next 746 msgb::b_rptr 747 msgb::b_wptr)) 748 749 int 750 _init() 751 { 752 int status; 753 754 /* 755 * Sanity check some parameter settings. Tx completion polling 756 * only makes sense with separate CQs for Tx and Rx. 757 */ 758 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 759 cmn_err(CE_NOTE, "!ibd: %s", 760 "Setting ibd_txcomp_poll = 0 for combined CQ"); 761 ibd_txcomp_poll = 0; 762 } 763 764 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 765 if (status != 0) { 766 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 767 return (status); 768 } 769 770 mac_init_ops(&ibd_dev_ops, "ibd"); 771 status = mod_install(&ibd_modlinkage); 772 if (status != 0) { 773 DPRINT(10, "_init:failed in mod_install()"); 774 ddi_soft_state_fini(&ibd_list); 775 mac_fini_ops(&ibd_dev_ops); 776 return (status); 777 } 778 779 #ifdef IBD_LOGGING 780 ibd_log_init(); 781 #endif 782 return (0); 783 } 784 785 int 786 _info(struct modinfo *modinfop) 787 { 788 return (mod_info(&ibd_modlinkage, modinfop)); 789 } 790 791 int 792 _fini() 793 { 794 int status; 795 796 status = mod_remove(&ibd_modlinkage); 797 if (status != 0) 798 return (status); 799 800 mac_fini_ops(&ibd_dev_ops); 801 ddi_soft_state_fini(&ibd_list); 802 #ifdef IBD_LOGGING 803 ibd_log_fini(); 804 #endif 805 return (0); 806 } 807 808 /* 809 * Convert the GID part of the mac address from network byte order 810 * to host order. 811 */ 812 static void 813 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 814 { 815 ib_sn_prefix_t nbopref; 816 ib_guid_t nboguid; 817 818 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 819 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 820 dgid->gid_prefix = b2h64(nbopref); 821 dgid->gid_guid = b2h64(nboguid); 822 } 823 824 /* 825 * Create the IPoIB address in network byte order from host order inputs. 
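 *
 * For example, ibd_get_allroutergroup() below uses this to construct the
 * all-routers redirect address (arguments shown schematically):
 *
 *	ibd_h2n_mac(rmac, IB_MC_QPN, prefix_with_scope_and_pkey,
 *	    INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP);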
826 */
827 static void
828 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
829 ib_guid_t guid)
830 {
831 ib_sn_prefix_t nbopref;
832 ib_guid_t nboguid;
833
834 mac->ipoib_qpn = htonl(qpn);
835 nbopref = h2b64(prefix);
836 nboguid = h2b64(guid);
837 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
838 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
839 }
840
841 /*
842 * Send to the appropriate all-routers group when the IBA multicast group
843 * does not exist, based on whether the target group is v4 or v6.
844 */
845 static boolean_t
846 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
847 ipoib_mac_t *rmac)
848 {
849 boolean_t retval = B_TRUE;
850 uint32_t adjscope = state->id_scope << 16;
851 uint32_t topword;
852
853 /*
854 * Copy the first 4 bytes in without assuming any alignment of
855 * input mac address; this will have IPoIB signature, flags and
856 * scope bits.
857 */
858 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
859 topword = ntohl(topword);
860
861 /*
862 * Generate proper address for IPv4/v6, adding in the Pkey properly.
863 */
864 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
865 (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
866 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
867 ((uint32_t)(state->id_pkey << 16))),
868 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
869 else
870 /*
871 * Does not have proper bits in the mgid address.
872 */
873 retval = B_FALSE;
874
875 return (retval);
876 }
877
878 /*
879 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
880 * the front of the optional src/tgt link layer address. Right now Solaris
881 * inserts the padding at the end by default. The routine that does this is
882 * nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t.
883 * So when the packet comes down from the IP layer to the IBD driver, it is
884 * in the following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
885 * This OPT_ND_HDR_T is 2 bytes in size, followed by [22 bytes of ipoib_machdr].
886 * As a result machdr is not 4 byte aligned and has 2 bytes of padding at the end.
887 *
888 * The send routine in the IBD driver changes this packet as follows:
889 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
890 * followed by [22 bytes of ipoib_machdr], resulting in a 4 byte aligned
891 * machdr.
892 *
893 * At the receiving side, ibd_process_rx again takes the above packet and
894 * moves the two bytes of front padding back to the end. This
895 * is because the IP layer does not understand padding at the front.
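 *
 * In short: on the send side the IPOIB_ADDRL (20) byte link-layer address
 * is shifted two bytes later within the option (the two vacated bytes are
 * zeroed), and on the receive side it is shifted two bytes back; the
 * IBD_PAD_NSNA() macro below implements both directions.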
896 */ 897 #define IBD_PAD_NSNA(ip6h, len, type) { \ 898 uchar_t *nd_lla_ptr; \ 899 icmp6_t *icmp6; \ 900 nd_opt_hdr_t *opt; \ 901 int i; \ 902 \ 903 icmp6 = (icmp6_t *)&ip6h[1]; \ 904 len -= sizeof (nd_neighbor_advert_t); \ 905 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 906 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 907 (len != 0)) { \ 908 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 909 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 910 ASSERT(opt != NULL); \ 911 nd_lla_ptr = (uchar_t *)&opt[1]; \ 912 if (type == IBD_SEND) { \ 913 for (i = IPOIB_ADDRL; i > 0; i--) \ 914 *(nd_lla_ptr + i + 1) = \ 915 *(nd_lla_ptr + i - 1); \ 916 } else { \ 917 for (i = 0; i < IPOIB_ADDRL; i++) \ 918 *(nd_lla_ptr + i) = \ 919 *(nd_lla_ptr + i + 2); \ 920 } \ 921 *(nd_lla_ptr + i) = 0; \ 922 *(nd_lla_ptr + i + 1) = 0; \ 923 } \ 924 } 925 926 /* 927 * Address handle entries maintained by the driver are kept in the 928 * free and active lists. Each entry starts out in the free list; 929 * it migrates to the active list when primed using ibt_get_paths() 930 * and ibt_modify_ud_dest() for transmission to a specific destination. 931 * In the active list, the entry has a reference count indicating the 932 * number of ongoing/uncompleted transmits that reference it. The 933 * entry is left in the active list even after the reference count 934 * goes to 0, since successive transmits can find it there and do 935 * not need to set up another entry (ie the path information is 936 * cached using the active list). Entries on the active list are 937 * also hashed using the destination link address as a key for faster 938 * lookups during transmits. 939 * 940 * For any destination address (unicast or multicast, whatever the 941 * join states), there will be at most one entry in the active list. 942 * Entries with a 0 reference count on the active list can be reused 943 * for a transmit to a new destination, if the free list is empty. 944 * 945 * The AH free list insertion/deletion is protected with the id_ac_mutex, 946 * since the async thread and Tx callback handlers insert/delete. The 947 * active list does not need a lock (all operations are done by the 948 * async thread) but updates to the reference count are atomically 949 * done (increments done by Tx path, decrements by the Tx callback handler). 950 */ 951 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 952 list_insert_head(&state->id_ah_free, ce) 953 #define IBD_ACACHE_GET_FREE(state) \ 954 list_get_head(&state->id_ah_free) 955 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 956 int _ret_; \ 957 list_insert_head(&state->id_ah_active, ce); \ 958 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 959 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 960 ASSERT(_ret_ == 0); \ 961 } 962 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 963 list_remove(&state->id_ah_active, ce); \ 964 (void) mod_hash_remove(state->id_ah_active_hash, \ 965 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 966 } 967 #define IBD_ACACHE_GET_ACTIVE(state) \ 968 list_get_head(&state->id_ah_active) 969 970 /* 971 * Membership states for different mcg's are tracked by two lists: 972 * the "non" list is used for promiscuous mode, when all mcg traffic 973 * needs to be inspected. This type of membership is never used for 974 * transmission, so there can not be an AH in the active list 975 * corresponding to a member in this list. This list does not need 976 * any protection, since all operations are performed by the async 977 * thread. 
978 * 979 * "Full" and "SendOnly" membership is tracked using a single list, 980 * the "full" list. This is because this single list can then be 981 * searched during transmit to a multicast group (if an AH for the 982 * mcg is not found in the active list), since at least one type 983 * of membership must be present before initiating the transmit. 984 * This list is also emptied during driver detach, since sendonly 985 * membership acquired during transmit is dropped at detach time 986 * alongwith ipv4 broadcast full membership. Insert/deletes to 987 * this list are done only by the async thread, but it is also 988 * searched in program context (see multicast disable case), thus 989 * the id_mc_mutex protects the list. The driver detach path also 990 * deconstructs the "full" list, but it ensures that the async 991 * thread will not be accessing the list (by blocking out mcg 992 * trap handling and making sure no more Tx reaping will happen). 993 * 994 * Currently, an IBA attach is done in the SendOnly case too, 995 * although this is not required. 996 */ 997 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 998 list_insert_head(&state->id_mc_full, mce) 999 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1000 list_insert_head(&state->id_mc_non, mce) 1001 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1002 ibd_mcache_find(mgid, &state->id_mc_full) 1003 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1004 ibd_mcache_find(mgid, &state->id_mc_non) 1005 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1006 list_remove(&state->id_mc_full, mce) 1007 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1008 list_remove(&state->id_mc_non, mce) 1009 1010 /* 1011 * AH and MCE active list manipulation: 1012 * 1013 * Multicast disable requests and MCG delete traps are two cases 1014 * where the active AH entry for the mcg (if any unreferenced one exists) 1015 * will be moved to the free list (to force the next Tx to the mcg to 1016 * join the MCG in SendOnly mode). Port up handling will also move AHs 1017 * from active to free list. 1018 * 1019 * In the case when some transmits are still pending on an entry 1020 * for an mcg, but a multicast disable has already been issued on the 1021 * mcg, there are some options to consider to preserve the join state 1022 * to ensure the emitted packet is properly routed on the IBA fabric. 1023 * For the AH, we can 1024 * 1. take out of active list at multicast disable time. 1025 * 2. take out of active list only when last pending Tx completes. 1026 * For the MCE, we can 1027 * 3. take out of active list at multicast disable time. 1028 * 4. take out of active list only when last pending Tx completes. 1029 * 5. move from active list to stale list at multicast disable time. 1030 * We choose to use 2,4. We use option 4 so that if a multicast enable 1031 * is tried before the pending Tx completes, the enable code finds the 1032 * mce in the active list and just has to make sure it will not be reaped 1033 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1034 * a stale list (#5) that would be checked in the enable code would need 1035 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1036 * after the multicast disable would try to put an AH in the active list, 1037 * and associate the mce it finds in the active list to this new AH, 1038 * whereas the mce is already associated with the previous AH (taken off 1039 * the active list), and will be removed once the pending Tx's complete 1040 * (unless a reference count on mce's is implemented). 
One implication of 1041 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1042 * grab new references on the AH, further delaying the leave. 1043 * 1044 * In the case of mcg delete (or create) trap when the port is sendonly 1045 * joined, the AH and MCE handling is different: the AH and MCE has to be 1046 * immediately taken off the active lists (forcing a join and path lookup 1047 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1048 * to an mcg as it is repeatedly created and deleted and goes thru 1049 * reincarnations). 1050 * 1051 * When a port is already sendonly joined, and a multicast enable is 1052 * attempted, the same mce structure is promoted; this ensures only a 1053 * single mce on the active list tracks the most powerful join state. 1054 * 1055 * In the case of port up event handling, the MCE for sendonly membership 1056 * is freed up, and the ACE is put into the free list as soon as possible 1057 * (depending on whether posted Tx's have completed). For fullmembership 1058 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1059 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1060 * done; else the mce is deconstructed (mc_fullreap case). 1061 * 1062 * MCG creation and deletion trap handling: 1063 * 1064 * These traps are unreliable (meaning sometimes the trap might never 1065 * be delivered to the subscribed nodes) and may arrive out-of-order 1066 * since they use UD transport. An alternative to relying on these 1067 * unreliable traps is to poll for mcg presence every so often, but 1068 * instead of doing that, we try to be as conservative as possible 1069 * while handling the traps, and hope that the traps do arrive at 1070 * the subscribed nodes soon. Note that if a node is fullmember 1071 * joined to an mcg, it can not possibly receive a mcg create/delete 1072 * trap for that mcg (by fullmember definition); if it does, it is 1073 * an old trap from a previous incarnation of the mcg. 1074 * 1075 * Whenever a trap is received, the driver cleans up its sendonly 1076 * membership to the group; we choose to do a sendonly leave even 1077 * on a creation trap to handle the case of a prior deletion of the mcg 1078 * having gone unnoticed. Consider an example scenario: 1079 * T1: MCG M is deleted, and fires off deletion trap D1. 1080 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1081 * T3: Node N tries to transmit to M, joining in sendonly mode. 1082 * T4: MCG M is deleted, and fires off deletion trap D2. 1083 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1084 * If the trap is D2, then a LEAVE is not required, since the mcg 1085 * is already deleted; but if it is D1, a LEAVE is required. A safe 1086 * approach is to always LEAVE, but the SM may be confused if it 1087 * receives a LEAVE without a prior JOIN. 1088 * 1089 * Management of the non-membership to an mcg is similar to the above, 1090 * except that if the interface is in promiscuous mode, it is required 1091 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1092 * if the re-join attempt fails (in which case a warning message needs 1093 * to be printed), it is not clear whether it failed due to the mcg not 1094 * existing, or some fabric/hca issues, due to the delayed nature of 1095 * trap delivery. Querying the SA to establish presence/absence of the 1096 * mcg is also racy at best. 
Thus, the driver just prints a warning 1097 * message when it can not rejoin after receiving a create trap, although 1098 * this might be (on rare occassions) a mis-warning if the create trap is 1099 * received after the mcg was deleted. 1100 */ 1101 1102 /* 1103 * Implementation of atomic "recycle" bits and reference count 1104 * on address handles. This utilizes the fact that max reference 1105 * count on any handle is limited by number of send wqes, thus 1106 * high bits in the ac_ref field can be used as the recycle bits, 1107 * and only the low bits hold the number of pending Tx requests. 1108 * This atomic AH reference counting allows the Tx completion 1109 * handler not to acquire the id_ac_mutex to process every completion, 1110 * thus reducing lock contention problems between completion and 1111 * the Tx path. 1112 */ 1113 #define CYCLEVAL 0x80000 1114 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1115 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1116 #define GET_REF(ace) ((ace)->ac_ref) 1117 #define GET_REF_CYCLE(ace) ( \ 1118 /* \ 1119 * Make sure "cycle" bit is set. \ 1120 */ \ 1121 ASSERT(CYCLE_SET(ace)), \ 1122 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1123 ) 1124 #define INC_REF(ace, num) { \ 1125 atomic_add_32(&(ace)->ac_ref, num); \ 1126 } 1127 #define SET_CYCLE_IF_REF(ace) ( \ 1128 CYCLE_SET(ace) ? B_TRUE : \ 1129 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1130 CYCLEVAL ? \ 1131 /* \ 1132 * Clear the "cycle" bit we just set; \ 1133 * ref count known to be 0 from above. \ 1134 */ \ 1135 CLEAR_REFCYCLE(ace), B_FALSE : \ 1136 /* \ 1137 * We set "cycle" bit; let caller know. \ 1138 */ \ 1139 B_TRUE \ 1140 ) 1141 #define DEC_REF_DO_CYCLE(ace) ( \ 1142 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1143 CYCLEVAL ? \ 1144 /* \ 1145 * Ref count known to be 0 from above. \ 1146 */ \ 1147 B_TRUE : \ 1148 B_FALSE \ 1149 ) 1150 1151 static void * 1152 list_get_head(list_t *list) 1153 { 1154 list_node_t *lhead = list_head(list); 1155 1156 if (lhead != NULL) 1157 list_remove(list, lhead); 1158 return (lhead); 1159 } 1160 1161 /* 1162 * This is always guaranteed to be able to queue the work. 1163 */ 1164 static void 1165 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1166 { 1167 /* Initialize request */ 1168 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1169 ptr->rq_op = op; 1170 1171 /* 1172 * Queue provided slot onto request pool. 1173 */ 1174 mutex_enter(&state->id_acache_req_lock); 1175 list_insert_tail(&state->id_req_list, ptr); 1176 1177 /* Go, fetch, async thread */ 1178 cv_signal(&state->id_acache_req_cv); 1179 mutex_exit(&state->id_acache_req_lock); 1180 } 1181 1182 /* 1183 * Main body of the per interface async thread. 1184 */ 1185 static void 1186 ibd_async_work(ibd_state_t *state) 1187 { 1188 ibd_req_t *ptr; 1189 callb_cpr_t cprinfo; 1190 1191 mutex_enter(&state->id_acache_req_lock); 1192 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1193 callb_generic_cpr, "ibd_async_work"); 1194 1195 for (;;) { 1196 ptr = list_get_head(&state->id_req_list); 1197 if (ptr != NULL) { 1198 mutex_exit(&state->id_acache_req_lock); 1199 1200 /* 1201 * Once we have done the operation, there is no 1202 * guarantee the request slot is going to be valid, 1203 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1204 * TRAP). 1205 * 1206 * Perform the request. 
1207 */
1208 switch (ptr->rq_op) {
1209 case IBD_ASYNC_GETAH:
1210 ibd_async_acache(state, &ptr->rq_mac);
1211 break;
1212 case IBD_ASYNC_JOIN:
1213 case IBD_ASYNC_LEAVE:
1214 ibd_async_multicast(state,
1215 ptr->rq_gid, ptr->rq_op);
1216 break;
1217 case IBD_ASYNC_PROMON:
1218 ibd_async_setprom(state);
1219 break;
1220 case IBD_ASYNC_PROMOFF:
1221 ibd_async_unsetprom(state);
1222 break;
1223 case IBD_ASYNC_REAP:
1224 ibd_async_reap_group(state,
1225 ptr->rq_ptr, ptr->rq_gid,
1226 IB_MC_JSTATE_FULL);
1227 /*
1228 * the req buf is part of the mce
1229 * structure, so we do not need
1230 * to free it here.
1231 */
1232 ptr = NULL;
1233 break;
1234 case IBD_ASYNC_TRAP:
1235 ibd_async_trap(state, ptr);
1236 break;
1237 case IBD_ASYNC_SCHED:
1238 ibd_async_txsched(state);
1239 break;
1240 case IBD_ASYNC_LINK:
1241 ibd_async_link(state, ptr);
1242 break;
1243 case IBD_ASYNC_EXIT:
1244 mutex_enter(&state->id_acache_req_lock);
1245 #ifndef __lock_lint
1246 CALLB_CPR_EXIT(&cprinfo);
1247 #else
1248 mutex_exit(&state->id_acache_req_lock);
1249 #endif
1250 return;
1251 }
1252 if (ptr != NULL)
1253 kmem_cache_free(state->id_req_kmc, ptr);
1254
1255 mutex_enter(&state->id_acache_req_lock);
1256 } else {
1257 #ifndef __lock_lint
1258 /*
1259 * Nothing to do: wait till new request arrives.
1260 */
1261 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1262 cv_wait(&state->id_acache_req_cv,
1263 &state->id_acache_req_lock);
1264 CALLB_CPR_SAFE_END(&cprinfo,
1265 &state->id_acache_req_lock);
1266 #endif
1267 }
1268 }
1269
1270 /*NOTREACHED*/
1271 _NOTE(NOT_REACHED)
1272 }
1273
1274 /*
1275 * Return when it is safe to queue requests to the async daemon; primarily
1276 * for subnet trap and async event handling. Disallow requests before the
1277 * daemon is created, and when interface deinitialization starts.
1278 */
1279 static boolean_t
1280 ibd_async_safe(ibd_state_t *state)
1281 {
1282 mutex_enter(&state->id_trap_lock);
1283 if (state->id_trap_stop) {
1284 mutex_exit(&state->id_trap_lock);
1285 return (B_FALSE);
1286 }
1287 state->id_trap_inprog++;
1288 mutex_exit(&state->id_trap_lock);
1289 return (B_TRUE);
1290 }
1291
1292 /*
1293 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1294 * trap or event handling to complete to kill the async thread and deconstruct
1295 * the mcg/ace list.
1296 */
1297 static void
1298 ibd_async_done(ibd_state_t *state)
1299 {
1300 mutex_enter(&state->id_trap_lock);
1301 if (--state->id_trap_inprog == 0)
1302 cv_signal(&state->id_trap_cv);
1303 mutex_exit(&state->id_trap_lock);
1304 }
1305
1306 /*
1307 * Hash functions:
1308 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1309 * ibd_hash_key_cmp: Compares two keys; returns 0 on a match, else 1.
1310 * These operate on mac addresses input into ibd_send, but there is no
1311 * guarantee on the alignment of the ipoib_mac_t structure.
1312 */
1313 /*ARGSUSED*/
1314 static uint_t
1315 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1316 {
1317 ulong_t ptraddr = (ulong_t)key;
1318 uint_t hval;
1319
1320 /*
1321 * If the input address is 4 byte aligned, we can just dereference
1322 * it. This is most common, since IP will send in a 4 byte aligned
1323 * IP header, which implies the 24 byte IPoIB pseudo header will be
1324 * 4 byte aligned too.
1325 */ 1326 if ((ptraddr & 3) == 0) 1327 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1328 1329 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1330 return (hval); 1331 } 1332 1333 static int 1334 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1335 { 1336 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1337 return (0); 1338 else 1339 return (1); 1340 } 1341 1342 /* 1343 * Initialize all the per interface caches and lists; AH cache, 1344 * MCG list etc. 1345 */ 1346 static int 1347 ibd_acache_init(ibd_state_t *state) 1348 { 1349 ibd_ace_t *ce; 1350 int i; 1351 1352 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1353 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1354 1355 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1356 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1357 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1358 offsetof(ibd_ace_t, ac_list)); 1359 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1360 offsetof(ibd_ace_t, ac_list)); 1361 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1362 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1363 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1364 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1365 offsetof(ibd_mce_t, mc_list)); 1366 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1367 offsetof(ibd_mce_t, mc_list)); 1368 list_create(&state->id_req_list, sizeof (ibd_req_t), 1369 offsetof(ibd_req_t, rq_list)); 1370 1371 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1372 IBD_NUM_AH, KM_SLEEP); 1373 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1374 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1375 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1376 ibd_acache_fini(state); 1377 return (DDI_FAILURE); 1378 } else { 1379 CLEAR_REFCYCLE(ce); 1380 ce->ac_mce = NULL; 1381 IBD_ACACHE_INSERT_FREE(state, ce); 1382 } 1383 } 1384 return (DDI_SUCCESS); 1385 } 1386 1387 static void 1388 ibd_acache_fini(ibd_state_t *state) 1389 { 1390 ibd_ace_t *ptr; 1391 1392 mutex_enter(&state->id_ac_mutex); 1393 1394 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1395 ASSERT(GET_REF(ptr) == 0); 1396 (void) ibt_free_ud_dest(ptr->ac_dest); 1397 } 1398 1399 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1400 ASSERT(GET_REF(ptr) == 0); 1401 (void) ibt_free_ud_dest(ptr->ac_dest); 1402 } 1403 1404 list_destroy(&state->id_ah_free); 1405 list_destroy(&state->id_ah_active); 1406 list_destroy(&state->id_mc_full); 1407 list_destroy(&state->id_mc_non); 1408 list_destroy(&state->id_req_list); 1409 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1410 mutex_exit(&state->id_ac_mutex); 1411 mutex_destroy(&state->id_ac_mutex); 1412 mutex_destroy(&state->id_mc_mutex); 1413 mutex_destroy(&state->id_acache_req_lock); 1414 cv_destroy(&state->id_acache_req_cv); 1415 } 1416 1417 /* 1418 * Search AH active hash list for a cached path to input destination. 1419 * If we are "just looking", hold == F. When we are in the Tx path, 1420 * we set hold == T to grab a reference on the AH so that it can not 1421 * be recycled to a new destination while the Tx request is posted. 1422 */ 1423 static ibd_ace_t * 1424 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1425 { 1426 ibd_ace_t *ptr; 1427 1428 ASSERT(mutex_owned(&state->id_ac_mutex)); 1429 1430 /* 1431 * Do hash search. 
1432 */ 1433 if (mod_hash_find(state->id_ah_active_hash, 1434 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1435 if (hold) 1436 INC_REF(ptr, num); 1437 return (ptr); 1438 } 1439 return (NULL); 1440 } 1441 1442 /* 1443 * This is called by the tx side; if an initialized AH is found in 1444 * the active list, it is locked down and can be used; if no entry 1445 * is found, an async request is queued to do path resolution. 1446 */ 1447 static ibd_ace_t * 1448 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1449 { 1450 ibd_ace_t *ptr; 1451 ibd_req_t *req; 1452 1453 /* 1454 * Only attempt to print when we can; in the mdt pattr case, the 1455 * address is not aligned properly. 1456 */ 1457 if (((ulong_t)mac & 3) == 0) { 1458 DPRINT(4, 1459 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1460 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1461 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1462 htonl(mac->ipoib_gidsuff[1])); 1463 } 1464 1465 mutex_enter(&state->id_ac_mutex); 1466 1467 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1468 mutex_exit(&state->id_ac_mutex); 1469 return (ptr); 1470 } 1471 1472 /* 1473 * Implementation of a single outstanding async request; if 1474 * the operation is not started yet, queue a request and move 1475 * to ongoing state. Remember in id_ah_addr for which address 1476 * we are queueing the request, in case we need to flag an error; 1477 * Any further requests, for the same or different address, until 1478 * the operation completes, is sent back to GLDv3 to be retried. 1479 * The async thread will update id_ah_op with an error indication 1480 * or will set it to indicate the next look up can start; either 1481 * way, it will mac_tx_update() so that all blocked requests come 1482 * back here. 1483 */ 1484 *err = EAGAIN; 1485 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1486 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1487 if (req != NULL) { 1488 /* 1489 * We did not even find the entry; queue a request 1490 * for it. 1491 */ 1492 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1493 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1494 state->id_ah_op = IBD_OP_ONGOING; 1495 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1496 } 1497 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1498 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1499 /* 1500 * Check the status of the pathrecord lookup request 1501 * we had queued before. 1502 */ 1503 if (state->id_ah_op == IBD_OP_ERRORED) { 1504 *err = EFAULT; 1505 state->id_ah_error++; 1506 } else { 1507 /* 1508 * IBD_OP_ROUTERED case: We need to send to the 1509 * all-router MCG. If we can find the AH for 1510 * the mcg, the Tx will be attempted. If we 1511 * do not find the AH, we return NORESOURCES 1512 * to retry. 1513 */ 1514 ipoib_mac_t routermac; 1515 1516 (void) ibd_get_allroutergroup(state, mac, &routermac); 1517 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1518 numwqe); 1519 } 1520 state->id_ah_op = IBD_OP_NOTSTARTED; 1521 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1522 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1523 /* 1524 * This case can happen when we get a higher band 1525 * packet. The easiest way is to reset the state machine 1526 * to accommodate the higher priority packet. 
1527 */ 1528 state->id_ah_op = IBD_OP_NOTSTARTED; 1529 } 1530 mutex_exit(&state->id_ac_mutex); 1531 1532 return (ptr); 1533 } 1534 1535 /* 1536 * Grab a not-currently-in-use AH/PathRecord from the active 1537 * list to recycle to a new destination. Only the async thread 1538 * executes this code. 1539 */ 1540 static ibd_ace_t * 1541 ibd_acache_get_unref(ibd_state_t *state) 1542 { 1543 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1544 1545 ASSERT(mutex_owned(&state->id_ac_mutex)); 1546 1547 /* 1548 * Do plain linear search. 1549 */ 1550 while (ptr != NULL) { 1551 /* 1552 * Note that it is possible that the "cycle" bit 1553 * is set on the AH w/o any reference count. The 1554 * mcg must have been deleted, and the tx cleanup 1555 * just decremented the reference count to 0, but 1556 * hasn't gotten around to grabbing the id_ac_mutex 1557 * to move the AH into the free list. 1558 */ 1559 if (GET_REF(ptr) == 0) { 1560 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1561 break; 1562 } 1563 ptr = list_next(&state->id_ah_active, ptr); 1564 } 1565 return (ptr); 1566 } 1567 1568 /* 1569 * Invoked to clean up AH from active list in case of multicast 1570 * disable and to handle sendonly memberships during mcg traps. 1571 * And for port up processing for multicast and unicast AHs. 1572 * Normally, the AH is taken off the active list, and put into 1573 * the free list to be recycled for a new destination. In case 1574 * Tx requests on the AH have not completed yet, the AH is marked 1575 * for reaping (which will put the AH on the free list) once the Tx's 1576 * complete; in this case, depending on the "force" input, we take 1577 * out the AH from the active list right now, or leave it also for 1578 * the reap operation. Returns TRUE if the AH is taken off the active 1579 * list (and either put into the free list right now, or arranged for 1580 * later), FALSE otherwise. 1581 */ 1582 static boolean_t 1583 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1584 { 1585 ibd_ace_t *acactive; 1586 boolean_t ret = B_TRUE; 1587 1588 ASSERT(mutex_owned(&state->id_ac_mutex)); 1589 1590 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1591 1592 /* 1593 * Note that the AH might already have the cycle bit set 1594 * on it; this might happen if sequences of multicast 1595 * enables and disables are coming so fast, that posted 1596 * Tx's to the mcg have not completed yet, and the cycle 1597 * bit is set successively by each multicast disable. 1598 */ 1599 if (SET_CYCLE_IF_REF(acactive)) { 1600 if (!force) { 1601 /* 1602 * The ace is kept on the active list, further 1603 * Tx's can still grab a reference on it; the 1604 * ace is reaped when all pending Tx's 1605 * referencing the AH complete. 1606 */ 1607 ret = B_FALSE; 1608 } else { 1609 /* 1610 * In the mcg trap case, we always pull the 1611 * AH from the active list. And also the port 1612 * up multi/unicast case. 1613 */ 1614 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1615 acactive->ac_mce = NULL; 1616 } 1617 } else { 1618 /* 1619 * Determined the ref count is 0, thus reclaim 1620 * immediately after pulling out the ace from 1621 * the active list. 1622 */ 1623 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1624 acactive->ac_mce = NULL; 1625 IBD_ACACHE_INSERT_FREE(state, acactive); 1626 } 1627 1628 } 1629 return (ret); 1630 } 1631 1632 /* 1633 * Helper function for async path record lookup. If we are trying to 1634 * Tx to a MCG, check our membership, possibly trying to join the 1635 * group if required. 
If that fails, try to send the packet to the 1636 * all router group (indicated by the redirect output), pointing 1637 * the input mac address to the router mcg address. 1638 */ 1639 static ibd_mce_t * 1640 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1641 { 1642 ib_gid_t mgid; 1643 ibd_mce_t *mce; 1644 ipoib_mac_t routermac; 1645 1646 *redirect = B_FALSE; 1647 ibd_n2h_gid(mac, &mgid); 1648 1649 /* 1650 * Check the FullMember+SendOnlyNonMember list. 1651 * Since we are the only one who manipulates the 1652 * id_mc_full list, no locks are needed. 1653 */ 1654 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1655 if (mce != NULL) { 1656 DPRINT(4, "ibd_async_mcache : already joined to group"); 1657 return (mce); 1658 } 1659 1660 /* 1661 * Not found; try to join(SendOnlyNonMember) and attach. 1662 */ 1663 DPRINT(4, "ibd_async_mcache : not joined to group"); 1664 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1665 NULL) { 1666 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1667 return (mce); 1668 } 1669 1670 /* 1671 * MCGroup not present; try to join the all-router group. If 1672 * any of the following steps succeed, we will be redirecting 1673 * to the all router group. 1674 */ 1675 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1676 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1677 return (NULL); 1678 *redirect = B_TRUE; 1679 ibd_n2h_gid(&routermac, &mgid); 1680 bcopy(&routermac, mac, IPOIB_ADDRL); 1681 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1682 mgid.gid_prefix, mgid.gid_guid); 1683 1684 /* 1685 * Are we already joined to the router group? 1686 */ 1687 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1688 DPRINT(4, "ibd_async_mcache : using already joined router" 1689 "group\n"); 1690 return (mce); 1691 } 1692 1693 /* 1694 * Can we join(SendOnlyNonMember) the router group? 1695 */ 1696 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1697 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1698 NULL) { 1699 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1700 return (mce); 1701 } 1702 1703 return (NULL); 1704 } 1705 1706 /* 1707 * Async path record lookup code. 1708 */ 1709 static void 1710 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1711 { 1712 ibd_ace_t *ce; 1713 ibd_mce_t *mce = NULL; 1714 ibt_path_attr_t path_attr; 1715 ibt_path_info_t path_info; 1716 ib_gid_t destgid; 1717 char ret = IBD_OP_NOTSTARTED; 1718 1719 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1720 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1721 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1722 htonl(mac->ipoib_gidsuff[1])); 1723 1724 /* 1725 * Check whether we are trying to transmit to a MCG. 1726 * In that case, we need to make sure we are a member of 1727 * the MCG. 1728 */ 1729 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1730 boolean_t redirected; 1731 1732 /* 1733 * If we can not find or join the group or even 1734 * redirect, error out. 1735 */ 1736 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1737 NULL) { 1738 state->id_ah_op = IBD_OP_ERRORED; 1739 return; 1740 } 1741 1742 /* 1743 * If we got redirected, we need to determine whether 1744 * the AH for the new mcg is in the cache already, and 1745 * not pull it in then; otherwise proceed to get the 1746 * path for the new mcg. 
There is no guarantee that 1747 * if the AH is currently in the cache, it will still be 1748 * there when we look in ibd_acache_lookup(), but that's 1749 * okay, we will come back here. 1750 */ 1751 if (redirected) { 1752 ret = IBD_OP_ROUTERED; 1753 DPRINT(4, "ibd_async_acache : redirected to " 1754 "%08X:%08X:%08X:%08X:%08X", 1755 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1756 htonl(mac->ipoib_gidpref[1]), 1757 htonl(mac->ipoib_gidsuff[0]), 1758 htonl(mac->ipoib_gidsuff[1])); 1759 1760 mutex_enter(&state->id_ac_mutex); 1761 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1762 state->id_ah_op = IBD_OP_ROUTERED; 1763 mutex_exit(&state->id_ac_mutex); 1764 DPRINT(4, "ibd_async_acache : router AH found"); 1765 return; 1766 } 1767 mutex_exit(&state->id_ac_mutex); 1768 } 1769 } 1770 1771 /* 1772 * Get an AH from the free list. 1773 */ 1774 mutex_enter(&state->id_ac_mutex); 1775 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1776 /* 1777 * No free ones; try to grab an unreferenced active 1778 * one. Maybe we need to make the active list LRU, 1779 * but that will create more work for Tx callbacks. 1780 * Is there a way of not having to pull out the 1781 * entry from the active list, but just indicate it 1782 * is being recycled? Yes, but that creates one more 1783 * check in the fast lookup path. 1784 */ 1785 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1786 /* 1787 * Pretty serious shortage now. 1788 */ 1789 state->id_ah_op = IBD_OP_NOTSTARTED; 1790 mutex_exit(&state->id_ac_mutex); 1791 DPRINT(10, "ibd_async_acache : failed to find AH " 1792 "slot\n"); 1793 return; 1794 } 1795 /* 1796 * We could check whether ac_mce points to a SendOnly 1797 * member and drop that membership now. Or do it lazily 1798 * at detach time. 1799 */ 1800 ce->ac_mce = NULL; 1801 } 1802 mutex_exit(&state->id_ac_mutex); 1803 ASSERT(ce->ac_mce == NULL); 1804 1805 /* 1806 * Update the entry. 1807 */ 1808 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1809 1810 bzero(&path_info, sizeof (path_info)); 1811 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1812 path_attr.pa_sgid = state->id_sgid; 1813 path_attr.pa_num_dgids = 1; 1814 ibd_n2h_gid(&ce->ac_mac, &destgid); 1815 path_attr.pa_dgids = &destgid; 1816 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1817 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1818 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1819 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1820 goto error; 1821 } 1822 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1823 ntohl(ce->ac_mac.ipoib_qpn), 1824 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1825 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1826 goto error; 1827 } 1828 1829 /* 1830 * mce is set whenever an AH is being associated with a 1831 * MCG; this will come in handy when we leave the MCG. The 1832 * lock protects Tx fastpath from scanning the active list. 1833 */ 1834 if (mce != NULL) 1835 ce->ac_mce = mce; 1836 mutex_enter(&state->id_ac_mutex); 1837 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1838 state->id_ah_op = ret; 1839 mutex_exit(&state->id_ac_mutex); 1840 return; 1841 error: 1842 /* 1843 * We might want to drop SendOnly membership here if we 1844 * joined above. The lock protects Tx callbacks inserting 1845 * into the free list. 
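 * Either way, the unused ace goes back on the free list below and
 * id_ah_op is marked IBD_OP_ERRORED.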
1846 */ 1847 mutex_enter(&state->id_ac_mutex); 1848 state->id_ah_op = IBD_OP_ERRORED; 1849 IBD_ACACHE_INSERT_FREE(state, ce); 1850 mutex_exit(&state->id_ac_mutex); 1851 } 1852 1853 /* 1854 * While restoring port's presence on the subnet on a port up, it is possible 1855 * that the port goes down again. 1856 */ 1857 static void 1858 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1859 { 1860 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1861 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1862 LINK_STATE_UP; 1863 ibd_mce_t *mce, *pmce; 1864 ibd_ace_t *ace, *pace; 1865 1866 DPRINT(10, "ibd_async_link(): %d", opcode); 1867 1868 /* 1869 * On a link up, revalidate the link speed/width. No point doing 1870 * this on a link down, since we will be unable to do SA operations, 1871 * defaulting to the lowest speed. Also notice that we update our 1872 * notion of speed before calling mac_link_update(), which will do 1873 * necessary higher level notifications for speed changes. 1874 */ 1875 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1876 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1877 state->id_link_speed = ibd_get_portspeed(state); 1878 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1879 } 1880 1881 /* 1882 * Do all the work required to establish our presence on 1883 * the subnet. 1884 */ 1885 if (opcode == IBD_LINK_UP_ABSENT) { 1886 /* 1887 * If in promiscuous mode ... 1888 */ 1889 if (state->id_prom_op == IBD_OP_COMPLETED) { 1890 /* 1891 * Drop all nonmembership. 1892 */ 1893 ibd_async_unsetprom(state); 1894 1895 /* 1896 * Then, try to regain nonmembership to all mcg's. 1897 */ 1898 ibd_async_setprom(state); 1899 1900 } 1901 1902 /* 1903 * Drop all sendonly membership (which also gets rid of the 1904 * AHs); try to reacquire all full membership. 1905 */ 1906 mce = list_head(&state->id_mc_full); 1907 while ((pmce = mce) != NULL) { 1908 mce = list_next(&state->id_mc_full, mce); 1909 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1910 ibd_leave_group(state, 1911 pmce->mc_info.mc_adds_vect.av_dgid, 1912 IB_MC_JSTATE_SEND_ONLY_NON); 1913 else 1914 ibd_reacquire_group(state, pmce); 1915 } 1916 1917 /* 1918 * Recycle all active AHs to free list (and if there are 1919 * pending posts, make sure they will go into the free list 1920 * once the Tx's complete). Grab the lock to prevent 1921 * concurrent Tx's as well as Tx cleanups. 1922 */ 1923 mutex_enter(&state->id_ac_mutex); 1924 ace = list_head(&state->id_ah_active); 1925 while ((pace = ace) != NULL) { 1926 boolean_t cycled; 1927 1928 ace = list_next(&state->id_ah_active, ace); 1929 mce = pace->ac_mce; 1930 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1931 B_TRUE); 1932 /* 1933 * If this is for an mcg, it must be for a fullmember, 1934 * since we got rid of send-only members above when 1935 * processing the mce list. 1936 */ 1937 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1938 IB_MC_JSTATE_FULL))); 1939 1940 /* 1941 * Check if the fullmember mce needs to be torn down, 1942 * i.e., whether the DLPI disable has already been done. 1943 * If so, do some of the work of tx_cleanup, namely 1944 * causing leave (which will fail), detach and 1945 * mce-freeing. tx_cleanup will put the AH into free 1946 * list. The reason to duplicate some of this 1947 * tx_cleanup work is because we want to delete the 1948 * AH right now instead of waiting for tx_cleanup, to 1949 * force subsequent Tx's to reacquire an AH.
1950 */ 1951 if ((mce != NULL) && (mce->mc_fullreap)) 1952 ibd_async_reap_group(state, mce, 1953 mce->mc_info.mc_adds_vect.av_dgid, 1954 mce->mc_jstate); 1955 } 1956 mutex_exit(&state->id_ac_mutex); 1957 } 1958 1959 /* 1960 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1961 * (which stops further events from being delivered) before 1962 * mac_unregister(). At this point, it is guaranteed that mac_register 1963 * has already been done. 1964 */ 1965 mutex_enter(&state->id_link_mutex); 1966 state->id_link_state = lstate; 1967 mac_link_update(state->id_mh, lstate); 1968 mutex_exit(&state->id_link_mutex); 1969 1970 ibd_async_done(state); 1971 } 1972 1973 /* 1974 * Check the pkey table to see if we can find the pkey we're looking for. 1975 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1976 * failure. 1977 */ 1978 static int 1979 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1980 uint16_t *pkix) 1981 { 1982 uint16_t ndx; 1983 1984 ASSERT(pkix != NULL); 1985 1986 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1987 if (pkey_tbl[ndx] == pkey) { 1988 *pkix = ndx; 1989 return (0); 1990 } 1991 } 1992 return (-1); 1993 } 1994 1995 /* 1996 * When the link is notified up, we need to do a few things, based 1997 * on the port's current p_init_type_reply claiming a reinit has been 1998 * done or not. The reinit steps are: 1999 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2000 * the old Pkey and GID0 are correct. 2001 * 2. Register for mcg traps (already done by ibmf). 2002 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2003 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2004 * 4. Give up all sendonly memberships. 2005 * 5. Acquire all full memberships. 2006 * 6. In promiscuous mode, acquire all non memberships. 2007 * 7. Recycle all AHs to free list. 2008 */ 2009 static void 2010 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2011 { 2012 ibt_hca_portinfo_t *port_infop = NULL; 2013 ibt_status_t ibt_status; 2014 uint_t psize, port_infosz; 2015 ibd_link_op_t opcode; 2016 ibd_req_t *req; 2017 link_state_t new_link_state = LINK_STATE_UP; 2018 uint8_t itreply; 2019 uint16_t pkix; 2020 int ret; 2021 2022 /* 2023 * Let's not race with a plumb or an unplumb; if we detect a 2024 * pkey relocation event later on here, we may have to restart. 2025 */ 2026 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2027 2028 mutex_enter(&state->id_link_mutex); 2029 2030 /* 2031 * If the init code in ibd_m_start hasn't yet set up the 2032 * pkey/gid, nothing to do; that code will set the link state. 2033 */ 2034 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2035 mutex_exit(&state->id_link_mutex); 2036 goto link_mod_return; 2037 } 2038 2039 /* 2040 * If this routine was called in response to a port down event, 2041 * we just need to see if this should be informed. 2042 */ 2043 if (code == IBT_ERROR_PORT_DOWN) { 2044 new_link_state = LINK_STATE_DOWN; 2045 goto update_link_state; 2046 } 2047 2048 /* 2049 * If it's not a port down event we've received, try to get the port 2050 * attributes first. If we fail here, the port is as good as down. 2051 * Otherwise, if the link went down by the time the handler gets 2052 * here, give up - we cannot even validate the pkey/gid since those 2053 * are not valid and this is as bad as a port down anyway. 
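 * Either way, the net effect of the check below is the same: anything
 * other than a successful query of this port, with p_linkstate
 * reported as IBT_PORT_ACTIVE, is treated as a link down.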
2054 */ 2055 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2056 &port_infop, &psize, &port_infosz); 2057 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2058 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2059 new_link_state = LINK_STATE_DOWN; 2060 goto update_link_state; 2061 } 2062 2063 /* 2064 * Check the SM InitTypeReply flags. If both NoLoadReply and 2065 * PreserveContentReply are 0, we don't know anything about the 2066 * data loaded into the port attributes, so we need to verify 2067 * if gid0 and pkey are still valid. 2068 */ 2069 itreply = port_infop->p_init_type_reply; 2070 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2071 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2072 /* 2073 * Check to see if the subnet part of GID0 has changed. If 2074 * not, check the simple case first to see if the pkey 2075 * index is the same as before; finally check to see if the 2076 * pkey has been relocated to a different index in the table. 2077 */ 2078 if (bcmp(port_infop->p_sgid_tbl, 2079 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2080 2081 new_link_state = LINK_STATE_DOWN; 2082 2083 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2084 state->id_pkey) { 2085 2086 new_link_state = LINK_STATE_UP; 2087 2088 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2089 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2090 2091 ibt_free_portinfo(port_infop, port_infosz); 2092 mutex_exit(&state->id_link_mutex); 2093 2094 /* 2095 * Currently a restart is required if our pkey has moved 2096 * in the pkey table. If we get the ibt_recycle_ud() to 2097 * work as documented (expected), we may be able to 2098 * avoid a complete restart. Note that we've already 2099 * marked both the start and stop 'in-progress' flags, 2100 * so it is ok to go ahead and do this restart. 2101 */ 2102 ibd_undo_start(state, LINK_STATE_DOWN); 2103 if ((ret = ibd_start(state)) != 0) { 2104 DPRINT(10, "ibd_restart: cannot restart, " 2105 "ret=%d", ret); 2106 } 2107 2108 goto link_mod_return; 2109 } else { 2110 new_link_state = LINK_STATE_DOWN; 2111 } 2112 } 2113 2114 update_link_state: 2115 if (port_infop) { 2116 ibt_free_portinfo(port_infop, port_infosz); 2117 } 2118 2119 /* 2120 * If the old state is the same as the new state, nothing to do 2121 */ 2122 if (state->id_link_state == new_link_state) { 2123 mutex_exit(&state->id_link_mutex); 2124 goto link_mod_return; 2125 } 2126 2127 /* 2128 * Ok, so there was a link state change; see if it's safe to ask 2129 * the async thread to do the work 2130 */ 2131 if (!ibd_async_safe(state)) { 2132 state->id_link_state = new_link_state; 2133 mutex_exit(&state->id_link_mutex); 2134 goto link_mod_return; 2135 } 2136 2137 mutex_exit(&state->id_link_mutex); 2138 2139 /* 2140 * If we're reporting a link up, check InitTypeReply to see if 2141 * the SM has ensured that the port's presence in mcg, traps, 2142 * etc. is intact. 
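 * For the link-up case, the decision below reduces to a single flag
 * test (a condensed sketch of the code that follows):
 *
 *	if (itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
 *		opcode = IBD_LINK_UP;		(presence preserved by SM)
 *	else
 *		opcode = IBD_LINK_UP_ABSENT;	(rejoin/recycle on async daemon)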
2143 */ 2144 if (new_link_state == LINK_STATE_DOWN) { 2145 opcode = IBD_LINK_DOWN; 2146 } else { 2147 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2148 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2149 opcode = IBD_LINK_UP; 2150 } else { 2151 opcode = IBD_LINK_UP_ABSENT; 2152 } 2153 } 2154 2155 /* 2156 * Queue up a request for ibd_async_link() to handle this link 2157 * state change event 2158 */ 2159 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2160 req->rq_ptr = (void *)opcode; 2161 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2162 2163 link_mod_return: 2164 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2165 } 2166 2167 /* 2168 * For the port up/down events, IBTL guarantees there will not be concurrent 2169 * invocations of the handler. IBTL might coalesce link transition events, 2170 * and not invoke the handler for _each_ up/down transition, but it will 2171 * invoke the handler with last known state 2172 */ 2173 static void 2174 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2175 ibt_async_code_t code, ibt_async_event_t *event) 2176 { 2177 ibd_state_t *state = (ibd_state_t *)clnt_private; 2178 2179 switch (code) { 2180 case IBT_ERROR_CATASTROPHIC_CHAN: 2181 ibd_print_warn(state, "catastrophic channel error"); 2182 break; 2183 case IBT_ERROR_CQ: 2184 ibd_print_warn(state, "completion queue error"); 2185 break; 2186 case IBT_PORT_CHANGE_EVENT: 2187 /* 2188 * Events will be delivered to all instances that have 2189 * done ibt_open_hca() but not yet done ibt_close_hca(). 2190 * Only need to do work for our port; IBTF will deliver 2191 * events for other ports on the hca we have ibt_open_hca'ed 2192 * too. Note that id_port is initialized in ibd_attach() 2193 * before we do an ibt_open_hca() in ibd_attach(). 2194 */ 2195 ASSERT(state->id_hca_hdl == hca_hdl); 2196 if (state->id_port != event->ev_port) 2197 break; 2198 2199 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2200 IBT_PORT_CHANGE_PKEY) { 2201 ibd_link_mod(state, code); 2202 } 2203 break; 2204 case IBT_ERROR_PORT_DOWN: 2205 case IBT_CLNT_REREG_EVENT: 2206 case IBT_EVENT_PORT_UP: 2207 /* 2208 * Events will be delivered to all instances that have 2209 * done ibt_open_hca() but not yet done ibt_close_hca(). 2210 * Only need to do work for our port; IBTF will deliver 2211 * events for other ports on the hca we have ibt_open_hca'ed 2212 * too. Note that id_port is initialized in ibd_attach() 2213 * before we do an ibt_open_hca() in ibd_attach(). 2214 */ 2215 ASSERT(state->id_hca_hdl == hca_hdl); 2216 if (state->id_port != event->ev_port) 2217 break; 2218 2219 ibd_link_mod(state, code); 2220 break; 2221 2222 case IBT_HCA_ATTACH_EVENT: 2223 case IBT_HCA_DETACH_EVENT: 2224 /* 2225 * When a new card is plugged to the system, attach_event is 2226 * invoked. Additionally, a cfgadm needs to be run to make the 2227 * card known to the system, and an ifconfig needs to be run to 2228 * plumb up any ibd interfaces on the card. In the case of card 2229 * unplug, a cfgadm is run that will trigger any RCM scripts to 2230 * unplumb the ibd interfaces on the card; when the card is 2231 * actually unplugged, the detach_event is invoked; 2232 * additionally, if any ibd instances are still active on the 2233 * card (eg there were no associated RCM scripts), driver's 2234 * detach routine is invoked. 
2235 */ 2236 break; 2237 default: 2238 break; 2239 } 2240 } 2241 2242 static int 2243 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2244 { 2245 mac_register_t *macp; 2246 int ret; 2247 2248 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2249 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2250 return (DDI_FAILURE); 2251 } 2252 2253 /* 2254 * Note that when we register with mac during attach, we don't 2255 * have the id_macaddr yet, so we'll simply be registering a 2256 * zero macaddr that we'll overwrite later during plumb (in 2257 * ibd_m_start()). Similar is the case with id_mtu - we'll 2258 * update the mac layer with the correct mtu during plumb. 2259 */ 2260 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2261 macp->m_driver = state; 2262 macp->m_dip = dip; 2263 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2264 macp->m_callbacks = &ibd_m_callbacks; 2265 macp->m_min_sdu = 0; 2266 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2267 2268 /* 2269 * Register ourselves with the GLDv3 interface 2270 */ 2271 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2272 mac_free(macp); 2273 DPRINT(10, 2274 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2275 return (DDI_FAILURE); 2276 } 2277 2278 mac_free(macp); 2279 return (DDI_SUCCESS); 2280 } 2281 2282 static int 2283 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2284 { 2285 ibt_hca_attr_t hca_attrs; 2286 ibt_status_t ibt_status; 2287 2288 /* 2289 * Query the HCA and fetch its attributes 2290 */ 2291 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2292 ASSERT(ibt_status == IBT_SUCCESS); 2293 2294 /* 2295 * 1. Set the Hardware Checksum capability. Currently we only consider 2296 * full checksum offload. 2297 */ 2298 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2299 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2300 } 2301 2302 /* 2303 * 2. Set LSO policy, capability and maximum length 2304 */ 2305 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2306 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2307 state->id_lso_policy = B_TRUE; 2308 } else { 2309 state->id_lso_policy = B_FALSE; 2310 } 2311 2312 /* 2313 * Work-around for Bug 6866957. Ignore policy from ibd.conf. 2314 * Turn off LSO forcibly. Remove it when the work-around is no longer 2315 * needed. 2316 */ 2317 if (ibd_force_lso_disable) { 2318 state->id_lso_policy = B_FALSE; 2319 } 2320 /* End of Workaround */ 2321 2322 if (hca_attrs.hca_max_lso_size > 0) { 2323 state->id_lso_capable = B_TRUE; 2324 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2325 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2326 else 2327 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2328 } else { 2329 state->id_lso_capable = B_FALSE; 2330 state->id_lso_maxlen = 0; 2331 } 2332 2333 /* 2334 * 3. Set Reserved L_Key capability 2335 */ 2336 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2337 state->id_hca_res_lkey_capab = 1; 2338 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2339 } 2340 2341 /* 2342 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2343 * size information is provided by the hca 2344 */ 2345 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2346 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2347 } else { 2348 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2349 } 2350 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2351 state->id_max_sqseg = IBD_MAX_SQSEG; 2352 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2353 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2354 state->id_max_sqseg, IBD_MAX_SQSEG); 2355 } 2356 2357 /* 2358 * 5. Set number of recv and send wqes after checking hca maximum 2359 * channel size 2360 */ 2361 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2362 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2363 } else { 2364 state->id_num_rwqe = IBD_NUM_RWQE; 2365 } 2366 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2367 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2368 } else { 2369 state->id_num_swqe = IBD_NUM_SWQE; 2370 } 2371 2372 return (DDI_SUCCESS); 2373 } 2374 2375 static int 2376 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2377 { 2378 int instance; 2379 uint32_t progress = state->id_mac_state; 2380 ibt_status_t ret; 2381 2382 if (progress & IBD_DRV_MAC_REGISTERED) { 2383 (void) mac_unregister(state->id_mh); 2384 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2385 } 2386 2387 if (progress & IBD_DRV_PD_ALLOCD) { 2388 if ((ret = ibt_free_pd(state->id_hca_hdl, 2389 state->id_pd_hdl)) != IBT_SUCCESS) { 2390 ibd_print_warn(state, "failed to free " 2391 "protection domain, ret=%d", ret); 2392 } 2393 state->id_pd_hdl = NULL; 2394 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2395 } 2396 2397 if (progress & IBD_DRV_HCA_OPENED) { 2398 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2399 IBT_SUCCESS) { 2400 ibd_print_warn(state, "failed to close " 2401 "HCA device, ret=%d", ret); 2402 } 2403 state->id_hca_hdl = NULL; 2404 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2405 } 2406 2407 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2408 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2409 ibd_print_warn(state, 2410 "ibt_detach() failed, ret=%d", ret); 2411 } 2412 state->id_ibt_hdl = NULL; 2413 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2414 } 2415 2416 if (progress & IBD_DRV_TXINTR_ADDED) { 2417 ddi_remove_softintr(state->id_tx); 2418 state->id_tx = NULL; 2419 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2420 } 2421 2422 if (progress & IBD_DRV_RXINTR_ADDED) { 2423 ddi_remove_softintr(state->id_rx); 2424 state->id_rx = NULL; 2425 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2426 } 2427 2428 if (progress & IBD_DRV_STATE_INITIALIZED) { 2429 ibd_state_fini(state); 2430 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2431 } 2432 2433 instance = ddi_get_instance(dip); 2434 ddi_soft_state_free(ibd_list, instance); 2435 2436 return (DDI_SUCCESS); 2437 } 2438 2439 /* 2440 * Attach device to the IO framework. 
2441 */ 2442 static int 2443 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2444 { 2445 ibd_state_t *state = NULL; 2446 ib_guid_t hca_guid; 2447 int instance; 2448 ibt_status_t ret; 2449 int rv; 2450 2451 /* 2452 * IBD doesn't support suspend/resume 2453 */ 2454 if (cmd != DDI_ATTACH) 2455 return (DDI_FAILURE); 2456 2457 /* 2458 * Allocate softstate structure 2459 */ 2460 instance = ddi_get_instance(dip); 2461 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2462 return (DDI_FAILURE); 2463 state = ddi_get_soft_state(ibd_list, instance); 2464 2465 /* 2466 * Initialize mutexes and condition variables 2467 */ 2468 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2469 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2470 goto attach_fail; 2471 } 2472 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2473 2474 /* 2475 * Allocate rx,tx softintr 2476 */ 2477 if (ibd_rx_softintr == 1) { 2478 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2479 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2480 DPRINT(10, "ibd_attach: failed in " 2481 "ddi_add_softintr(id_rx), ret=%d", rv); 2482 goto attach_fail; 2483 } 2484 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2485 } 2486 if (ibd_tx_softintr == 1) { 2487 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2488 NULL, NULL, ibd_tx_recycle, 2489 (caddr_t)state)) != DDI_SUCCESS) { 2490 DPRINT(10, "ibd_attach: failed in " 2491 "ddi_add_softintr(id_tx), ret=%d", rv); 2492 goto attach_fail; 2493 } 2494 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2495 } 2496 2497 /* 2498 * Obtain IBA P_Key, port number and HCA guid and validate 2499 * them (for P_Key, only full members are allowed as per 2500 * IPoIB specification; neither port number nor HCA guid 2501 * can be zero) 2502 */ 2503 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2504 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2505 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2506 state->id_pkey); 2507 goto attach_fail; 2508 } 2509 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2510 "port-number", 0)) == 0) { 2511 DPRINT(10, "ibd_attach: invalid port number (%d)", 2512 state->id_port); 2513 goto attach_fail; 2514 } 2515 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2516 "hca-guid", 0)) == 0) { 2517 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2518 hca_guid); 2519 goto attach_fail; 2520 } 2521 2522 /* 2523 * Attach to IBTL 2524 */ 2525 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2526 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2527 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2528 goto attach_fail; 2529 } 2530 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2531 2532 /* 2533 * Open the HCA 2534 */ 2535 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2536 &state->id_hca_hdl)) != IBT_SUCCESS) { 2537 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2538 goto attach_fail; 2539 } 2540 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2541 2542 /* 2543 * Record capabilities 2544 */ 2545 (void) ibd_record_capab(state, dip); 2546 2547 /* 2548 * Allocate a protection domain on the HCA 2549 */ 2550 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2551 &state->id_pd_hdl)) != IBT_SUCCESS) { 2552 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2553 goto attach_fail; 2554 } 2555 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2556 2557 2558 /* 2559 * Register ibd interfaces with the Nemo framework 2560 */ 2561 if 
(ibd_register_mac(state, dip) != IBT_SUCCESS) { 2562 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2563 goto attach_fail; 2564 } 2565 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2566 2567 /* 2568 * We're done with everything we could to make the attach 2569 * succeed. All the buffer allocations and IPoIB broadcast 2570 * group joins are deferred to when the interface instance 2571 * is actually plumbed to avoid wasting memory. 2572 */ 2573 return (DDI_SUCCESS); 2574 2575 attach_fail: 2576 (void) ibd_unattach(state, dip); 2577 return (DDI_FAILURE); 2578 } 2579 2580 /* 2581 * Detach device from the IO framework. 2582 */ 2583 static int 2584 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2585 { 2586 ibd_state_t *state; 2587 int instance; 2588 2589 /* 2590 * IBD doesn't support suspend/resume 2591 */ 2592 if (cmd != DDI_DETACH) 2593 return (DDI_FAILURE); 2594 2595 /* 2596 * Get the instance softstate 2597 */ 2598 instance = ddi_get_instance(dip); 2599 state = ddi_get_soft_state(ibd_list, instance); 2600 2601 /* 2602 * Release all resources we're holding still. Note that if we'd 2603 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2604 * so far, we should find all the flags we need in id_mac_state. 2605 */ 2606 (void) ibd_unattach(state, dip); 2607 2608 return (DDI_SUCCESS); 2609 } 2610 2611 /* 2612 * Pre ibt_attach() driver initialization 2613 */ 2614 static int 2615 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2616 { 2617 char buf[64]; 2618 2619 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2620 state->id_link_state = LINK_STATE_UNKNOWN; 2621 2622 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2623 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2624 state->id_trap_stop = B_TRUE; 2625 state->id_trap_inprog = 0; 2626 2627 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2628 state->id_dip = dip; 2629 2630 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2631 2632 state->id_tx_list.dl_head = NULL; 2633 state->id_tx_list.dl_tail = NULL; 2634 state->id_tx_list.dl_pending_sends = B_FALSE; 2635 state->id_tx_list.dl_cnt = 0; 2636 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2637 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2638 state->id_tx_busy = 0; 2639 2640 state->id_rx_list.dl_head = NULL; 2641 state->id_rx_list.dl_tail = NULL; 2642 state->id_rx_list.dl_bufs_outstanding = 0; 2643 state->id_rx_list.dl_cnt = 0; 2644 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2645 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2646 2647 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2648 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2649 0, NULL, NULL, NULL, NULL, NULL, 0); 2650 2651 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2652 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2653 2654 return (DDI_SUCCESS); 2655 } 2656 2657 /* 2658 * Post ibt_detach() driver deconstruction 2659 */ 2660 static void 2661 ibd_state_fini(ibd_state_t *state) 2662 { 2663 cv_destroy(&state->id_macst_cv); 2664 mutex_destroy(&state->id_macst_lock); 2665 2666 kmem_cache_destroy(state->id_req_kmc); 2667 2668 mutex_destroy(&state->id_rxpost_lock); 2669 mutex_destroy(&state->id_rx_list.dl_mutex); 2670 2671 mutex_destroy(&state->id_txpost_lock); 2672 mutex_destroy(&state->id_tx_list.dl_mutex); 2673 2674 mutex_destroy(&state->id_sched_lock); 2675 mutex_destroy(&state->id_cq_poll_lock); 2676 2677 
cv_destroy(&state->id_trap_cv); 2678 mutex_destroy(&state->id_trap_lock); 2679 mutex_destroy(&state->id_link_mutex); 2680 } 2681 2682 /* 2683 * Fetch link speed from SA for snmp ifspeed reporting. 2684 */ 2685 static uint64_t 2686 ibd_get_portspeed(ibd_state_t *state) 2687 { 2688 int ret; 2689 ibt_path_info_t path; 2690 ibt_path_attr_t path_attr; 2691 uint8_t num_paths; 2692 uint64_t ifspeed; 2693 2694 /* 2695 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2696 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2697 * 2000000000. Start with that as default. 2698 */ 2699 ifspeed = 2000000000; 2700 2701 bzero(&path_attr, sizeof (path_attr)); 2702 2703 /* 2704 * Get the port speed from Loopback path information. 2705 */ 2706 path_attr.pa_dgids = &state->id_sgid; 2707 path_attr.pa_num_dgids = 1; 2708 path_attr.pa_sgid = state->id_sgid; 2709 2710 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2711 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2712 goto earlydone; 2713 2714 if (num_paths < 1) 2715 goto earlydone; 2716 2717 /* 2718 * In case SA does not return an expected value, report the default 2719 * speed as 1X. 2720 */ 2721 ret = 1; 2722 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2723 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2724 ret = 1; 2725 break; 2726 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2727 ret = 4; 2728 break; 2729 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2730 ret = 12; 2731 break; 2732 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2733 ret = 2; 2734 break; 2735 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2736 ret = 8; 2737 break; 2738 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2739 ret = 16; 2740 break; 2741 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2742 ret = 24; 2743 break; 2744 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2745 ret = 32; 2746 break; 2747 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2748 ret = 48; 2749 break; 2750 } 2751 2752 ifspeed *= ret; 2753 2754 earlydone: 2755 return (ifspeed); 2756 } 2757 2758 /* 2759 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2760 * representing the input mcg mgid. 2761 */ 2762 static ibd_mce_t * 2763 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2764 { 2765 ibd_mce_t *ptr = list_head(mlist); 2766 2767 /* 2768 * Do plain linear search. 2769 */ 2770 while (ptr != NULL) { 2771 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2772 sizeof (ib_gid_t)) == 0) 2773 return (ptr); 2774 ptr = list_next(mlist, ptr); 2775 } 2776 return (NULL); 2777 } 2778 2779 /* 2780 * Execute IBA JOIN. 2781 */ 2782 static ibt_status_t 2783 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2784 { 2785 ibt_mcg_attr_t mcg_attr; 2786 2787 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2788 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2789 mcg_attr.mc_mgid = mgid; 2790 mcg_attr.mc_join_state = mce->mc_jstate; 2791 mcg_attr.mc_scope = state->id_scope; 2792 mcg_attr.mc_pkey = state->id_pkey; 2793 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2794 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2795 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2796 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2797 NULL, NULL)); 2798 } 2799 2800 /* 2801 * This code JOINs the port in the proper way (depending on the join 2802 * state) so that IBA fabric will forward mcg packets to/from the port. 2803 * It also attaches the QPN to the mcg so it can receive those mcg 2804 * packets. 
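 * (The join itself is a subnet administration operation, issued via
 * ibt_join_mcg() from ibd_iba_join(); the attach is a separate verbs
 * operation, ibt_attach_mcg(), on the UD channel.)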
This code makes sure not to attach the mcg to the QP if 2805 * that has been previously done due to the mcg being joined with a 2806 * different join state, even though this is not required by SWG_0216, 2807 * refid 3610. 2808 */ 2809 static ibd_mce_t * 2810 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2811 { 2812 ibt_status_t ibt_status; 2813 ibd_mce_t *mce, *tmce, *omce = NULL; 2814 boolean_t do_attach = B_TRUE; 2815 2816 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2817 jstate, mgid.gid_prefix, mgid.gid_guid); 2818 2819 /* 2820 * For enable_multicast Full member joins, we need to do some 2821 * extra work. If there is already an mce on the list that 2822 * indicates full membership, that means the membership has 2823 * not yet been dropped (since the disable_multicast was issued) 2824 * because there are pending Tx's to the mcg; in that case, just 2825 * mark the mce not to be reaped when the Tx completion queues 2826 * an async reap operation. 2827 * 2828 * If there is already an mce on the list indicating sendonly 2829 * membership, try to promote to full membership. Be careful 2830 * not to deallocate the old mce, since there might be an AH 2831 * pointing to it; instead, update the old mce with new data 2832 * that tracks the full membership. 2833 */ 2834 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2835 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2836 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2837 ASSERT(omce->mc_fullreap); 2838 omce->mc_fullreap = B_FALSE; 2839 return (omce); 2840 } else { 2841 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2842 } 2843 } 2844 2845 /* 2846 * Allocate the ibd_mce_t to track this JOIN. 2847 */ 2848 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2849 mce->mc_fullreap = B_FALSE; 2850 mce->mc_jstate = jstate; 2851 2852 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2853 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2854 ibt_status); 2855 kmem_free(mce, sizeof (ibd_mce_t)); 2856 return (NULL); 2857 } 2858 2859 /* 2860 * Is an IBA attach required? Not if the interface is already joined 2861 * to the mcg in a different appropriate join state. 2862 */ 2863 if (jstate == IB_MC_JSTATE_NON) { 2864 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2865 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2866 do_attach = B_FALSE; 2867 } else if (jstate == IB_MC_JSTATE_FULL) { 2868 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2869 do_attach = B_FALSE; 2870 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2871 do_attach = B_FALSE; 2872 } 2873 2874 if (do_attach) { 2875 /* 2876 * Do the IBA attach. 2877 */ 2878 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2879 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2880 &mce->mc_info)) != IBT_SUCCESS) { 2881 DPRINT(10, "ibd_join_group : failed qp attachment " 2882 "%d\n", ibt_status); 2883 /* 2884 * NOTE that we should probably preserve the join info 2885 * in the list and later try to leave again at detach 2886 * time. 2887 */ 2888 (void) ibt_leave_mcg(state->id_sgid, mgid, 2889 state->id_sgid, jstate); 2890 kmem_free(mce, sizeof (ibd_mce_t)); 2891 return (NULL); 2892 } 2893 } 2894 2895 /* 2896 * Insert the ibd_mce_t in the proper list. 2897 */ 2898 if (jstate == IB_MC_JSTATE_NON) { 2899 IBD_MCACHE_INSERT_NON(state, mce); 2900 } else { 2901 /* 2902 * Set up the mc_req fields used for reaping the 2903 * mcg in case of delayed tx completion (see 2904 * ibd_tx_cleanup()). 
Also done for sendonly join in 2905 * case we are promoted to fullmembership later and 2906 * keep using the same mce. 2907 */ 2908 mce->mc_req.rq_gid = mgid; 2909 mce->mc_req.rq_ptr = mce; 2910 /* 2911 * Check whether this is the case of trying to join 2912 * full member, and we were already joined send only. 2913 * We try to drop our SendOnly membership, but it is 2914 * possible that the mcg does not exist anymore (and 2915 * the subnet trap never reached us), so the leave 2916 * operation might fail. 2917 */ 2918 if (omce != NULL) { 2919 (void) ibt_leave_mcg(state->id_sgid, mgid, 2920 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2921 omce->mc_jstate = IB_MC_JSTATE_FULL; 2922 bcopy(&mce->mc_info, &omce->mc_info, 2923 sizeof (ibt_mcg_info_t)); 2924 kmem_free(mce, sizeof (ibd_mce_t)); 2925 return (omce); 2926 } 2927 mutex_enter(&state->id_mc_mutex); 2928 IBD_MCACHE_INSERT_FULL(state, mce); 2929 mutex_exit(&state->id_mc_mutex); 2930 } 2931 2932 return (mce); 2933 } 2934 2935 /* 2936 * Called during port up event handling to attempt to reacquire full 2937 * membership to an mcg. Stripped down version of ibd_join_group(). 2938 * Note that it is possible that the mcg might have gone away, and 2939 * gets recreated at this point. 2940 */ 2941 static void 2942 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2943 { 2944 ib_gid_t mgid; 2945 2946 /* 2947 * If the mc_fullreap flag is set, or this join fails, a subsequent 2948 * reap/leave is going to try to leave the group. We could prevent 2949 * that by adding a boolean flag into ibd_mce_t, if required. 2950 */ 2951 if (mce->mc_fullreap) 2952 return; 2953 2954 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2955 2956 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2957 mgid.gid_guid); 2958 2959 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2960 ibd_print_warn(state, "Failure on port up to rejoin " 2961 "multicast gid %016llx:%016llx", 2962 (u_longlong_t)mgid.gid_prefix, 2963 (u_longlong_t)mgid.gid_guid); 2964 } 2965 2966 /* 2967 * This code handles delayed Tx completion cleanups for mcg's to which 2968 * disable_multicast has been issued, regular mcg related cleanups during 2969 * disable_multicast, disable_promiscuous and mcg traps, as well as 2970 * cleanups during driver detach time. Depending on the join state, 2971 * it deletes the mce from the appropriate list and issues the IBA 2972 * leave/detach; except in the disable_multicast case when the mce 2973 * is left on the active list for a subsequent Tx completion cleanup. 2974 */ 2975 static void 2976 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2977 uint8_t jstate) 2978 { 2979 ibd_mce_t *tmce; 2980 boolean_t do_detach = B_TRUE; 2981 2982 /* 2983 * Before detaching, we must check whether the other list 2984 * contains the mcg; if we detach blindly, the consumer 2985 * who set up the other list will also stop receiving 2986 * traffic. 2987 */ 2988 if (jstate == IB_MC_JSTATE_FULL) { 2989 /* 2990 * The following check is only relevant while coming 2991 * from the Tx completion path in the reap case.
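 * If mc_fullreap was never set, or was cleared again because
 * ibd_join_group() re-enabled the full membership before the
 * pending Tx's drained, the deferred reap request is stale and
 * we simply return.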
2992 */ 2993 if (!mce->mc_fullreap) 2994 return; 2995 mutex_enter(&state->id_mc_mutex); 2996 IBD_MCACHE_PULLOUT_FULL(state, mce); 2997 mutex_exit(&state->id_mc_mutex); 2998 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2999 do_detach = B_FALSE; 3000 } else if (jstate == IB_MC_JSTATE_NON) { 3001 IBD_MCACHE_PULLOUT_NON(state, mce); 3002 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3003 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3004 do_detach = B_FALSE; 3005 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3006 mutex_enter(&state->id_mc_mutex); 3007 IBD_MCACHE_PULLOUT_FULL(state, mce); 3008 mutex_exit(&state->id_mc_mutex); 3009 do_detach = B_FALSE; 3010 } 3011 3012 /* 3013 * If we are reacting to a mcg trap and leaving our sendonly or 3014 * non membership, the mcg is possibly already gone, so attempting 3015 * to leave might fail. On the other hand, we must try to leave 3016 * anyway, since this might be a trap from long ago, and we could 3017 * have potentially sendonly joined to a recent incarnation of 3018 * the mcg and are about to lose track of this information. 3019 */ 3020 if (do_detach) { 3021 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3022 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3023 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3024 } 3025 3026 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3027 kmem_free(mce, sizeof (ibd_mce_t)); 3028 } 3029 3030 /* 3031 * Async code executed due to multicast and promiscuous disable requests 3032 * and mcg trap handling; also executed during driver detach. Mostly, a 3033 * leave and detach is done; except for the fullmember case when Tx 3034 * requests are pending, whence arrangements are made for subsequent 3035 * cleanup on Tx completion. 3036 */ 3037 static void 3038 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3039 { 3040 ipoib_mac_t mcmac; 3041 boolean_t recycled; 3042 ibd_mce_t *mce; 3043 3044 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3045 jstate, mgid.gid_prefix, mgid.gid_guid); 3046 3047 if (jstate == IB_MC_JSTATE_NON) { 3048 recycled = B_TRUE; 3049 mce = IBD_MCACHE_FIND_NON(state, mgid); 3050 /* 3051 * In case we are handling a mcg trap, we might not find 3052 * the mcg in the non list. 3053 */ 3054 if (mce == NULL) { 3055 return; 3056 } 3057 } else { 3058 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3059 3060 /* 3061 * In case we are handling a mcg trap, make sure the trap 3062 * is not arriving late; if we have an mce that indicates 3063 * that we are already a fullmember, that would be a clear 3064 * indication that the trap arrived late (ie, is for a 3065 * previous incarnation of the mcg). 3066 */ 3067 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3068 if ((mce == NULL) || (mce->mc_jstate == 3069 IB_MC_JSTATE_FULL)) { 3070 return; 3071 } 3072 } else { 3073 ASSERT(jstate == IB_MC_JSTATE_FULL); 3074 3075 /* 3076 * If join group failed, mce will be NULL here. 3077 * This is because in a GLDv3 driver, set multicast 3078 * will always return success. 3079 */ 3080 if (mce == NULL) { 3081 return; 3082 } 3083 3084 mce->mc_fullreap = B_TRUE; 3085 } 3086 3087 /* 3088 * If no pending Tx's remain that reference the AH 3089 * for the mcg, recycle it from active to free list.
3090 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3091 * so the last completing Tx will cause an async reap 3092 * operation to be invoked, at which time we will drop our 3093 * membership to the mcg so that the pending Tx's complete 3094 * successfully. Refer to comments on "AH and MCE active 3095 * list manipulation" at top of this file. The lock protects 3096 * against Tx fast path and Tx cleanup code. 3097 */ 3098 mutex_enter(&state->id_ac_mutex); 3099 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3100 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3101 IB_MC_JSTATE_SEND_ONLY_NON)); 3102 mutex_exit(&state->id_ac_mutex); 3103 } 3104 3105 if (recycled) { 3106 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3107 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3108 ibd_async_reap_group(state, mce, mgid, jstate); 3109 } 3110 } 3111 3112 /* 3113 * Find the broadcast address as defined by IPoIB; implicitly 3114 * determines the IBA scope, mtu, tclass etc of the link the 3115 * interface is going to be a member of. 3116 */ 3117 static ibt_status_t 3118 ibd_find_bgroup(ibd_state_t *state) 3119 { 3120 ibt_mcg_attr_t mcg_attr; 3121 uint_t numg; 3122 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3123 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3124 IB_MC_SCOPE_GLOBAL }; 3125 int i, mcgmtu; 3126 boolean_t found = B_FALSE; 3127 int ret; 3128 ibt_mcg_info_t mcg_info; 3129 3130 state->id_bgroup_created = B_FALSE; 3131 3132 query_bcast_grp: 3133 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3134 mcg_attr.mc_pkey = state->id_pkey; 3135 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3136 3137 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3138 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3139 3140 /* 3141 * Look for the IPoIB broadcast group. 3142 */ 3143 state->id_mgid.gid_prefix = 3144 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3145 ((uint64_t)state->id_scope << 48) | 3146 ((uint32_t)(state->id_pkey << 16))); 3147 mcg_attr.mc_mgid = state->id_mgid; 3148 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3149 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3150 found = B_TRUE; 3151 break; 3152 } 3153 } 3154 3155 if (!found) { 3156 if (ibd_create_broadcast_group) { 3157 /* 3158 * If we created the broadcast group, but failed to 3159 * find it, we can't do anything except leave the 3160 * one we created and return failure. 3161 */ 3162 if (state->id_bgroup_created) { 3163 ibd_print_warn(state, "IPoIB broadcast group " 3164 "absent. 
Unable to query after create."); 3165 goto find_bgroup_fail; 3166 } 3167 3168 /* 3169 * Create the ipoib broadcast group if it didn't exist 3170 */ 3171 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3172 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3173 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3174 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3175 mcg_attr.mc_pkey = state->id_pkey; 3176 mcg_attr.mc_flow = 0; 3177 mcg_attr.mc_sl = 0; 3178 mcg_attr.mc_tclass = 0; 3179 state->id_mgid.gid_prefix = 3180 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3181 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3182 ((uint32_t)(state->id_pkey << 16))); 3183 mcg_attr.mc_mgid = state->id_mgid; 3184 3185 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3186 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3187 ibd_print_warn(state, "IPoIB broadcast group " 3188 "absent, create failed: ret = %d\n", ret); 3189 state->id_bgroup_created = B_FALSE; 3190 return (IBT_FAILURE); 3191 } 3192 state->id_bgroup_created = B_TRUE; 3193 goto query_bcast_grp; 3194 } else { 3195 ibd_print_warn(state, "IPoIB broadcast group absent"); 3196 return (IBT_FAILURE); 3197 } 3198 } 3199 3200 /* 3201 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3202 */ 3203 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3204 if (state->id_mtu < mcgmtu) { 3205 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3206 "greater than port's maximum MTU %d", mcgmtu, 3207 state->id_mtu); 3208 ibt_free_mcg_info(state->id_mcinfo, 1); 3209 goto find_bgroup_fail; 3210 } 3211 state->id_mtu = mcgmtu; 3212 3213 return (IBT_SUCCESS); 3214 3215 find_bgroup_fail: 3216 if (state->id_bgroup_created) { 3217 (void) ibt_leave_mcg(state->id_sgid, 3218 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3219 IB_MC_JSTATE_FULL); 3220 } 3221 3222 return (IBT_FAILURE); 3223 } 3224 3225 static int 3226 ibd_alloc_tx_copybufs(ibd_state_t *state) 3227 { 3228 ibt_mr_attr_t mem_attr; 3229 3230 /* 3231 * Allocate one big chunk for all regular tx copy bufs 3232 */ 3233 state->id_tx_buf_sz = state->id_mtu; 3234 if (state->id_lso_policy && state->id_lso_capable && 3235 (IBD_TX_BUF_SZ > state->id_mtu)) { 3236 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3237 } 3238 3239 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3240 state->id_tx_buf_sz, KM_SLEEP); 3241 3242 /* 3243 * Do one memory registration on the entire txbuf area 3244 */ 3245 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3246 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3247 mem_attr.mr_as = NULL; 3248 mem_attr.mr_flags = IBT_MR_SLEEP; 3249 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3250 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3251 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3252 kmem_free(state->id_tx_bufs, 3253 state->id_num_swqe * state->id_tx_buf_sz); 3254 state->id_tx_bufs = NULL; 3255 return (DDI_FAILURE); 3256 } 3257 3258 return (DDI_SUCCESS); 3259 } 3260 3261 static int 3262 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3263 { 3264 ibt_mr_attr_t mem_attr; 3265 ibd_lsobuf_t *buflist; 3266 ibd_lsobuf_t *lbufp; 3267 ibd_lsobuf_t *tail; 3268 ibd_lsobkt_t *bktp; 3269 uint8_t *membase; 3270 uint8_t *memp; 3271 uint_t memsz; 3272 int i; 3273 3274 /* 3275 * Allocate the lso bucket 3276 */ 3277 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3278 3279 /* 3280 * Allocate the entire lso memory and register it 3281 */ 3282 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3283 membase = kmem_zalloc(memsz, KM_SLEEP); 3284 3285 mem_attr.mr_vaddr = 
(uint64_t)(uintptr_t)membase; 3286 mem_attr.mr_len = memsz; 3287 mem_attr.mr_as = NULL; 3288 mem_attr.mr_flags = IBT_MR_SLEEP; 3289 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3290 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3291 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3292 kmem_free(membase, memsz); 3293 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3294 return (DDI_FAILURE); 3295 } 3296 3297 /* 3298 * Now allocate the buflist. Note that the elements in the buflist and 3299 * the buffers in the lso memory have a permanent 1-1 relation, so we 3300 * can always derive the address of a buflist entry from the address of 3301 * an lso buffer. 3302 */ 3303 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3304 KM_SLEEP); 3305 3306 /* 3307 * Set up the lso buf chain 3308 */ 3309 memp = membase; 3310 lbufp = buflist; 3311 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3312 lbufp->lb_isfree = 1; 3313 lbufp->lb_buf = memp; 3314 lbufp->lb_next = lbufp + 1; 3315 3316 tail = lbufp; 3317 3318 memp += IBD_LSO_BUFSZ; 3319 lbufp++; 3320 } 3321 tail->lb_next = NULL; 3322 3323 /* 3324 * Set up the LSO buffer information in ibd state 3325 */ 3326 bktp->bkt_bufl = buflist; 3327 bktp->bkt_free_head = buflist; 3328 bktp->bkt_mem = membase; 3329 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3330 bktp->bkt_nfree = bktp->bkt_nelem; 3331 3332 state->id_lso = bktp; 3333 3334 return (DDI_SUCCESS); 3335 } 3336 3337 /* 3338 * Statically allocate Tx buffer list(s). 3339 */ 3340 static int 3341 ibd_init_txlist(ibd_state_t *state) 3342 { 3343 ibd_swqe_t *swqe; 3344 ibt_lkey_t lkey; 3345 int i; 3346 3347 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3348 return (DDI_FAILURE); 3349 3350 if (state->id_lso_policy && state->id_lso_capable) { 3351 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3352 state->id_lso_policy = B_FALSE; 3353 } 3354 3355 /* 3356 * Allocate and setup the swqe list 3357 */ 3358 lkey = state->id_tx_mr_desc.md_lkey; 3359 for (i = 0; i < state->id_num_swqe; i++) { 3360 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3361 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3362 ibd_fini_txlist(state); 3363 return (DDI_FAILURE); 3364 } 3365 3366 /* add to list */ 3367 state->id_tx_list.dl_cnt++; 3368 if (state->id_tx_list.dl_head == NULL) { 3369 swqe->swqe_prev = NULL; 3370 swqe->swqe_next = NULL; 3371 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3372 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3373 } else { 3374 swqe->swqe_prev = state->id_tx_list.dl_tail; 3375 swqe->swqe_next = NULL; 3376 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3377 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3378 } 3379 } 3380 3381 return (DDI_SUCCESS); 3382 } 3383 3384 static int 3385 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3386 uint32_t *nds_p) 3387 { 3388 ibd_lsobkt_t *bktp; 3389 ibd_lsobuf_t *lbufp; 3390 ibd_lsobuf_t *nextp; 3391 ibt_lkey_t lso_lkey; 3392 uint_t frag_sz; 3393 uint_t num_needed; 3394 int i; 3395 3396 ASSERT(sgl_p != NULL); 3397 ASSERT(nds_p != NULL); 3398 ASSERT(req_sz != 0); 3399 3400 /* 3401 * Determine how many bufs we'd need for the size requested 3402 */ 3403 num_needed = req_sz / IBD_LSO_BUFSZ; 3404 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3405 num_needed++; 3406 3407 mutex_enter(&state->id_lso_lock); 3408 3409 /* 3410 * If we don't have enough lso bufs, return failure 3411 */ 3412 ASSERT(state->id_lso != NULL); 3413 bktp = state->id_lso; 3414 if (bktp->bkt_nfree < num_needed) { 3415 
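		/*
		 * Not enough free lso bufs to satisfy this request, so
		 * fail it. As an illustration (using a purely hypothetical
		 * IBD_LSO_BUFSZ of 64KB), a req_sz of 70000 bytes gives
		 * num_needed = 2 and frag_sz = 4464, so at least two
		 * buffers would have to be free at this point.
		 */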
mutex_exit(&state->id_lso_lock); 3416 return (-1); 3417 } 3418 3419 /* 3420 * Pick the first 'num_needed' bufs from the free list 3421 */ 3422 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3423 lbufp = bktp->bkt_free_head; 3424 for (i = 0; i < num_needed; i++) { 3425 ASSERT(lbufp->lb_isfree != 0); 3426 ASSERT(lbufp->lb_buf != NULL); 3427 3428 nextp = lbufp->lb_next; 3429 3430 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3431 sgl_p[i].ds_key = lso_lkey; 3432 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3433 3434 lbufp->lb_isfree = 0; 3435 lbufp->lb_next = NULL; 3436 3437 lbufp = nextp; 3438 } 3439 bktp->bkt_free_head = lbufp; 3440 3441 /* 3442 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3443 * to adjust the last sgl entry's length. Since we know we need at least 3444 * one, the i-1 use below is ok. 3445 */ 3446 if (frag_sz) { 3447 sgl_p[i-1].ds_len = frag_sz; 3448 } 3449 3450 /* 3451 * Update nfree count and return 3452 */ 3453 bktp->bkt_nfree -= num_needed; 3454 3455 mutex_exit(&state->id_lso_lock); 3456 3457 *nds_p = num_needed; 3458 3459 return (0); 3460 } 3461 3462 static void 3463 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3464 { 3465 ibd_lsobkt_t *bktp; 3466 ibd_lsobuf_t *lbufp; 3467 uint8_t *lso_mem_end; 3468 uint_t ndx; 3469 int i; 3470 3471 mutex_enter(&state->id_lso_lock); 3472 3473 bktp = state->id_lso; 3474 ASSERT(bktp != NULL); 3475 3476 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3477 for (i = 0; i < nds; i++) { 3478 uint8_t *va; 3479 3480 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3481 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3482 3483 /* 3484 * Figure out the buflist element this sgl buffer corresponds 3485 * to and put it back at the head 3486 */ 3487 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3488 lbufp = bktp->bkt_bufl + ndx; 3489 3490 ASSERT(lbufp->lb_isfree == 0); 3491 ASSERT(lbufp->lb_buf == va); 3492 3493 lbufp->lb_isfree = 1; 3494 lbufp->lb_next = bktp->bkt_free_head; 3495 bktp->bkt_free_head = lbufp; 3496 } 3497 bktp->bkt_nfree += nds; 3498 3499 mutex_exit(&state->id_lso_lock); 3500 } 3501 3502 static void 3503 ibd_free_tx_copybufs(ibd_state_t *state) 3504 { 3505 /* 3506 * Unregister txbuf mr 3507 */ 3508 if (ibt_deregister_mr(state->id_hca_hdl, 3509 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3510 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3511 } 3512 state->id_tx_mr_hdl = NULL; 3513 3514 /* 3515 * Free txbuf memory 3516 */ 3517 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3518 state->id_tx_bufs = NULL; 3519 } 3520 3521 static void 3522 ibd_free_tx_lsobufs(ibd_state_t *state) 3523 { 3524 ibd_lsobkt_t *bktp; 3525 3526 mutex_enter(&state->id_lso_lock); 3527 3528 if ((bktp = state->id_lso) == NULL) { 3529 mutex_exit(&state->id_lso_lock); 3530 return; 3531 } 3532 3533 /* 3534 * First, free the buflist 3535 */ 3536 ASSERT(bktp->bkt_bufl != NULL); 3537 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3538 3539 /* 3540 * Unregister the LSO memory and free it 3541 */ 3542 ASSERT(bktp->bkt_mr_hdl != NULL); 3543 if (ibt_deregister_mr(state->id_hca_hdl, 3544 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3545 DPRINT(10, 3546 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3547 } 3548 ASSERT(bktp->bkt_mem); 3549 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3550 3551 /* 3552 * Finally free the bucket 3553 */ 3554 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3555 state->id_lso = NULL; 3556 3557 mutex_exit(&state->id_lso_lock); 3558 } 3559 3560 /* 3561
* Free the statically allocated Tx buffer list. 3562 */ 3563 static void 3564 ibd_fini_txlist(ibd_state_t *state) 3565 { 3566 ibd_swqe_t *node; 3567 3568 /* 3569 * Free the allocated swqes 3570 */ 3571 mutex_enter(&state->id_tx_list.dl_mutex); 3572 while (state->id_tx_list.dl_head != NULL) { 3573 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3574 state->id_tx_list.dl_head = node->swqe_next; 3575 ASSERT(state->id_tx_list.dl_cnt > 0); 3576 state->id_tx_list.dl_cnt--; 3577 ibd_free_swqe(state, node); 3578 } 3579 mutex_exit(&state->id_tx_list.dl_mutex); 3580 3581 ibd_free_tx_lsobufs(state); 3582 ibd_free_tx_copybufs(state); 3583 } 3584 3585 /* 3586 * Allocate a single send wqe and register it so it is almost 3587 * ready to be posted to the hardware. 3588 */ 3589 static int 3590 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3591 { 3592 ibd_swqe_t *swqe; 3593 3594 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3595 *wqe = swqe; 3596 3597 swqe->swqe_type = IBD_WQE_SEND; 3598 swqe->swqe_next = NULL; 3599 swqe->swqe_prev = NULL; 3600 swqe->swqe_im_mblk = NULL; 3601 3602 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3603 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3604 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3605 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3606 3607 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3608 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3609 swqe->w_swr.wr_trans = IBT_UD_SRV; 3610 3611 /* These are set in send */ 3612 swqe->w_swr.wr_nds = 0; 3613 swqe->w_swr.wr_sgl = NULL; 3614 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3615 3616 return (DDI_SUCCESS); 3617 } 3618 3619 /* 3620 * Free an allocated send wqe. 3621 */ 3622 /*ARGSUSED*/ 3623 static void 3624 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3625 { 3626 kmem_free(swqe, sizeof (ibd_swqe_t)); 3627 } 3628 3629 /* 3630 * Post a rwqe to the hardware and add it to the Rx list. The 3631 * "recycle" parameter indicates whether an old rwqe is being 3632 * recycled, or this is a new one. 3633 */ 3634 static int 3635 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3636 { 3637 ibt_status_t ibt_status; 3638 3639 if (recycle == B_FALSE) { 3640 mutex_enter(&state->id_rx_list.dl_mutex); 3641 if (state->id_rx_list.dl_head == NULL) { 3642 rwqe->rwqe_prev = NULL; 3643 rwqe->rwqe_next = NULL; 3644 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3645 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3646 } else { 3647 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3648 rwqe->rwqe_next = NULL; 3649 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3650 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3651 } 3652 mutex_exit(&state->id_rx_list.dl_mutex); 3653 } 3654 3655 mutex_enter(&state->id_rxpost_lock); 3656 if (state->id_rx_busy) { 3657 rwqe->w_post_link = NULL; 3658 if (state->id_rx_head) 3659 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3660 else 3661 state->id_rx_head = rwqe; 3662 state->id_rx_tailp = &(rwqe->w_post_link); 3663 } else { 3664 state->id_rx_busy = 1; 3665 do { 3666 mutex_exit(&state->id_rxpost_lock); 3667 3668 /* 3669 * Here we should add dl_cnt before post recv, because 3670 * we would have to make sure dl_cnt is updated before 3671 * the corresponding ibd_process_rx() is called. 
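 * If the increment were done only after ibt_post_recv(), a completion
 * that fires immediately could find a stale dl_cnt; on a posting
 * failure the increment is simply backed out with atomic_add_32_nv().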
3672 */ 3673 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3674 3675 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3676 &rwqe->w_rwr, 1, NULL); 3677 if (ibt_status != IBT_SUCCESS) { 3678 (void) atomic_add_32_nv( 3679 &state->id_rx_list.dl_cnt, -1); 3680 ibd_print_warn(state, "ibd_post_recv: " 3681 "posting failed, ret=%d", ibt_status); 3682 return (DDI_FAILURE); 3683 } 3684 3685 mutex_enter(&state->id_rxpost_lock); 3686 rwqe = state->id_rx_head; 3687 if (rwqe) { 3688 state->id_rx_head = 3689 (ibd_rwqe_t *)(rwqe->w_post_link); 3690 } 3691 } while (rwqe); 3692 state->id_rx_busy = 0; 3693 } 3694 mutex_exit(&state->id_rxpost_lock); 3695 3696 return (DDI_SUCCESS); 3697 } 3698 3699 /* 3700 * Allocate the statically allocated Rx buffer list. 3701 */ 3702 static int 3703 ibd_init_rxlist(ibd_state_t *state) 3704 { 3705 ibd_rwqe_t *rwqe; 3706 int i; 3707 3708 for (i = 0; i < state->id_num_rwqe; i++) { 3709 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3710 ibd_fini_rxlist(state); 3711 return (DDI_FAILURE); 3712 } 3713 3714 if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { 3715 ibd_free_rwqe(state, rwqe); 3716 ibd_fini_rxlist(state); 3717 return (DDI_FAILURE); 3718 } 3719 } 3720 3721 return (DDI_SUCCESS); 3722 } 3723 3724 /* 3725 * Free the statically allocated Rx buffer list. 3726 * 3727 */ 3728 static void 3729 ibd_fini_rxlist(ibd_state_t *state) 3730 { 3731 ibd_rwqe_t *node; 3732 3733 mutex_enter(&state->id_rx_list.dl_mutex); 3734 while (state->id_rx_list.dl_head != NULL) { 3735 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3736 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3737 ASSERT(state->id_rx_list.dl_cnt > 0); 3738 state->id_rx_list.dl_cnt--; 3739 3740 ibd_free_rwqe(state, node); 3741 } 3742 mutex_exit(&state->id_rx_list.dl_mutex); 3743 } 3744 3745 /* 3746 * Allocate a single recv wqe and register it so it is almost 3747 * ready to be posted to the hardware. 
3748 */ 3749 static int 3750 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3751 { 3752 ibt_mr_attr_t mem_attr; 3753 ibd_rwqe_t *rwqe; 3754 3755 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3756 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3757 return (DDI_FAILURE); 3758 } 3759 *wqe = rwqe; 3760 rwqe->rwqe_type = IBD_WQE_RECV; 3761 rwqe->w_state = state; 3762 rwqe->rwqe_next = NULL; 3763 rwqe->rwqe_prev = NULL; 3764 rwqe->w_freeing_wqe = B_FALSE; 3765 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3766 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3767 3768 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3769 IPOIB_GRH_SIZE, KM_NOSLEEP); 3770 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3771 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3772 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3773 return (DDI_FAILURE); 3774 } 3775 3776 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3777 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3778 NULL) { 3779 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3780 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3781 state->id_mtu + IPOIB_GRH_SIZE); 3782 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3783 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3784 return (DDI_FAILURE); 3785 } 3786 3787 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3788 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3789 mem_attr.mr_as = NULL; 3790 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3791 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3792 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3793 IBT_SUCCESS) { 3794 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3795 rwqe->w_freeing_wqe = B_TRUE; 3796 freemsg(rwqe->rwqe_im_mblk); 3797 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3798 state->id_mtu + IPOIB_GRH_SIZE); 3799 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3800 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3801 return (DDI_FAILURE); 3802 } 3803 3804 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3805 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3806 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3807 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3808 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3809 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3810 rwqe->w_rwr.wr_nds = 1; 3811 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3812 3813 return (DDI_SUCCESS); 3814 } 3815 3816 /* 3817 * Free an allocated recv wqe. 3818 */ 3819 static void 3820 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3821 { 3822 if (ibt_deregister_mr(state->id_hca_hdl, 3823 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3824 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3825 return; 3826 } 3827 3828 /* 3829 * Indicate to the callback function that this rwqe/mblk 3830 * should not be recycled. The freemsg() will invoke 3831 * ibd_freemsg_cb(). 3832 */ 3833 if (rwqe->rwqe_im_mblk != NULL) { 3834 rwqe->w_freeing_wqe = B_TRUE; 3835 freemsg(rwqe->rwqe_im_mblk); 3836 } 3837 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3838 state->id_mtu + IPOIB_GRH_SIZE); 3839 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3840 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3841 } 3842 3843 /* 3844 * Delete the rwqe being freed from the rx list. 
 */
static void
ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	mutex_enter(&state->id_rx_list.dl_mutex);
	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
		state->id_rx_list.dl_head = rwqe->rwqe_next;
	else
		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
	else
		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
	mutex_exit(&state->id_rx_list.dl_mutex);
}

/*
 * IBA Rx/Tx completion queue handler. Guaranteed to be single
 * threaded and nonreentrant for this CQ. When using combined CQ,
 * this handles Tx and Rx completions. With separate CQs, this handles
 * only Rx completions.
 */
/* ARGSUSED */
static void
ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	atomic_add_64(&state->id_num_intrs, 1);

	if (ibd_rx_softintr == 1)
		ddi_trigger_softintr(state->id_rx);
	else
		(void) ibd_intr((char *)state);
}

/*
 * Separate CQ handler for Tx completions, when the Tx CQ is in
 * interrupt driven mode.
 */
/* ARGSUSED */
static void
ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	atomic_add_64(&state->id_num_intrs, 1);

	if (ibd_tx_softintr == 1)
		ddi_trigger_softintr(state->id_tx);
	else
		(void) ibd_tx_recycle((char *)state);
}

/*
 * Multicast group create/delete trap handler. These will be delivered
 * on a kernel thread (handling can thus block) and can be invoked
 * concurrently. The handler can be invoked anytime after it is
 * registered and before ibt_detach().
 */
/* ARGSUSED */
static void
ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
    ibt_subnet_event_t *event)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ibd_req_t *req;

	/*
	 * The trap handler will get invoked once for every event for
	 * every port. The input "gid" is the GID0 of the port the
	 * trap came in on; we just need to act on traps that came
	 * to our port, meaning the port on which the ipoib interface
	 * resides. Since ipoib uses GID0 of the port, we just match
	 * the gids to check whether we need to handle the trap.
	 */
	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
		return;

	DPRINT(10, "ibd_notices_handler : %d\n", code);

	switch (code) {
	case IBT_SM_EVENT_UNAVAILABLE:
		/*
		 * If we are in promiscuous mode or have
		 * sendnonmembers, we need to print a warning
		 * message right now. Else, just store the
		 * information and print it when we enter promiscuous
		 * mode or attempt a nonmember send. We might
		 * also want to stop caching sendnonmember.
		 */
		ibd_print_warn(state, "IBA multicast support "
		    "degraded due to unavailability of multicast "
		    "traps");
		break;
	case IBT_SM_EVENT_AVAILABLE:
		/*
		 * If we printed a warning message above or
		 * while trying to nonmember send or get into
		 * promiscuous mode, print an okay message.
3945 */ 3946 ibd_print_warn(state, "IBA multicast support " 3947 "restored due to availability of multicast " 3948 "traps"); 3949 break; 3950 case IBT_SM_EVENT_MCG_CREATED: 3951 case IBT_SM_EVENT_MCG_DELETED: 3952 /* 3953 * Common processing of creation/deletion traps. 3954 * First check if the instance is being 3955 * [de]initialized; back off then, without doing 3956 * anything more, since we are not sure if the 3957 * async thread is around, or whether we might 3958 * be racing with the detach code in ibd_m_stop() 3959 * that scans the mcg list. 3960 */ 3961 if (!ibd_async_safe(state)) 3962 return; 3963 3964 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 3965 req->rq_gid = event->sm_notice_gid; 3966 req->rq_ptr = (void *)code; 3967 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 3968 break; 3969 } 3970 } 3971 3972 static void 3973 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 3974 { 3975 ib_gid_t mgid = req->rq_gid; 3976 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 3977 3978 DPRINT(10, "ibd_async_trap : %d\n", code); 3979 3980 /* 3981 * Atomically search the nonmember and sendonlymember lists and 3982 * delete. 3983 */ 3984 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 3985 3986 if (state->id_prom_op == IBD_OP_COMPLETED) { 3987 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 3988 3989 /* 3990 * If in promiscuous mode, try to join/attach to the new 3991 * mcg. Given the unreliable out-of-order mode of trap 3992 * delivery, we can never be sure whether it is a problem 3993 * if the join fails. Thus, we warn the admin of a failure 3994 * if this was a creation trap. Note that the trap might 3995 * actually be reporting a long past event, and the mcg 3996 * might already have been deleted, thus we might be warning 3997 * in vain. 3998 */ 3999 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4000 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4001 ibd_print_warn(state, "IBA promiscuous mode missed " 4002 "new multicast gid %016llx:%016llx", 4003 (u_longlong_t)mgid.gid_prefix, 4004 (u_longlong_t)mgid.gid_guid); 4005 } 4006 4007 /* 4008 * Free the request slot allocated by the subnet event thread. 4009 */ 4010 ibd_async_done(state); 4011 } 4012 4013 /* 4014 * GLDv3 entry point to get capabilities. 4015 */ 4016 static boolean_t 4017 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4018 { 4019 ibd_state_t *state = arg; 4020 4021 switch (cap) { 4022 case MAC_CAPAB_HCKSUM: { 4023 uint32_t *txflags = cap_data; 4024 4025 /* 4026 * We either do full checksum or not do it at all 4027 */ 4028 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4029 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4030 else 4031 return (B_FALSE); 4032 break; 4033 } 4034 4035 case MAC_CAPAB_LSO: { 4036 mac_capab_lso_t *cap_lso = cap_data; 4037 4038 /* 4039 * In addition to the capability and policy, since LSO 4040 * relies on hw checksum, we'll not enable LSO if we 4041 * don't have hw checksum. Of course, if the HCA doesn't 4042 * provide the reserved lkey capability, enabling LSO will 4043 * actually affect performance adversely, so we'll disable 4044 * LSO even for that case. 
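		 * (Note that ibd_setup_ud_channel() applies the same
		 * policy/capability check when deciding whether to request
		 * IBT_USES_LSO on the UD channel.)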
4045 */ 4046 if (!state->id_lso_policy || !state->id_lso_capable) 4047 return (B_FALSE); 4048 4049 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4050 return (B_FALSE); 4051 4052 if (state->id_hca_res_lkey_capab == 0) { 4053 ibd_print_warn(state, "no reserved-lkey capability, " 4054 "disabling LSO"); 4055 return (B_FALSE); 4056 } 4057 4058 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4059 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4060 break; 4061 } 4062 4063 default: 4064 return (B_FALSE); 4065 } 4066 4067 return (B_TRUE); 4068 } 4069 4070 static int 4071 ibd_get_port_details(ibd_state_t *state) 4072 { 4073 ibt_hca_portinfo_t *port_infop; 4074 ibt_status_t ret; 4075 uint_t psize, port_infosz; 4076 4077 mutex_enter(&state->id_link_mutex); 4078 4079 /* 4080 * Query for port information 4081 */ 4082 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4083 &port_infop, &psize, &port_infosz); 4084 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4085 mutex_exit(&state->id_link_mutex); 4086 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4087 "failed, ret=%d", ret); 4088 return (ENETDOWN); 4089 } 4090 4091 /* 4092 * If the link already went down by the time we get here, 4093 * give up 4094 */ 4095 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4096 mutex_exit(&state->id_link_mutex); 4097 ibt_free_portinfo(port_infop, port_infosz); 4098 DPRINT(10, "ibd_get_port_details: port is not active"); 4099 return (ENETDOWN); 4100 } 4101 4102 /* 4103 * If the link is active, verify the pkey 4104 */ 4105 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4106 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4107 mutex_exit(&state->id_link_mutex); 4108 ibt_free_portinfo(port_infop, port_infosz); 4109 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4110 "failed, ret=%d", ret); 4111 return (ENONET); 4112 } 4113 4114 state->id_mtu = (128 << port_infop->p_mtu); 4115 state->id_sgid = *port_infop->p_sgid_tbl; 4116 state->id_link_state = LINK_STATE_UP; 4117 4118 mutex_exit(&state->id_link_mutex); 4119 ibt_free_portinfo(port_infop, port_infosz); 4120 4121 /* 4122 * Now that the port is active, record the port speed 4123 */ 4124 state->id_link_speed = ibd_get_portspeed(state); 4125 4126 return (0); 4127 } 4128 4129 static int 4130 ibd_alloc_cqs(ibd_state_t *state) 4131 { 4132 ibt_hca_attr_t hca_attrs; 4133 ibt_cq_attr_t cq_attr; 4134 ibt_status_t ret; 4135 uint32_t real_size; 4136 4137 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4138 ASSERT(ret == IBT_SUCCESS); 4139 4140 /* 4141 * Allocate Rx/combined CQ: 4142 * Theoretically, there is no point in having more than #rwqe 4143 * plus #swqe cqe's, except that the CQ will be signalled for 4144 * overflow when the last wqe completes, if none of the previous 4145 * cqe's have been polled. Thus, we allocate just a few less wqe's 4146 * to make sure such overflow does not occur. 4147 */ 4148 cq_attr.cq_sched = NULL; 4149 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4150 4151 if (ibd_separate_cqs == 1) { 4152 /* 4153 * Allocate Receive CQ. 
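		 * The CQ is sized for one entry per rwqe plus a spare;
		 * if the HCA cannot support that many entries, id_num_rwqe
		 * is trimmed down to fit below.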
4154 */ 4155 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4156 cq_attr.cq_size = state->id_num_rwqe + 1; 4157 } else { 4158 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4159 state->id_num_rwqe = cq_attr.cq_size - 1; 4160 } 4161 4162 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4163 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4164 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4165 "failed, ret=%d\n", ret); 4166 return (DDI_FAILURE); 4167 } 4168 4169 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4170 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4171 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4172 "moderation failed, ret=%d\n", ret); 4173 } 4174 4175 state->id_rxwcs_size = state->id_num_rwqe + 1; 4176 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4177 state->id_rxwcs_size, KM_SLEEP); 4178 4179 /* 4180 * Allocate Send CQ. 4181 */ 4182 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4183 cq_attr.cq_size = state->id_num_swqe + 1; 4184 } else { 4185 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4186 state->id_num_swqe = cq_attr.cq_size - 1; 4187 } 4188 4189 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4190 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4191 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4192 "failed, ret=%d\n", ret); 4193 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4194 state->id_rxwcs_size); 4195 (void) ibt_free_cq(state->id_rcq_hdl); 4196 return (DDI_FAILURE); 4197 } 4198 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4199 IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { 4200 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4201 "moderation failed, ret=%d\n", ret); 4202 } 4203 4204 state->id_txwcs_size = state->id_num_swqe + 1; 4205 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4206 state->id_txwcs_size, KM_SLEEP); 4207 } else { 4208 /* 4209 * Allocate combined Send/Receive CQ. 4210 */ 4211 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 4212 state->id_num_swqe + 1)) { 4213 cq_attr.cq_size = state->id_num_rwqe + 4214 state->id_num_swqe + 1; 4215 } else { 4216 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4217 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 4218 state->id_num_rwqe) / (state->id_num_rwqe + 4219 state->id_num_swqe); 4220 state->id_num_swqe = cq_attr.cq_size - 1 - 4221 state->id_num_rwqe; 4222 } 4223 4224 state->id_rxwcs_size = cq_attr.cq_size; 4225 state->id_txwcs_size = state->id_rxwcs_size; 4226 4227 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4228 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4229 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " 4230 "failed, ret=%d\n", ret); 4231 return (DDI_FAILURE); 4232 } 4233 state->id_scq_hdl = state->id_rcq_hdl; 4234 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4235 state->id_rxwcs_size, KM_SLEEP); 4236 state->id_txwcs = state->id_rxwcs; 4237 } 4238 4239 /* 4240 * Print message in case we could not allocate as many wqe's 4241 * as was requested. 
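	 * For example, assuming a hypothetical hca_max_cq_sz of 4096 with
	 * the default 4000/4000 wqe settings in combined-CQ mode, the
	 * proportional split above would leave id_num_rwqe at 2047 and
	 * id_num_swqe at 2048, and both messages below would be printed.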
4242 */ 4243 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4244 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4245 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4246 } 4247 if (state->id_num_swqe != IBD_NUM_SWQE) { 4248 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4249 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4250 } 4251 4252 return (DDI_SUCCESS); 4253 } 4254 4255 static int 4256 ibd_setup_ud_channel(ibd_state_t *state) 4257 { 4258 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4259 ibt_ud_chan_query_attr_t ud_chan_attr; 4260 ibt_status_t ret; 4261 4262 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 4263 if (state->id_hca_res_lkey_capab) 4264 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4265 if (state->id_lso_policy && state->id_lso_capable) 4266 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4267 4268 ud_alloc_attr.ud_hca_port_num = state->id_port; 4269 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4270 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4271 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4272 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4273 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4274 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4275 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4276 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4277 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4278 ud_alloc_attr.ud_clone_chan = NULL; 4279 4280 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4281 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4282 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4283 "failed, ret=%d\n", ret); 4284 return (DDI_FAILURE); 4285 } 4286 4287 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4288 &ud_chan_attr)) != IBT_SUCCESS) { 4289 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4290 "failed, ret=%d\n", ret); 4291 (void) ibt_free_channel(state->id_chnl_hdl); 4292 return (DDI_FAILURE); 4293 } 4294 4295 state->id_qpnum = ud_chan_attr.ud_qpn; 4296 4297 return (DDI_SUCCESS); 4298 } 4299 4300 static int 4301 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4302 { 4303 uint32_t progress = state->id_mac_state; 4304 uint_t attempts; 4305 ibt_status_t ret; 4306 ib_gid_t mgid; 4307 ibd_mce_t *mce; 4308 uint8_t jstate; 4309 4310 /* 4311 * Before we try to stop/undo whatever we did in ibd_start(), 4312 * we need to mark the link state appropriately to prevent the 4313 * ip layer from using this instance for any new transfers. Note 4314 * that if the original state of the link was "up" when we're 4315 * here, we'll set the final link state to "unknown", to behave 4316 * in the same fashion as other ethernet drivers. 4317 */ 4318 mutex_enter(&state->id_link_mutex); 4319 if (cur_link_state == LINK_STATE_DOWN) { 4320 state->id_link_state = cur_link_state; 4321 } else { 4322 state->id_link_state = LINK_STATE_UNKNOWN; 4323 } 4324 mutex_exit(&state->id_link_mutex); 4325 mac_link_update(state->id_mh, state->id_link_state); 4326 4327 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4328 if (progress & IBD_DRV_STARTED) { 4329 state->id_mac_state &= (~IBD_DRV_STARTED); 4330 } 4331 4332 /* 4333 * First, stop receive interrupts; this stops the driver from 4334 * handing up buffers to higher layers. Wait for receive buffers 4335 * to be returned and give up after 5 seconds. 
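	 * (The loop below makes up to 50 passes with a 100ms delay each,
	 * which is where the 5 second bound comes from.)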
4336 */ 4337 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4338 4339 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4340 4341 attempts = 50; 4342 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4343 delay(drv_usectohz(100000)); 4344 if (--attempts == 0) { 4345 /* 4346 * There are pending bufs with the network 4347 * layer and we have no choice but to wait 4348 * for them to be done with. Reap all the 4349 * Tx/Rx completions that were posted since 4350 * we turned off the notification and 4351 * return failure. 4352 */ 4353 DPRINT(2, "ibd_undo_start: " 4354 "reclaiming failed"); 4355 ibd_poll_compq(state, state->id_rcq_hdl); 4356 ibt_set_cq_handler(state->id_rcq_hdl, 4357 ibd_rcq_handler, state); 4358 return (DDI_FAILURE); 4359 } 4360 } 4361 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4362 } 4363 4364 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4365 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4366 4367 mutex_enter(&state->id_trap_lock); 4368 state->id_trap_stop = B_TRUE; 4369 while (state->id_trap_inprog > 0) 4370 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4371 mutex_exit(&state->id_trap_lock); 4372 4373 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4374 } 4375 4376 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4377 /* 4378 * Flushing the channel ensures that all pending WQE's 4379 * are marked with flush_error and handed to the CQ. It 4380 * does not guarantee the invocation of the CQ handler. 4381 * This call is guaranteed to return successfully for 4382 * UD QPNs. 4383 */ 4384 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4385 IBT_SUCCESS) { 4386 DPRINT(10, "ibd_undo_start: flush_channel " 4387 "failed, ret=%d", ret); 4388 } 4389 4390 /* 4391 * Turn off Tx interrupts and poll. By the time the polling 4392 * returns an empty indicator, we are sure we have seen all 4393 * pending Tx callbacks. Note that after the call to 4394 * ibt_set_cq_handler() returns, the old handler is 4395 * guaranteed not to be invoked anymore. 4396 */ 4397 if (ibd_separate_cqs == 1) { 4398 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4399 } 4400 ibd_poll_compq(state, state->id_scq_hdl); 4401 4402 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4403 } 4404 4405 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4406 /* 4407 * No new async requests will be posted since the device 4408 * link state has been marked as unknown; completion handlers 4409 * have been turned off, so Tx handler will not cause any 4410 * more IBD_ASYNC_REAP requests. 4411 * 4412 * Queue a request for the async thread to exit, which will 4413 * be serviced after any pending ones. This can take a while, 4414 * specially if the SM is unreachable, since IBMF will slowly 4415 * timeout each SM request issued by the async thread. Reap 4416 * the thread before continuing on, we do not want it to be 4417 * lingering in modunloaded code (or we could move the reap 4418 * to ibd_detach(), provided we keep track of the current 4419 * id_async_thrid somewhere safe). 4420 */ 4421 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4422 thread_join(state->id_async_thrid); 4423 4424 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4425 } 4426 4427 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4428 /* 4429 * Drop all residual full/non membership. This includes full 4430 * membership to the broadcast group, and any nonmembership 4431 * acquired during transmits. 
We do this after the Tx completion 4432 * handlers are done, since those might result in some late 4433 * leaves; this also eliminates a potential race with that 4434 * path wrt the mc full list insert/delete. Trap handling 4435 * has also been suppressed at this point. Thus, no locks 4436 * are required while traversing the mc full list. 4437 */ 4438 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4439 mce = list_head(&state->id_mc_full); 4440 while (mce != NULL) { 4441 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4442 jstate = mce->mc_jstate; 4443 mce = list_next(&state->id_mc_full, mce); 4444 ibd_leave_group(state, mgid, jstate); 4445 } 4446 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4447 } 4448 4449 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4450 ibd_fini_rxlist(state); 4451 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4452 } 4453 4454 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4455 ibd_fini_txlist(state); 4456 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4457 } 4458 4459 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4460 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4461 IBT_SUCCESS) { 4462 DPRINT(10, "ibd_undo_start: free_channel " 4463 "failed, ret=%d", ret); 4464 } 4465 4466 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4467 } 4468 4469 if (progress & IBD_DRV_CQS_ALLOCD) { 4470 if (ibd_separate_cqs == 1) { 4471 kmem_free(state->id_txwcs, 4472 sizeof (ibt_wc_t) * state->id_txwcs_size); 4473 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4474 IBT_SUCCESS) { 4475 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4476 "failed, ret=%d", ret); 4477 } 4478 } 4479 4480 kmem_free(state->id_rxwcs, 4481 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4482 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4483 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4484 "ret=%d", ret); 4485 } 4486 4487 state->id_txwcs = NULL; 4488 state->id_rxwcs = NULL; 4489 state->id_scq_hdl = NULL; 4490 state->id_rcq_hdl = NULL; 4491 4492 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4493 } 4494 4495 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4496 mod_hash_destroy_hash(state->id_ah_active_hash); 4497 ibd_acache_fini(state); 4498 4499 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4500 } 4501 4502 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4503 /* 4504 * If we'd created the ipoib broadcast group and had 4505 * successfully joined it, leave it now 4506 */ 4507 if (state->id_bgroup_created) { 4508 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4509 jstate = IB_MC_JSTATE_FULL; 4510 (void) ibt_leave_mcg(state->id_sgid, mgid, 4511 state->id_sgid, jstate); 4512 } 4513 ibt_free_mcg_info(state->id_mcinfo, 1); 4514 4515 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4516 } 4517 4518 return (DDI_SUCCESS); 4519 } 4520 4521 /* 4522 * These pair of routines are used to set/clear the condition that 4523 * the caller is likely to do something to change the id_mac_state. 4524 * If there's already someone doing either a start or a stop (possibly 4525 * due to the async handler detecting a pkey relocation event, a plumb 4526 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4527 * that's done. 
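 * ibd_set_mac_progress() blocks on id_macst_cv until no other start/stop
 * is in progress and then sets the given flag; ibd_clr_mac_progress()
 * clears the flag and signals one waiter.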
4528 */ 4529 static void 4530 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4531 { 4532 mutex_enter(&state->id_macst_lock); 4533 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4534 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4535 4536 state->id_mac_state |= flag; 4537 mutex_exit(&state->id_macst_lock); 4538 } 4539 4540 static void 4541 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4542 { 4543 mutex_enter(&state->id_macst_lock); 4544 state->id_mac_state &= (~flag); 4545 cv_signal(&state->id_macst_cv); 4546 mutex_exit(&state->id_macst_lock); 4547 } 4548 4549 /* 4550 * GLDv3 entry point to start hardware. 4551 */ 4552 /*ARGSUSED*/ 4553 static int 4554 ibd_m_start(void *arg) 4555 { 4556 ibd_state_t *state = arg; 4557 int ret; 4558 4559 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4560 4561 ret = ibd_start(state); 4562 4563 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4564 4565 return (ret); 4566 } 4567 4568 static int 4569 ibd_start(ibd_state_t *state) 4570 { 4571 kthread_t *kht; 4572 int err; 4573 ibt_status_t ret; 4574 4575 if (state->id_mac_state & IBD_DRV_STARTED) 4576 return (DDI_SUCCESS); 4577 4578 /* 4579 * Get port details; if we fail here, very likely the port 4580 * state is inactive or the pkey can't be verified. 4581 */ 4582 if ((err = ibd_get_port_details(state)) != 0) { 4583 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4584 goto start_fail; 4585 } 4586 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4587 4588 /* 4589 * Find the IPoIB broadcast group 4590 */ 4591 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4592 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4593 err = ENOTACTIVE; 4594 goto start_fail; 4595 } 4596 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4597 4598 /* 4599 * Initialize per-interface caches and lists; if we fail here, 4600 * it is most likely due to a lack of resources 4601 */ 4602 if (ibd_acache_init(state) != DDI_SUCCESS) { 4603 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4604 err = ENOMEM; 4605 goto start_fail; 4606 } 4607 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4608 4609 /* 4610 * Allocate send and receive completion queues 4611 */ 4612 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4613 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4614 err = ENOMEM; 4615 goto start_fail; 4616 } 4617 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4618 4619 /* 4620 * Setup a UD channel 4621 */ 4622 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4623 err = ENOMEM; 4624 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4625 goto start_fail; 4626 } 4627 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4628 4629 /* 4630 * Allocate and initialize the tx buffer list 4631 */ 4632 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4633 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4634 err = ENOMEM; 4635 goto start_fail; 4636 } 4637 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4638 4639 /* 4640 * If we have separate cqs, create the send cq handler here 4641 */ 4642 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 4643 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4644 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4645 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4646 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4647 "failed, ret=%d", ret); 4648 err = EINVAL; 4649 goto start_fail; 4650 } 4651 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4652 } 4653 4654 /* 4655 * Allocate and initialize the rx buffer list 4656 */ 4657 if 
(ibd_init_rxlist(state) != DDI_SUCCESS) { 4658 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4659 err = ENOMEM; 4660 goto start_fail; 4661 } 4662 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4663 4664 /* 4665 * Join IPoIB broadcast group 4666 */ 4667 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4668 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4669 err = ENOTACTIVE; 4670 goto start_fail; 4671 } 4672 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4673 4674 /* 4675 * Create the async thread; thread_create never fails. 4676 */ 4677 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4678 TS_RUN, minclsyspri); 4679 state->id_async_thrid = kht->t_did; 4680 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4681 4682 /* 4683 * When we did mac_register() in ibd_attach(), we didn't register 4684 * the real macaddr and we didn't have the true port mtu. Now that 4685 * we're almost ready, set the local mac address and broadcast 4686 * addresses and update gldv3 about the real values of these 4687 * parameters. 4688 */ 4689 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4690 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4691 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4692 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4693 4694 mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4695 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4696 4697 /* 4698 * Setup the receive cq handler 4699 */ 4700 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4701 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4702 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4703 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4704 "failed, ret=%d", ret); 4705 err = EINVAL; 4706 goto start_fail; 4707 } 4708 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4709 4710 /* 4711 * Setup the subnet notices handler after we've initialized the acache/ 4712 * mcache and started the async thread, both of which are required for 4713 * the trap handler to function properly. 4714 * 4715 * Now that the async thread has been started (and we've already done 4716 * a mac_register() during attach so mac_tx_update() can be called 4717 * if necessary without any problem), we can enable the trap handler 4718 * to queue requests to the async thread. 4719 */ 4720 ibt_register_subnet_notices(state->id_ibt_hdl, 4721 ibd_snet_notices_handler, state); 4722 mutex_enter(&state->id_trap_lock); 4723 state->id_trap_stop = B_FALSE; 4724 mutex_exit(&state->id_trap_lock); 4725 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4726 4727 /* 4728 * Indicate link status to GLDv3 and higher layers. By default, 4729 * we assume we are in up state (which must have been true at 4730 * least at the time the broadcast mcg's were probed); if there 4731 * were any up/down transitions till the time we come here, the 4732 * async handler will have updated last known state, which we 4733 * use to tell GLDv3. The async handler will not send any 4734 * notifications to GLDv3 till we reach here in the initialization 4735 * sequence. 4736 */ 4737 state->id_mac_state |= IBD_DRV_STARTED; 4738 mac_link_update(state->id_mh, state->id_link_state); 4739 4740 return (DDI_SUCCESS); 4741 4742 start_fail: 4743 /* 4744 * If we ran into a problem during ibd_start() and ran into 4745 * some other problem during undoing our partial work, we can't 4746 * do anything about it. Ignore any errors we might get from 4747 * ibd_undo_start() and just return the original error we got. 
	 */
	(void) ibd_undo_start(state, LINK_STATE_DOWN);
	return (err);
}

/*
 * GLDv3 entry point to stop hardware from receiving packets.
 */
/*ARGSUSED*/
static void
ibd_m_stop(void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);

	(void) ibd_undo_start(state, state->id_link_state);

	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
}

/*
 * GLDv3 entry point to modify device's mac address. We do not
 * allow address modifications.
 */
static int
ibd_m_unicst(void *arg, const uint8_t *macaddr)
{
	ibd_state_t *state = arg;

	/*
	 * Don't bother even comparing the macaddr if we haven't
	 * completed ibd_m_start().
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (EINVAL);
}

/*
 * The blocking part of the IBA join/leave operations is done out
 * of here on the async thread.
 */
static void
ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
{
	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);

	if (op == IBD_ASYNC_JOIN) {
		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
			ibd_print_warn(state, "Join multicast group failed :"
			    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
		}
	} else {
		/*
		 * Here, we must search for the proper mcg_info and
		 * use that to leave the group.
		 */
		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
	}
}

/*
 * GLDv3 entry point for multicast enable/disable requests.
 * This function queues the operation to the async thread and
 * returns success for a valid multicast address.
 */
static int
ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ipoib_mac_t maddr, *mcast;
	ib_gid_t mgid;
	ibd_req_t *req;

	/*
	 * If we haven't completed ibd_m_start(), the async thread wouldn't
	 * have been started and id_bcaddr wouldn't be set, so there's
	 * no point in continuing.
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	/*
	 * The incoming multicast address might not be aligned properly
	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
	 * it to look like one though, to get the offsets of the mc gid,
	 * since we know we are not going to dereference any values with
	 * the ipoib_mac_t pointer.
	 */
	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
	mcast = &maddr;

	/*
	 * Check validity of MCG address. We could additionally check
	 * that an enable/disable is not being issued on the "broadcast"
	 * mcg, but since this operation is only invokable by privileged
	 * programs anyway, we allow the flexibility to those dlpi apps.
	 * Note that we do not validate the "scope" of the IBA mcg.
	 */
	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
		return (EINVAL);

	/*
	 * fill in multicast pkey and scope
	 */
	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);

	/*
	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
	 * nothing (i.e.
we stay JOINed to the broadcast group done in 4864 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 4865 * requires to be joined to broadcast groups at all times. 4866 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4867 * depends on this. 4868 */ 4869 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4870 return (0); 4871 4872 ibd_n2h_gid(mcast, &mgid); 4873 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4874 if (req == NULL) 4875 return (ENOMEM); 4876 4877 req->rq_gid = mgid; 4878 4879 if (add) { 4880 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4881 mgid.gid_prefix, mgid.gid_guid); 4882 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4883 } else { 4884 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4885 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4886 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4887 } 4888 return (0); 4889 } 4890 4891 /* 4892 * The blocking part of the IBA promiscuous operations are done 4893 * out of here on the async thread. The dlpireq parameter indicates 4894 * whether this invocation is due to a dlpi request or due to 4895 * a port up/down event. 4896 */ 4897 static void 4898 ibd_async_unsetprom(ibd_state_t *state) 4899 { 4900 ibd_mce_t *mce = list_head(&state->id_mc_non); 4901 ib_gid_t mgid; 4902 4903 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4904 4905 while (mce != NULL) { 4906 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4907 mce = list_next(&state->id_mc_non, mce); 4908 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4909 } 4910 state->id_prom_op = IBD_OP_NOTSTARTED; 4911 } 4912 4913 /* 4914 * The blocking part of the IBA promiscuous operations are done 4915 * out of here on the async thread. The dlpireq parameter indicates 4916 * whether this invocation is due to a dlpi request or due to 4917 * a port up/down event. 4918 */ 4919 static void 4920 ibd_async_setprom(ibd_state_t *state) 4921 { 4922 ibt_mcg_attr_t mcg_attr; 4923 ibt_mcg_info_t *mcg_info; 4924 ib_gid_t mgid; 4925 uint_t numg; 4926 int i; 4927 char ret = IBD_OP_COMPLETED; 4928 4929 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4930 4931 /* 4932 * Obtain all active MC groups on the IB fabric with 4933 * specified criteria (scope + Pkey + Qkey + mtu). 4934 */ 4935 bzero(&mcg_attr, sizeof (mcg_attr)); 4936 mcg_attr.mc_pkey = state->id_pkey; 4937 mcg_attr.mc_scope = state->id_scope; 4938 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4939 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4940 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4941 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4942 IBT_SUCCESS) { 4943 ibd_print_warn(state, "Could not get list of IBA multicast " 4944 "groups"); 4945 ret = IBD_OP_ERRORED; 4946 goto done; 4947 } 4948 4949 /* 4950 * Iterate over the returned mcg's and join as NonMember 4951 * to the IP mcg's. 4952 */ 4953 for (i = 0; i < numg; i++) { 4954 /* 4955 * Do a NonMember JOIN on the MC group. 4956 */ 4957 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4958 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4959 ibd_print_warn(state, "IBA promiscuous mode missed " 4960 "multicast gid %016llx:%016llx", 4961 (u_longlong_t)mgid.gid_prefix, 4962 (u_longlong_t)mgid.gid_guid); 4963 } 4964 4965 ibt_free_mcg_info(mcg_info, numg); 4966 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4967 done: 4968 state->id_prom_op = ret; 4969 } 4970 4971 /* 4972 * GLDv3 entry point for multicast promiscuous enable/disable requests. 
4973 * GLDv3 assumes phys state receives more packets than multi state, 4974 * which is not true for IPoIB. Thus, treat the multi and phys 4975 * promiscuous states the same way to work with GLDv3's assumption. 4976 */ 4977 static int 4978 ibd_m_promisc(void *arg, boolean_t on) 4979 { 4980 ibd_state_t *state = (ibd_state_t *)arg; 4981 ibd_req_t *req; 4982 4983 /* 4984 * Async thread wouldn't have been started if we haven't 4985 * passed ibd_m_start() 4986 */ 4987 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4988 return (0); 4989 4990 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4991 if (req == NULL) 4992 return (ENOMEM); 4993 if (on) { 4994 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4995 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4996 } else { 4997 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4998 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 4999 } 5000 5001 return (0); 5002 } 5003 5004 /* 5005 * GLDv3 entry point for gathering statistics. 5006 */ 5007 static int 5008 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5009 { 5010 ibd_state_t *state = (ibd_state_t *)arg; 5011 5012 switch (stat) { 5013 case MAC_STAT_IFSPEED: 5014 *val = state->id_link_speed; 5015 break; 5016 case MAC_STAT_MULTIRCV: 5017 *val = state->id_multi_rcv; 5018 break; 5019 case MAC_STAT_BRDCSTRCV: 5020 *val = state->id_brd_rcv; 5021 break; 5022 case MAC_STAT_MULTIXMT: 5023 *val = state->id_multi_xmt; 5024 break; 5025 case MAC_STAT_BRDCSTXMT: 5026 *val = state->id_brd_xmt; 5027 break; 5028 case MAC_STAT_RBYTES: 5029 *val = state->id_rcv_bytes; 5030 break; 5031 case MAC_STAT_IPACKETS: 5032 *val = state->id_rcv_pkt; 5033 break; 5034 case MAC_STAT_OBYTES: 5035 *val = state->id_xmt_bytes; 5036 break; 5037 case MAC_STAT_OPACKETS: 5038 *val = state->id_xmt_pkt; 5039 break; 5040 case MAC_STAT_OERRORS: 5041 *val = state->id_ah_error; /* failed AH translation */ 5042 break; 5043 case MAC_STAT_IERRORS: 5044 *val = 0; 5045 break; 5046 case MAC_STAT_NOXMTBUF: 5047 *val = state->id_tx_short; 5048 break; 5049 case MAC_STAT_NORCVBUF: 5050 default: 5051 return (ENOTSUP); 5052 } 5053 5054 return (0); 5055 } 5056 5057 static void 5058 ibd_async_txsched(ibd_state_t *state) 5059 { 5060 ibd_req_t *req; 5061 int ret; 5062 5063 if (ibd_txcomp_poll) 5064 ibd_poll_compq(state, state->id_scq_hdl); 5065 5066 ret = ibd_resume_transmission(state); 5067 if (ret && ibd_txcomp_poll) { 5068 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 5069 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5070 else { 5071 ibd_print_warn(state, "ibd_async_txsched: " 5072 "no memory, can't schedule work slot"); 5073 } 5074 } 5075 } 5076 5077 static int 5078 ibd_resume_transmission(ibd_state_t *state) 5079 { 5080 int flag; 5081 int met_thresh = 0; 5082 int ret = -1; 5083 5084 mutex_enter(&state->id_sched_lock); 5085 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5086 met_thresh = (state->id_tx_list.dl_cnt > 5087 IBD_FREE_SWQES_THRESH); 5088 flag = IBD_RSRC_SWQE; 5089 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5090 ASSERT(state->id_lso != NULL); 5091 met_thresh = (state->id_lso->bkt_nfree > 5092 IBD_FREE_LSOS_THRESH); 5093 flag = IBD_RSRC_LSOBUF; 5094 } 5095 if (met_thresh) { 5096 state->id_sched_needed &= ~flag; 5097 ret = 0; 5098 } 5099 mutex_exit(&state->id_sched_lock); 5100 5101 if (ret == 0) 5102 mac_tx_update(state->id_mh); 5103 5104 return (ret); 5105 } 5106 5107 /* 5108 * Release the send wqe back into free list. 
5109 */ 5110 static void 5111 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 5112 { 5113 /* 5114 * Add back on Tx list for reuse. 5115 */ 5116 swqe->swqe_next = NULL; 5117 mutex_enter(&state->id_tx_list.dl_mutex); 5118 if (state->id_tx_list.dl_pending_sends) { 5119 state->id_tx_list.dl_pending_sends = B_FALSE; 5120 } 5121 if (state->id_tx_list.dl_head == NULL) { 5122 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 5123 } else { 5124 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 5125 } 5126 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 5127 state->id_tx_list.dl_cnt++; 5128 mutex_exit(&state->id_tx_list.dl_mutex); 5129 } 5130 5131 /* 5132 * Acquire a send wqe from free list. 5133 * Returns error number and send wqe pointer. 5134 */ 5135 static int 5136 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 5137 { 5138 int rc = 0; 5139 ibd_swqe_t *wqe; 5140 5141 /* 5142 * Check and reclaim some of the completed Tx requests. 5143 * If someone else is already in this code and pulling Tx 5144 * completions, no need to poll, since the current lock holder 5145 * will do the work anyway. Normally, we poll for completions 5146 * every few Tx attempts, but if we are short on Tx descriptors, 5147 * we always try to poll. 5148 */ 5149 if ((ibd_txcomp_poll == 1) && 5150 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 5151 ibd_poll_compq(state, state->id_scq_hdl); 5152 } 5153 5154 /* 5155 * Grab required transmit wqes. 5156 */ 5157 mutex_enter(&state->id_tx_list.dl_mutex); 5158 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5159 if (wqe != NULL) { 5160 state->id_tx_list.dl_cnt -= 1; 5161 state->id_tx_list.dl_head = wqe->swqe_next; 5162 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 5163 state->id_tx_list.dl_tail = NULL; 5164 } else { 5165 /* 5166 * If we did not find the number we were looking for, flag 5167 * no resource. Adjust list appropriately in either case. 5168 */ 5169 rc = ENOENT; 5170 state->id_tx_list.dl_pending_sends = B_TRUE; 5171 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5172 atomic_add_64(&state->id_tx_short, 1); 5173 } 5174 mutex_exit(&state->id_tx_list.dl_mutex); 5175 *swqe = wqe; 5176 5177 return (rc); 5178 } 5179 5180 static int 5181 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5182 ibt_ud_dest_hdl_t ud_dest) 5183 { 5184 mblk_t *nmp; 5185 int iph_len, tcph_len; 5186 ibt_wr_lso_t *lso; 5187 uintptr_t ip_start, tcp_start; 5188 uint8_t *dst; 5189 uint_t pending, mblen; 5190 5191 /* 5192 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5193 * we need to adjust it here for lso. 5194 */ 5195 lso = &(node->w_swr.wr.ud_lso); 5196 lso->lso_ud_dest = ud_dest; 5197 lso->lso_mss = mss; 5198 5199 /* 5200 * Calculate the LSO header size and set it in the UD LSO structure. 5201 * Note that the only assumption we make is that each of the IPoIB, 5202 * IP and TCP headers will be contained in a single mblk fragment; 5203 * together, the headers may span multiple mblk fragments. 
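	 * For example, the IPoIB header in one fragment with the IP and
	 * TCP headers in the next is handled below, but an IP or TCP
	 * header that is itself split across two fragments is not.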
5204 */ 5205 nmp = mp; 5206 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5207 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5208 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5209 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5210 nmp = nmp->b_cont; 5211 5212 } 5213 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5214 5215 tcp_start = ip_start + iph_len; 5216 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5217 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5218 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5219 nmp = nmp->b_cont; 5220 } 5221 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5222 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5223 5224 /* 5225 * If the lso header fits entirely within a single mblk fragment, 5226 * we'll avoid an additional copy of the lso header here and just 5227 * pass the b_rptr of the mblk directly. 5228 * 5229 * If this isn't true, we'd have to allocate for it explicitly. 5230 */ 5231 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5232 lso->lso_hdr = mp->b_rptr; 5233 } else { 5234 /* On work completion, remember to free this allocated hdr */ 5235 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5236 if (lso->lso_hdr == NULL) { 5237 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5238 "sz = %d", lso->lso_hdr_sz); 5239 lso->lso_hdr_sz = 0; 5240 lso->lso_mss = 0; 5241 return (-1); 5242 } 5243 } 5244 5245 /* 5246 * Copy in the lso header only if we need to 5247 */ 5248 if (lso->lso_hdr != mp->b_rptr) { 5249 dst = lso->lso_hdr; 5250 pending = lso->lso_hdr_sz; 5251 5252 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5253 mblen = MBLKL(nmp); 5254 if (pending > mblen) { 5255 bcopy(nmp->b_rptr, dst, mblen); 5256 dst += mblen; 5257 pending -= mblen; 5258 } else { 5259 bcopy(nmp->b_rptr, dst, pending); 5260 break; 5261 } 5262 } 5263 } 5264 5265 return (0); 5266 } 5267 5268 static void 5269 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5270 { 5271 ibt_wr_lso_t *lso; 5272 5273 if ((!node) || (!mp)) 5274 return; 5275 5276 /* 5277 * Free any header space that we might've allocated if we 5278 * did an LSO 5279 */ 5280 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5281 lso = &(node->w_swr.wr.ud_lso); 5282 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5283 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5284 lso->lso_hdr = NULL; 5285 lso->lso_hdr_sz = 0; 5286 } 5287 } 5288 } 5289 5290 static void 5291 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5292 { 5293 uint_t i; 5294 uint_t num_posted; 5295 uint_t n_wrs; 5296 ibt_status_t ibt_status; 5297 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 5298 ibd_swqe_t *elem; 5299 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 5300 5301 node->swqe_next = NULL; 5302 5303 mutex_enter(&state->id_txpost_lock); 5304 5305 /* 5306 * Enqueue the new node in chain of wqes to send 5307 */ 5308 if (state->id_tx_head) { 5309 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 5310 } else { 5311 state->id_tx_head = node; 5312 } 5313 state->id_tx_tailp = &(node->swqe_next); 5314 5315 /* 5316 * If someone else is helping out with the sends, 5317 * just go back 5318 */ 5319 if (state->id_tx_busy) { 5320 mutex_exit(&state->id_txpost_lock); 5321 return; 5322 } 5323 5324 /* 5325 * Otherwise, mark the flag to indicate that we'll be 5326 * doing the dispatch of what's there in the wqe chain 5327 */ 5328 state->id_tx_busy = 1; 5329 5330 while (state->id_tx_head) { 5331 /* 5332 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 5333 * at a time if possible, and keep posting them. 
5334 */ 5335 for (n_wrs = 0, elem = state->id_tx_head; 5336 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 5337 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5338 5339 nodes[n_wrs] = elem; 5340 wrs[n_wrs] = elem->w_swr; 5341 } 5342 state->id_tx_head = elem; 5343 5344 /* 5345 * Release the txpost lock before posting the 5346 * send request to the hca; if the posting fails 5347 * for some reason, we'll never receive completion 5348 * intimation, so we'll need to cleanup. 5349 */ 5350 mutex_exit(&state->id_txpost_lock); 5351 5352 ASSERT(n_wrs != 0); 5353 5354 /* 5355 * If posting fails for some reason, we'll never receive 5356 * completion intimation, so we'll need to cleanup. But 5357 * we need to make sure we don't clean up nodes whose 5358 * wrs have been successfully posted. We assume that the 5359 * hca driver returns on the first failure to post and 5360 * therefore the first 'num_posted' entries don't need 5361 * cleanup here. 5362 */ 5363 num_posted = 0; 5364 ibt_status = ibt_post_send(state->id_chnl_hdl, 5365 wrs, n_wrs, &num_posted); 5366 if (ibt_status != IBT_SUCCESS) { 5367 5368 ibd_print_warn(state, "ibd_post_send: " 5369 "posting multiple wrs failed: " 5370 "requested=%d, done=%d, ret=%d", 5371 n_wrs, num_posted, ibt_status); 5372 5373 for (i = num_posted; i < n_wrs; i++) 5374 ibd_tx_cleanup(state, nodes[i]); 5375 } 5376 5377 /* 5378 * Grab the mutex before we go and check the tx Q again 5379 */ 5380 mutex_enter(&state->id_txpost_lock); 5381 } 5382 5383 state->id_tx_busy = 0; 5384 mutex_exit(&state->id_txpost_lock); 5385 } 5386 5387 static int 5388 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5389 uint_t lsohdr_sz) 5390 { 5391 ibt_wr_ds_t *sgl; 5392 ibt_status_t ibt_status; 5393 mblk_t *nmp; 5394 mblk_t *data_mp; 5395 uchar_t *bufp; 5396 size_t blksize; 5397 size_t skip; 5398 size_t avail; 5399 uint_t pktsize; 5400 uint_t frag_len; 5401 uint_t pending_hdr; 5402 uint_t hiwm; 5403 int nmblks; 5404 int i; 5405 5406 /* 5407 * Let's skip ahead to the data if this is LSO 5408 */ 5409 data_mp = mp; 5410 pending_hdr = 0; 5411 if (lsohdr_sz) { 5412 pending_hdr = lsohdr_sz; 5413 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5414 frag_len = nmp->b_wptr - nmp->b_rptr; 5415 if (frag_len > pending_hdr) 5416 break; 5417 pending_hdr -= frag_len; 5418 } 5419 data_mp = nmp; /* start of data past lso header */ 5420 ASSERT(data_mp != NULL); 5421 } 5422 5423 /* 5424 * Calculate the size of message data and number of msg blocks 5425 */ 5426 pktsize = 0; 5427 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5428 nmp = nmp->b_cont, nmblks++) { 5429 pktsize += MBLKL(nmp); 5430 } 5431 pktsize -= pending_hdr; 5432 5433 /* 5434 * Translating the virtual address regions into physical regions 5435 * for using the Reserved LKey feature results in a wr sgl that 5436 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5437 * we'll fix a high-water mark (65%) for when we should stop. 5438 */ 5439 hiwm = (state->id_max_sqseg * 65) / 100; 5440 5441 /* 5442 * We only do ibt_map_mem_iov() if the pktsize is above the 5443 * "copy-threshold", and if the number of mp fragments is less than 5444 * the maximum acceptable. 
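	 * For instance, with a hypothetical id_max_sqseg of 60, the 65%
	 * high-water mark computed above works out to 39 fragments.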
5445 */ 5446 if ((state->id_hca_res_lkey_capab) && 5447 (pktsize > IBD_TX_COPY_THRESH) && 5448 (nmblks < hiwm)) { 5449 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5450 ibt_iov_attr_t iov_attr; 5451 5452 iov_attr.iov_as = NULL; 5453 iov_attr.iov = iov_arr; 5454 iov_attr.iov_buf = NULL; 5455 iov_attr.iov_list_len = nmblks; 5456 iov_attr.iov_wr_nds = state->id_max_sqseg; 5457 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5458 iov_attr.iov_flags = IBT_IOV_SLEEP; 5459 5460 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5461 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5462 iov_arr[i].iov_len = MBLKL(nmp); 5463 if (i == 0) { 5464 iov_arr[i].iov_addr += pending_hdr; 5465 iov_arr[i].iov_len -= pending_hdr; 5466 } 5467 } 5468 5469 node->w_buftype = IBD_WQE_MAPPED; 5470 node->w_swr.wr_sgl = node->w_sgl; 5471 5472 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5473 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5474 if (ibt_status != IBT_SUCCESS) { 5475 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5476 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5477 goto ibd_copy_path; 5478 } 5479 5480 return (0); 5481 } 5482 5483 ibd_copy_path: 5484 if (pktsize <= state->id_tx_buf_sz) { 5485 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5486 node->w_swr.wr_nds = 1; 5487 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5488 node->w_buftype = IBD_WQE_TXBUF; 5489 5490 /* 5491 * Even though this is the copy path for transfers less than 5492 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5493 * is possible the first data mblk fragment (data_mp) still 5494 * contains part of the LSO header that we need to skip. 5495 */ 5496 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5497 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5498 blksize = MBLKL(nmp) - pending_hdr; 5499 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5500 bufp += blksize; 5501 pending_hdr = 0; 5502 } 5503 5504 return (0); 5505 } 5506 5507 /* 5508 * Copy path for transfers greater than id_tx_buf_sz 5509 */ 5510 node->w_swr.wr_sgl = node->w_sgl; 5511 if (ibd_acquire_lsobufs(state, pktsize, 5512 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5513 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5514 return (-1); 5515 } 5516 node->w_buftype = IBD_WQE_LSOBUF; 5517 5518 /* 5519 * Copy the larger-than-id_tx_buf_sz packet into a set of 5520 * fixed-sized, pre-mapped LSO buffers. Note that we might 5521 * need to skip part of the LSO header in the first fragment 5522 * as before. 5523 */ 5524 nmp = data_mp; 5525 skip = pending_hdr; 5526 for (i = 0; i < node->w_swr.wr_nds; i++) { 5527 sgl = node->w_swr.wr_sgl + i; 5528 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5529 avail = IBD_LSO_BUFSZ; 5530 while (nmp && avail) { 5531 blksize = MBLKL(nmp) - skip; 5532 if (blksize > avail) { 5533 bcopy(nmp->b_rptr + skip, bufp, avail); 5534 skip += avail; 5535 avail = 0; 5536 } else { 5537 bcopy(nmp->b_rptr + skip, bufp, blksize); 5538 skip = 0; 5539 avail -= blksize; 5540 bufp += blksize; 5541 nmp = nmp->b_cont; 5542 } 5543 } 5544 } 5545 5546 return (0); 5547 } 5548 5549 /* 5550 * Schedule a completion queue polling to reap the resource we're 5551 * short on. If we implement the change to reap tx completions 5552 * in a separate thread, we'll need to wake up that thread here. 
5553 */ 5554 static int 5555 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5556 { 5557 ibd_req_t *req; 5558
5559 mutex_enter(&state->id_sched_lock); 5560 state->id_sched_needed |= resource_type; 5561 mutex_exit(&state->id_sched_lock); 5562
5563 /* 5564 * If we are asked to queue a work entry, we need to do it 5565 */ 5566 if (q_flag) { 5567 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5568 if (req == NULL) 5569 return (-1); 5570
5571 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5572 } 5573
5574 return (0); 5575 } 5576
5577 /* 5578 * The passed-in packet has this format: 5579 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5580 */ 5581 static boolean_t 5582 ibd_send(ibd_state_t *state, mblk_t *mp) 5583 { 5584 ibd_ace_t *ace; 5585 ibd_swqe_t *node; 5586 ipoib_mac_t *dest; 5587 ib_header_info_t *ipibp; 5588 ip6_t *ip6h; 5589 uint_t pktsize; 5590 uint32_t mss; 5591 uint32_t hckflags; 5592 uint32_t lsoflags = 0; 5593 uint_t lsohdr_sz = 0; 5594 int ret, len; 5595 boolean_t dofree = B_FALSE; 5596 boolean_t rc; 5597
5598 /* 5599 * If we aren't done with the device initialization and start, 5600 * we shouldn't be here. 5601 */ 5602 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5603 return (B_FALSE); 5604
5605 node = NULL; 5606 if (ibd_acquire_swqe(state, &node) != 0) { 5607 /* 5608 * If we don't have an swqe available, schedule a transmit 5609 * completion queue cleanup and hold off on sending 5610 * more packets until we have some free swqes 5611 */ 5612 if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0) 5613 return (B_FALSE); 5614
5615 /* 5616 * If a poll cannot be scheduled, we have no choice but 5617 * to drop this packet 5618 */ 5619 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5620 return (B_TRUE); 5621 } 5622
5623 /* 5624 * Initialize the commonly used fields in swqe to NULL to protect 5625 * against ibd_tx_cleanup accidentally misinterpreting these on a 5626 * failure. 5627 */ 5628 node->swqe_im_mblk = NULL; 5629 node->w_swr.wr_nds = 0; 5630 node->w_swr.wr_sgl = NULL; 5631 node->w_swr.wr_opcode = IBT_WRC_SEND; 5632
5633 /* 5634 * Obtain an address handle for the destination. 5635 */ 5636 ipibp = (ib_header_info_t *)mp->b_rptr; 5637 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5638 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5639 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5640
5641 pktsize = msgsize(mp); 5642
5643 atomic_add_64(&state->id_xmt_bytes, pktsize); 5644 atomic_inc_64(&state->id_xmt_pkt); 5645 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5646 atomic_inc_64(&state->id_brd_xmt); 5647 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5648 atomic_inc_64(&state->id_multi_xmt); 5649
5650 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 5651 node->w_ahandle = ace; 5652 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5653 } else { 5654 DPRINT(5, 5655 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5656 ((ret == EFAULT) ? "failed" : "queued"), 5657 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5658 htonl(dest->ipoib_gidpref[1]), 5659 htonl(dest->ipoib_gidsuff[0]), 5660 htonl(dest->ipoib_gidsuff[1])); 5661 node->w_ahandle = NULL; 5662
5663 /* 5664 * In poll mode, there are probably CQEs pending in the 5665 * CQ, so ibd has to poll the CQ here; otherwise the acache 5666 * entry may never be recycled.
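 * (It is ibd_tx_cleanup(), run when send completions are reaped,
 * that drops the AH reference and lets the acache entry move back
 * to the free list; see ibd_tx_cleanup() further below.)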
5667 */ 5668 if (ibd_txcomp_poll == 1) 5669 ibd_poll_compq(state, state->id_scq_hdl); 5670
5671 /* 5672 * If ibd_acache_lookup() returns EFAULT, it means ibd 5673 * cannot find a path for the specified dest address, so we 5674 * should drop this packet. We should also drop the packet 5675 * if we cannot schedule a poll via the 5676 * async thread. In the normal case, ibd returns the 5677 * packet to the upper layer and waits for the AH to be created. 5678 * 5679 * Note that we always queue a work slot entry for the async 5680 * thread when we fail AH lookup (even in intr mode); this is 5681 * due to the convoluted way the code currently looks for AH. 5682 */ 5683 if (ret == EFAULT) { 5684 dofree = B_TRUE; 5685 rc = B_TRUE; 5686 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5687 dofree = B_TRUE; 5688 rc = B_TRUE; 5689 } else { 5690 dofree = B_FALSE; 5691 rc = B_FALSE; 5692 } 5693 goto ibd_send_fail; 5694 } 5695
5696 /* 5697 * For ND6 packets, padding is at the front of the source lladdr. 5698 * Insert the padding at front. 5699 */ 5700 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 5701 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 5702 if (!pullupmsg(mp, IPV6_HDR_LEN + 5703 sizeof (ib_header_info_t))) { 5704 DPRINT(10, "ibd_send: pullupmsg failure "); 5705 dofree = B_TRUE; 5706 rc = B_TRUE; 5707 goto ibd_send_fail; 5708 } 5709 ipibp = (ib_header_info_t *)mp->b_rptr; 5710 } 5711 ip6h = (ip6_t *)((uchar_t *)ipibp + 5712 sizeof (ib_header_info_t)); 5713 len = ntohs(ip6h->ip6_plen); 5714 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5715 mblk_t *pad; 5716
5717 pad = allocb(4, 0); if (pad == NULL) { /* allocb can fail under memory pressure */ DPRINT(10, "ibd_send: allocb failure "); dofree = B_TRUE; rc = B_TRUE; goto ibd_send_fail; } 5718 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 5719 linkb(mp, pad); 5720 if (MBLKL(mp) < sizeof (ib_header_info_t) + 5721 IPV6_HDR_LEN + len + 4) { 5722 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 5723 IPV6_HDR_LEN + len + 4)) { 5724 DPRINT(10, "ibd_send: pullupmsg " 5725 "failure "); 5726 dofree = B_TRUE; 5727 rc = B_TRUE; 5728 goto ibd_send_fail; 5729 } 5730 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5731 sizeof (ib_header_info_t)); 5732 } 5733
5734 /* LINTED: E_CONSTANT_CONDITION */ 5735 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5736 } 5737 } 5738
5739 mp->b_rptr += sizeof (ib_addrs_t); 5740
5741 /* 5742 * Do LSO and checksum related work here. For an LSO send, set the 5743 * ud destination, the opcode and the LSO header information in the 5744 * work request. 5745 */ 5746 lso_info_get(mp, &mss, &lsoflags); 5747 if ((lsoflags & HW_LSO) != HW_LSO) { 5748 node->w_swr.wr_opcode = IBT_WRC_SEND; 5749 lsohdr_sz = 0; 5750 } else { 5751 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 5752 /* 5753 * The routine can only fail if there's no memory; we 5754 * can only drop the packet if this happens 5755 */ 5756 ibd_print_warn(state, 5757 "ibd_send: no memory, lso posting failed"); 5758 dofree = B_TRUE; 5759 rc = B_TRUE; 5760 goto ibd_send_fail; 5761 } 5762
5763 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 5764 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 5765 } 5766
5767 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 5768 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 5769 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 5770 else 5771 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 5772
5773 /* 5774 * Prepare the sgl for posting; the routine can only fail if there's 5775 * no lso buf available for posting. If this is the case, we should 5776 * probably reschedule for lso bufs to become available and then try again.
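 *
 * To summarize, ibd_prepare_sgl() picks one of three buffer strategies
 * for this wqe: IBD_WQE_MAPPED (ibt_map_mem_iov() of the mblk chain),
 * IBD_WQE_TXBUF (bcopy into the swqe's pre-mapped copybuf) or
 * IBD_WQE_LSOBUF (bcopy into pre-mapped LSO buffers); only the last of
 * these can fail, and only for want of LSO buffers.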
5777 */ 5778 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 5779 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 5780 dofree = B_TRUE; 5781 rc = B_TRUE; 5782 } else { 5783 dofree = B_FALSE; 5784 rc = B_FALSE; 5785 } 5786 goto ibd_send_fail; 5787 } 5788 node->swqe_im_mblk = mp; 5789 5790 /* 5791 * Queue the wqe to hardware; since we can now simply queue a 5792 * post instead of doing it serially, we cannot assume anything 5793 * about the 'node' after ibd_post_send() returns. 5794 */ 5795 ibd_post_send(state, node); 5796 5797 return (B_TRUE); 5798 5799 ibd_send_fail: 5800 if (node && mp) 5801 ibd_free_lsohdr(node, mp); 5802 5803 if (dofree) 5804 freemsg(mp); 5805 5806 if (node != NULL) 5807 ibd_tx_cleanup(state, node); 5808 5809 return (rc); 5810 } 5811 5812 /* 5813 * GLDv3 entry point for transmitting datagram. 5814 */ 5815 static mblk_t * 5816 ibd_m_tx(void *arg, mblk_t *mp) 5817 { 5818 ibd_state_t *state = (ibd_state_t *)arg; 5819 mblk_t *next; 5820 5821 if (state->id_link_state != LINK_STATE_UP) { 5822 freemsgchain(mp); 5823 mp = NULL; 5824 } 5825 5826 while (mp != NULL) { 5827 next = mp->b_next; 5828 mp->b_next = NULL; 5829 if (ibd_send(state, mp) == B_FALSE) { 5830 /* Send fail */ 5831 mp->b_next = next; 5832 break; 5833 } 5834 mp = next; 5835 } 5836 5837 return (mp); 5838 } 5839 5840 /* 5841 * this handles Tx and Rx completions. With separate CQs, this handles 5842 * only Rx completions. 5843 */ 5844 static uint_t 5845 ibd_intr(char *arg) 5846 { 5847 ibd_state_t *state = (ibd_state_t *)arg; 5848 5849 ibd_poll_compq(state, state->id_rcq_hdl); 5850 5851 return (DDI_INTR_CLAIMED); 5852 } 5853 5854 /* 5855 * Poll and drain the cq 5856 */ 5857 static uint_t 5858 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, 5859 uint_t numwcs) 5860 { 5861 ibd_wqe_t *wqe; 5862 ibt_wc_t *wc; 5863 uint_t total_polled = 0; 5864 uint_t num_polled; 5865 int i; 5866 5867 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5868 total_polled += num_polled; 5869 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5870 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5871 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5872 (wqe->w_type == IBD_WQE_RECV)); 5873 if (wc->wc_status != IBT_WC_SUCCESS) { 5874 /* 5875 * Channel being torn down. 5876 */ 5877 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5878 DPRINT(5, "ibd_drain_cq: flush error"); 5879 /* 5880 * Only invoke the Tx handler to 5881 * release possibly held resources 5882 * like AH refcount etc. Can not 5883 * invoke Rx handler because it might 5884 * try adding buffers to the Rx pool 5885 * when we are trying to deinitialize. 5886 */ 5887 if (wqe->w_type == IBD_WQE_RECV) { 5888 continue; 5889 } else { 5890 DPRINT(10, "ibd_drain_cq: Bad " 5891 "status %d", wc->wc_status); 5892 } 5893 } 5894 } 5895 if (wqe->w_type == IBD_WQE_SEND) { 5896 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); 5897 } else { 5898 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5899 } 5900 } 5901 } 5902 5903 return (total_polled); 5904 } 5905 5906 /* 5907 * Common code for interrupt handling as well as for polling 5908 * for all completed wqe's while detaching. 
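 *
 * The id_cq_poll_busy bits below serialize pollers of a given CQ: a
 * caller that finds the 'busy' flag already set just sets the matching
 * 'redo' flag and returns, while the active poller notices the redo
 * bit and drains the CQ once more before finally clearing 'busy'.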
5909 */ 5910 static void 5911 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5912 { 5913 ibt_wc_t *wcs; 5914 uint_t numwcs; 5915 int flag, redo_flag; 5916 int redo = 1; 5917 uint_t num_polled = 0; 5918 5919 if (ibd_separate_cqs == 1) { 5920 if (cq_hdl == state->id_rcq_hdl) { 5921 flag = IBD_RX_CQ_POLLING; 5922 redo_flag = IBD_REDO_RX_CQ_POLLING; 5923 } else { 5924 flag = IBD_TX_CQ_POLLING; 5925 redo_flag = IBD_REDO_TX_CQ_POLLING; 5926 } 5927 } else { 5928 flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING; 5929 redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING; 5930 } 5931 5932 mutex_enter(&state->id_cq_poll_lock); 5933 if (state->id_cq_poll_busy & flag) { 5934 state->id_cq_poll_busy |= redo_flag; 5935 mutex_exit(&state->id_cq_poll_lock); 5936 return; 5937 } 5938 state->id_cq_poll_busy |= flag; 5939 mutex_exit(&state->id_cq_poll_lock); 5940 5941 /* 5942 * In some cases (eg detaching), this code can be invoked on 5943 * any cpu after disabling cq notification (thus no concurrency 5944 * exists). Apart from that, the following applies normally: 5945 * The receive completion handling is always on the Rx interrupt 5946 * cpu. Transmit completion handling could be from any cpu if 5947 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5948 * is interrupt driven. Combined completion handling is always 5949 * on the interrupt cpu. Thus, lock accordingly and use the 5950 * proper completion array. 5951 */ 5952 if (ibd_separate_cqs == 1) { 5953 if (cq_hdl == state->id_rcq_hdl) { 5954 wcs = state->id_rxwcs; 5955 numwcs = state->id_rxwcs_size; 5956 } else { 5957 wcs = state->id_txwcs; 5958 numwcs = state->id_txwcs_size; 5959 } 5960 } else { 5961 wcs = state->id_rxwcs; 5962 numwcs = state->id_rxwcs_size; 5963 } 5964 5965 /* 5966 * Poll and drain the CQ 5967 */ 5968 num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5969 5970 /* 5971 * Enable CQ notifications and redrain the cq to catch any 5972 * completions we might have missed after the ibd_drain_cq() 5973 * above and before the ibt_enable_cq_notify() that follows. 5974 * Finally, service any new requests to poll the cq that 5975 * could've come in after the ibt_enable_cq_notify(). 5976 */ 5977 do { 5978 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 5979 IBT_SUCCESS) { 5980 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5981 } 5982 5983 num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5984 5985 mutex_enter(&state->id_cq_poll_lock); 5986 if (state->id_cq_poll_busy & redo_flag) 5987 state->id_cq_poll_busy &= ~redo_flag; 5988 else { 5989 state->id_cq_poll_busy &= ~flag; 5990 redo = 0; 5991 } 5992 mutex_exit(&state->id_cq_poll_lock); 5993 5994 } while (redo); 5995 5996 /* 5997 * If we polled the receive cq and found anything, we need to flush 5998 * it out to the nw layer here. 5999 */ 6000 if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) { 6001 ibd_flush_rx(state, NULL); 6002 } 6003 } 6004 6005 /* 6006 * Unmap the memory area associated with a given swqe. 
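 * This undoes the ibt_map_mem_iov() mapping set up by ibd_prepare_sgl()
 * for the IBD_WQE_MAPPED case.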
6007 */ 6008 static void 6009 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6010 { 6011 ibt_status_t stat; 6012
6013 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6014
6015 if (swqe->w_mi_hdl) { 6016 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6017 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6018 DPRINT(10, 6019 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6020 } 6021 swqe->w_mi_hdl = NULL; 6022 } 6023 swqe->w_swr.wr_nds = 0; 6024 } 6025
6026 /* 6027 * Common code that deals with cleanups after a successful or 6028 * erroneous transmission attempt. 6029 */ 6030 static void 6031 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6032 { 6033 ibd_ace_t *ace = swqe->w_ahandle; 6034
6035 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6036
6037 /* 6038 * If this was a dynamic mapping in ibd_send(), we need to 6039 * unmap here. If this was an lso buffer we'd used for sending, 6040 * we need to release the lso buf to the pool, since the resource 6041 * is scarce. However, if this was simply a normal send using 6042 * the copybuf (present in each swqe), we don't need to release it. 6043 */ 6044 if (swqe->swqe_im_mblk != NULL) { 6045 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6046 ibd_unmap_mem(state, swqe); 6047 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6048 ibd_release_lsobufs(state, 6049 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6050 } 6051 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6052 freemsg(swqe->swqe_im_mblk); 6053 swqe->swqe_im_mblk = NULL; 6054 } 6055
6056 /* 6057 * Drop the reference count on the AH; it can be reused 6058 * now for a different destination if there are no more 6059 * posted sends that will use it. This can be eliminated 6060 * if we can always associate each Tx buffer with an AH. 6061 * The ace can be null if we are cleaning up from the 6062 * ibd_send() error path. 6063 */ 6064 if (ace != NULL) { 6065 /* 6066 * The recycling logic can be eliminated from here 6067 * and put into the async thread if we create another 6068 * list to hold ACE's for unjoined mcg's. 6069 */ 6070 if (DEC_REF_DO_CYCLE(ace)) { 6071 ibd_mce_t *mce; 6072
6073 /* 6074 * Check with the lock taken: we decremented the 6075 * reference count without the lock, and some 6076 * transmitter might already have bumped the 6077 * reference count (possible in case of multicast 6078 * disable when we leave the AH on the active 6079 * list). If it is not still 0, get out, leaving the 6080 * recycle bit intact. 6081 * 6082 * Atomically transition the AH from active 6083 * to free list, and queue a work request to 6084 * leave the group and destroy the mce. No 6085 * transmitter can be looking at the AH or 6086 * the MCE in between, since we have the 6087 * ac_mutex lock. In the SendOnly reap case, 6088 * it is not necessary to hold the ac_mutex 6089 * and recheck the ref count (since the AH was 6090 * taken off the active list), we just do it 6091 * to have uniform processing with the Full 6092 * reap case. 6093 */ 6094 mutex_enter(&state->id_ac_mutex); 6095 mce = ace->ac_mce; 6096 if (GET_REF_CYCLE(ace) == 0) { 6097 CLEAR_REFCYCLE(ace); 6098 /* 6099 * Identify the case of fullmember reap as 6100 * opposed to mcg trap reap. Also, port up 6101 * might set ac_mce to NULL to indicate Tx 6102 * cleanup should do no more than put the 6103 * AH in the free list (see ibd_async_link). 6104 */ 6105 if (mce != NULL) { 6106 ace->ac_mce = NULL; 6107 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6108 /* 6109 * mc_req was initialized at mce 6110 * creation time.
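 * Queueing it with IBD_ASYNC_REAP asks the
 * async thread to leave the mcg and destroy
 * the mce on our behalf.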
6111 */ 6112 ibd_queue_work_slot(state, 6113 &mce->mc_req, IBD_ASYNC_REAP); 6114 } 6115 IBD_ACACHE_INSERT_FREE(state, ace); 6116 } 6117 mutex_exit(&state->id_ac_mutex); 6118 } 6119 } 6120
6121 /* 6122 * Release the send wqe for reuse. 6123 */ 6124 ibd_release_swqe(state, swqe); 6125 } 6126
6127 /* 6128 * Hand off the processed rx mp chain to mac_rx() 6129 */ 6130 static void 6131 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc) 6132 { 6133 if (mpc == NULL) { 6134 mutex_enter(&state->id_rx_lock); 6135
6136 mpc = state->id_rx_mp; 6137
6138 state->id_rx_mp = NULL; 6139 state->id_rx_mp_tail = NULL; 6140 state->id_rx_mp_len = 0; 6141
6142 mutex_exit(&state->id_rx_lock); 6143 } 6144
6145 if (mpc) { 6146 mac_rx(state->id_mh, state->id_rh, mpc); 6147 } 6148 } 6149
6150 /* 6151 * Processing to be done after receipt of a packet; hand off to GLD 6152 * in the format expected by GLD. The received packet has this 6153 * format: 2b sap :: 00 :: data. 6154 */ 6155 static void 6156 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6157 { 6158 ib_header_info_t *phdr; 6159 mblk_t *mp; 6160 mblk_t *mpc = NULL; 6161 ipoib_hdr_t *ipibp; 6162 ipha_t *iphap; 6163 ip6_t *ip6h; 6164 int rxcnt, len; 6165
6166 /* 6167 * Track number handed to upper layer, and number still 6168 * available to receive packets. 6169 */ 6170 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 6171 ASSERT(rxcnt >= 0); 6172 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 6173
6174 /* 6175 * Adjust write pointer depending on how much data came in. 6176 */ 6177 mp = rwqe->rwqe_im_mblk; 6178 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 6179
6180 /* 6181 * Make sure this is NULL or we're in trouble. 6182 */ 6183 if (mp->b_next != NULL) { 6184 ibd_print_warn(state, 6185 "ibd_process_rx: got duplicate mp from rcq?"); 6186 mp->b_next = NULL; 6187 } 6188
6189 /* 6190 * The IB link will deliver one of the IB link layer 6191 * headers, called the Global Routing Header (GRH). 6192 * The ibd driver uses the information in the GRH to build the 6193 * Header_info structure and passes it with the datagram up 6194 * to GLDv3. 6195 * If the GRH is not valid, indicate this to GLDv3 by setting 6196 * the VerTcFlow field to 0. 6197 */ 6198 phdr = (ib_header_info_t *)mp->b_rptr; 6199 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6200 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6201
6202 /* if it is a loopback packet, just drop it. */ 6203 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6204 IPOIB_ADDRL) == 0) { 6205 freemsg(mp); 6206 return; 6207 } 6208
6209 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6210 sizeof (ipoib_mac_t)); 6211 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6212 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6213 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6214 } else { 6215 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6216 } 6217 } else { 6218 /* 6219 * It cannot be an IBA multicast packet. It must have been 6220 * unicast to us. Just copy the interface address to dst. 6221 */ 6222 phdr->ib_grh.ipoib_vertcflow = 0; 6223 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6224 sizeof (ipoib_mac_t)); 6225 } 6226
6227 /* 6228 * For ND6 packets, padding is at the front of the source/target 6229 * lladdr. However, the inet6 layer is not aware of it, hence remove 6230 * the padding from such packets.
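 * (This mirrors the send side in ibd_send() above, where
 * IBD_PAD_NSNA(ip6h, len, IBD_SEND) inserts the padding; here
 * IBD_PAD_NSNA(ip6h, len, IBD_RECV) strips it.)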
6231 */ 6232 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6233 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 6234 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 6235 if (!pullupmsg(mp, IPV6_HDR_LEN + 6236 sizeof (ipoib_hdr_t))) { 6237 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 6238 freemsg(mp); 6239 return; 6240 } 6241 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + 6242 sizeof (ipoib_pgrh_t)); 6243 } 6244 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6245 len = ntohs(ip6h->ip6_plen); 6246 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6247 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 6248 IPV6_HDR_LEN + len) { 6249 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 6250 IPV6_HDR_LEN + len)) { 6251 DPRINT(10, "ibd_process_rx: pullupmsg" 6252 " failed"); 6253 freemsg(mp); 6254 return; 6255 } 6256 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 6257 sizeof (ipoib_pgrh_t) + 6258 sizeof (ipoib_hdr_t)); 6259 } 6260 /* LINTED: E_CONSTANT_CONDITION */ 6261 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6262 } 6263 } 6264 6265 /* 6266 * Update statistics 6267 */ 6268 atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer); 6269 atomic_inc_64(&state->id_rcv_pkt); 6270 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6271 atomic_inc_64(&state->id_brd_rcv); 6272 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6273 atomic_inc_64(&state->id_multi_rcv); 6274 6275 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6276 /* 6277 * Set receive checksum status in mp 6278 * Hardware checksumming can be considered valid only if: 6279 * 1. CQE.IP_OK bit is set 6280 * 2. CQE.CKSUM = 0xffff 6281 * 3. IPv6 routing header is not present in the packet 6282 * 4. If there are no IP_OPTIONS in the IP HEADER 6283 */ 6284 6285 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6286 (wc->wc_cksum == 0xFFFF) && 6287 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6288 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 6289 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 6290 } 6291 6292 /* 6293 * Add this mp to the list of processed mp's to send to 6294 * the nw layer 6295 */ 6296 mutex_enter(&state->id_rx_lock); 6297 if (state->id_rx_mp) { 6298 ASSERT(state->id_rx_mp_tail != NULL); 6299 state->id_rx_mp_tail->b_next = mp; 6300 } else { 6301 ASSERT(state->id_rx_mp_tail == NULL); 6302 state->id_rx_mp = mp; 6303 } 6304 6305 state->id_rx_mp_tail = mp; 6306 state->id_rx_mp_len++; 6307 6308 if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) { 6309 mpc = state->id_rx_mp; 6310 6311 state->id_rx_mp = NULL; 6312 state->id_rx_mp_tail = NULL; 6313 state->id_rx_mp_len = 0; 6314 } 6315 6316 mutex_exit(&state->id_rx_lock); 6317 6318 if (mpc) { 6319 ibd_flush_rx(state, mpc); 6320 } 6321 } 6322 6323 /* 6324 * Callback code invoked from STREAMs when the receive data buffer is 6325 * free for recycling. 6326 */ 6327 static void 6328 ibd_freemsg_cb(char *arg) 6329 { 6330 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6331 ibd_state_t *state = rwqe->w_state; 6332 6333 /* 6334 * If the wqe is being destructed, do not attempt recycling. 6335 */ 6336 if (rwqe->w_freeing_wqe == B_TRUE) { 6337 DPRINT(6, "ibd_freemsg: wqe being freed"); 6338 return; 6339 } else { 6340 /* 6341 * Upper layer has released held mblk, so we have 6342 * no more use for keeping the old pointer in 6343 * our rwqe. 
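 * Below, the same copy buffer is wrapped in a fresh mblk via
 * desballoc() and the rwqe is reposted; if either step fails, the
 * rwqe is deleted and freed instead of being recycled.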
6344 */ 6345 rwqe->rwqe_im_mblk = NULL; 6346 } 6347 6348 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6349 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6350 if (rwqe->rwqe_im_mblk == NULL) { 6351 ibd_delete_rwqe(state, rwqe); 6352 ibd_free_rwqe(state, rwqe); 6353 DPRINT(6, "ibd_freemsg: desballoc failed"); 6354 return; 6355 } 6356 6357 if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) { 6358 ibd_delete_rwqe(state, rwqe); 6359 ibd_free_rwqe(state, rwqe); 6360 return; 6361 } 6362 6363 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 6364 } 6365 6366 static uint_t 6367 ibd_tx_recycle(char *arg) 6368 { 6369 ibd_state_t *state = (ibd_state_t *)arg; 6370 6371 /* 6372 * Poll for completed entries 6373 */ 6374 ibd_poll_compq(state, state->id_scq_hdl); 6375 6376 /* 6377 * Resume any blocked transmissions if possible 6378 */ 6379 (void) ibd_resume_transmission(state); 6380 6381 return (DDI_INTR_CLAIMED); 6382 } 6383 6384 #ifdef IBD_LOGGING 6385 static void 6386 ibd_log_init(void) 6387 { 6388 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 6389 ibd_lbuf_ndx = 0; 6390 6391 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 6392 } 6393 6394 static void 6395 ibd_log_fini(void) 6396 { 6397 if (ibd_lbuf) 6398 kmem_free(ibd_lbuf, IBD_LOG_SZ); 6399 ibd_lbuf_ndx = 0; 6400 ibd_lbuf = NULL; 6401 6402 mutex_destroy(&ibd_lbuf_lock); 6403 } 6404 6405 static void 6406 ibd_log(const char *fmt, ...) 6407 { 6408 va_list ap; 6409 uint32_t off; 6410 uint32_t msglen; 6411 char tmpbuf[IBD_DMAX_LINE]; 6412 6413 if (ibd_lbuf == NULL) 6414 return; 6415 6416 va_start(ap, fmt); 6417 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 6418 va_end(ap); 6419 6420 if (msglen >= IBD_DMAX_LINE) 6421 msglen = IBD_DMAX_LINE - 1; 6422 6423 mutex_enter(&ibd_lbuf_lock); 6424 6425 off = ibd_lbuf_ndx; /* current msg should go here */ 6426 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6427 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6428 6429 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6430 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6431 6432 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6433 ibd_lbuf_ndx = 0; 6434 6435 mutex_exit(&ibd_lbuf_lock); 6436 6437 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6438 } 6439 #endif 6440