/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>          /* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>      /* for offsetof */
#include <sys/disp.h>           /* for async thread pri */
#include <sys/atomic.h>         /* for atomic_add*() */
#include <sys/ethernet.h>       /* for ETHERTYPE_IPV6 */
#include <netinet/in.h>         /* for netinet/ip.h below */
#include <netinet/ip.h>         /* for struct ip */
#include <netinet/udp.h>        /* for struct udphdr */
#include <inet/common.h>        /* for inet/ip.h below */
#include <inet/ip.h>            /* for ipha_t */
#include <inet/ip6.h>           /* for ip6_t */
#include <inet/tcp.h>           /* for tcph_t */
#include <netinet/icmp6.h>      /* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>       /* for ibd_get_portspeed */

/*
 * Per-interface tunables (for developers)
 *
 * ibd_tx_copy_thresh
 *     This sets the threshold at which ibd will attempt to do a bcopy of the
 *     outgoing data into a pre-mapped buffer. The IPoIB driver's send
 *     behavior is restricted by various parameters, so this value should be
 *     changed only after careful consideration. For instance, IB HCAs
 *     currently impose a relatively small limit (when compared to ethernet
 *     NICs) on the length of the SGL for transmit. On the other hand, the ip
 *     stack could send down mp chains that are quite long when LSO is
 *     enabled.
 *
 * ibd_num_swqe
 *     Number of "send WQE" elements that will be allocated and used by ibd.
 *     When tuning this parameter, the size of the pre-allocated, pre-mapped
 *     copy buffer in each of these send wqes must be taken into account. This
 *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
 *     currently set to the same value as ibd_tx_copy_thresh, but may be
 *     changed independently if needed).
 *
 * ibd_num_rwqe
 *     Number of "receive WQE" elements that will be allocated and used by
 *     ibd. This parameter is limited by the maximum channel size of the HCA.
 *     Each buffer in the receive wqe will be of MTU size.
 *
 * ibd_num_lso_bufs
 *     Number of "larger-than-MTU" copy buffers to use for cases when the
 *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
 *     and too large to be used with regular MTU-sized copy buffers. It is
 *     not recommended to tune this variable without understanding the
 *     application environment and/or memory resources. The size of each of
 *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
 *
 * ibd_num_ah
 *     Number of AH cache entries to allocate
 *
 * ibd_hash_size
 *     Hash table size for the active AH list
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers are expensive. These are enabled
 *     by default.
 *
 * ibd_log_sz
 *     This specifies the size of the ibd log buffer in bytes. The buffer is
 *     allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_tx_copy_thresh = 0x1000;
uint_t ibd_num_swqe = 4000;
uint_t ibd_num_rwqe = 4000;
uint_t ibd_num_lso_bufs = 0x400;
uint_t ibd_num_ah = 256;
uint_t ibd_hash_size = 32;
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;
uint_t ibd_create_broadcast_group = 1;
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#define IBD_TX_COPY_THRESH              ibd_tx_copy_thresh
#define IBD_TX_BUF_SZ                   ibd_tx_copy_thresh
#define IBD_NUM_SWQE                    ibd_num_swqe
#define IBD_NUM_RWQE                    ibd_num_rwqe
#define IBD_NUM_LSO_BUFS                ibd_num_lso_bufs
#define IBD_NUM_AH                      ibd_num_ah
#define IBD_HASH_SIZE                   ibd_hash_size
#ifdef IBD_LOGGING
#define IBD_LOG_SZ                      ibd_log_sz
#endif

/*
 * Receive CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_rxcomp_count = 4;
uint_t ibd_rxcomp_usec = 10;

/*
 * Send CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_txcomp_count = 16;
uint_t ibd_txcomp_usec = 300;

/*
 * Thresholds
 *
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how long to wait before informing
 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
 * determines how low the available swqes should go before we start polling
 * the completion queue.
 */
#define IBD_FREE_LSOS_THRESH            8
#define IBD_FREE_SWQES_THRESH           20
#define IBD_TX_POLL_THRESH              80

/*
 * When doing multiple-send-wr, this value determines how many to do at
 * a time (in a single ibt_post_send).
 */
#define IBD_MAX_TX_POST_MULTIPLE        4

/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define IBD_RX_POST_CNT                 8

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define IBD_LOG_RX_POST                 4

/* Minimum number of receive work requests driver needs to always have */
#define IBD_RWQE_MIN    ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * Maximum length for returning chained mps back to crossbow.
 * Also used as the maximum number of rx wc's polled at a time.
 */
#define IBD_MAX_RX_MP_LEN               16

/*
 * LSO parameters
 */
#define IBD_LSO_MAXLEN                  65536
#define IBD_LSO_BUFSZ                   8192
#define IBD_PROP_LSO_POLICY             "lso-policy"

/*
 * Completion queue polling control
 */
#define IBD_CQ_POLLING                  0x1
#define IBD_REDO_CQ_POLLING             0x2

/*
 * Flag bits for resources to reap
 */
#define IBD_RSRC_SWQE                   0x1
#define IBD_RSRC_LSOBUF                 0x2

/*
 * Async operation types
 */
#define IBD_ASYNC_GETAH                 1
#define IBD_ASYNC_JOIN                  2
#define IBD_ASYNC_LEAVE                 3
#define IBD_ASYNC_PROMON                4
#define IBD_ASYNC_PROMOFF               5
#define IBD_ASYNC_REAP                  6
#define IBD_ASYNC_TRAP                  7
#define IBD_ASYNC_SCHED                 8
#define IBD_ASYNC_LINK                  9
#define IBD_ASYNC_EXIT                  10

/*
 * Async operation states
 */
#define IBD_OP_NOTSTARTED               0
#define IBD_OP_ONGOING                  1
#define IBD_OP_COMPLETED                2
#define IBD_OP_ERRORED                  3
#define IBD_OP_ROUTERED                 4

/*
 * State of IBD driver initialization during attach/m_start
 */
#define IBD_DRV_STATE_INITIALIZED       0x00001
#define IBD_DRV_RXINTR_ADDED            0x00002
#define IBD_DRV_TXINTR_ADDED            0x00004
#define IBD_DRV_IBTL_ATTACH_DONE        0x00008
#define IBD_DRV_HCA_OPENED              0x00010
#define IBD_DRV_PD_ALLOCD               0x00020
#define IBD_DRV_MAC_REGISTERED          0x00040
#define IBD_DRV_PORT_DETAILS_OBTAINED   0x00080
#define IBD_DRV_BCAST_GROUP_FOUND       0x00100
#define IBD_DRV_ACACHE_INITIALIZED      0x00200
#define IBD_DRV_CQS_ALLOCD              0x00400
#define IBD_DRV_UD_CHANNEL_SETUP        0x00800
#define IBD_DRV_TXLIST_ALLOCD           0x01000
#define IBD_DRV_SCQ_NOTIFY_ENABLED      0x02000
#define IBD_DRV_RXLIST_ALLOCD           0x04000
#define IBD_DRV_BCAST_GROUP_JOINED      0x08000
#define IBD_DRV_ASYNC_THR_CREATED       0x10000
#define IBD_DRV_RCQ_NOTIFY_ENABLED      0x20000
#define IBD_DRV_SM_NOTICES_REGISTERED   0x40000
#define IBD_DRV_STARTED                 0x80000

/*
 * Start/stop in-progress flags; note that restart must always remain
 * the OR of start and stop flag values.
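 * For illustration, with the values defined below this means
 * IBD_DRV_RESTART_IN_PROGRESS == 0x10000000 | 0x20000000 == 0x30000000.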
 */
#define IBD_DRV_START_IN_PROGRESS       0x10000000
#define IBD_DRV_STOP_IN_PROGRESS        0x20000000
#define IBD_DRV_RESTART_IN_PROGRESS     0x30000000

/*
 * Miscellaneous constants
 */
#define IBD_SEND                        0
#define IBD_RECV                        1
#define IB_MGID_IPV4_LOWGRP_MASK        0xFFFFFFFF
#define IBD_DEF_MAX_SDU                 2044
#define IBD_DEFAULT_QKEY                0xB1B
#ifdef IBD_LOGGING
#define IBD_DMAX_LINE                   100
#endif

/*
 * Enumerations for link states
 */
typedef enum {
    IBD_LINK_DOWN,
    IBD_LINK_UP,
    IBD_LINK_UP_ABSENT
} ibd_link_op_t;

/*
 * Driver State Pointer
 */
void *ibd_list;

/*
 * Logging
 */
#ifdef IBD_LOGGING
kmutex_t ibd_lbuf_lock;
uint8_t *ibd_lbuf;
uint32_t ibd_lbuf_ndx;
#endif

/*
 * Required system entry points
 */
static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/*
 * Required driver entry points for GLDv3
 */
static int ibd_m_stat(void *, uint_t, uint64_t *);
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
static int ibd_m_unicst(void *, const uint8_t *);
static mblk_t *ibd_m_tx(void *, mblk_t *);
static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);

/*
 * Private driver entry points for GLDv3
 */

/*
 * Initialization
 */
static int ibd_state_init(ibd_state_t *, dev_info_t *);
static int ibd_init_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static int ibd_acache_init(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_init(void);
#endif

/*
 * Termination/cleanup
 */
static void ibd_state_fini(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
static void ibd_acache_fini(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_fini(void);
#endif

/*
 * Allocation/acquire/map routines
 */
static int ibd_alloc_tx_copybufs(ibd_state_t *);
static int ibd_alloc_rx_copybufs(ibd_state_t *);
static int ibd_alloc_tx_lsobufs(ibd_state_t *);
static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
    uint32_t *);

/*
 * Free/release/unmap routines
 */
static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_free_tx_copybufs(ibd_state_t *);
static void ibd_free_rx_copybufs(ibd_state_t *);
static void ibd_free_rx_rsrcs(ibd_state_t *);
static void ibd_free_tx_lsobufs(ibd_state_t *);
static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);

/*
 * Handlers/callback routines
 */
static uint_t ibd_intr(caddr_t);
static uint_t ibd_tx_recycle(caddr_t);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
static void
ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_freemsg_cb(char *);
static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
    ibt_subnet_event_code_t, ibt_subnet_event_t *);

/*
 * Send/receive routines
 */
static boolean_t ibd_send(ibd_state_t *, mblk_t *);
static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);

/*
 * Threads
 */
static void ibd_async_work(ibd_state_t *);

/*
 * Async tasks
 */
static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
static void ibd_async_txsched(ibd_state_t *);
static void ibd_async_link(ibd_state_t *, ibd_req_t *);

/*
 * Async task helpers
 */
static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
static boolean_t ibd_get_allroutergroup(ibd_state_t *,
    ipoib_mac_t *, ipoib_mac_t *);
static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
static ibt_status_t ibd_find_bgroup(ibd_state_t *);
static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
static uint64_t ibd_get_portspeed(ibd_state_t *);
static boolean_t ibd_async_safe(ibd_state_t *);
static void ibd_async_done(ibd_state_t *);
static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
static void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);

/*
 * Helpers for attach/start routines
 */
static int ibd_register_mac(ibd_state_t *, dev_info_t *);
static int ibd_record_capab(ibd_state_t *, dev_info_t *);
static int ibd_unattach(ibd_state_t *, dev_info_t *);
static int ibd_get_port_details(ibd_state_t *);
static int ibd_alloc_cqs(ibd_state_t *);
static int ibd_setup_ud_channel(ibd_state_t *);
static int ibd_start(ibd_state_t *);
static int ibd_undo_start(ibd_state_t *, link_state_t);
static void ibd_set_mac_progress(ibd_state_t *, uint_t);
static void ibd_clr_mac_progress(ibd_state_t *, uint_t);


/*
 * Miscellaneous helpers
 */
static int ibd_sched_poll(ibd_state_t *, int, int);
static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
static void
ibd_resume_transmission(ibd_state_t *);
static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
static void *list_get_head(list_t *);
static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
static void ibd_print_warn(ibd_state_t *, char *, ...);
#ifdef IBD_LOGGING
static void ibd_log(const char *, ...);
#endif

DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);

/* Module Driver Info */
static struct modldrv ibd_modldrv = {
    &mod_driverops,                     /* This one is a driver */
    "InfiniBand GLDv3 Driver",          /* short description */
    &ibd_dev_ops                        /* driver specific ops */
};

/* Module Linkage */
static struct modlinkage ibd_modlinkage = {
    MODREV_1, (void *)&ibd_modldrv, NULL
};

/*
 * Module (static) info passed to IBTL during ibt_attach
 */
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
    IBTI_V_CURR,
    IBT_NETWORK,
    ibd_async_handler,
    NULL,
    "IPIB"
};

/*
 * GLDv3 entry points
 */
#define IBD_M_CALLBACK_FLAGS    (MC_GETCAPAB)
static mac_callbacks_t ibd_m_callbacks = {
    IBD_M_CALLBACK_FLAGS,
    ibd_m_stat,
    ibd_m_start,
    ibd_m_stop,
    ibd_m_promisc,
    ibd_m_multicst,
    ibd_m_unicst,
    ibd_m_tx,
    NULL,
    ibd_m_getcapab
};

/*
 * Fill/clear <scope> and <p_key> in multicast/broadcast address
 */
#define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)         \
{                                                       \
    *(uint32_t *)((char *)(maddr) + 4) |=               \
        htonl((uint32_t)(scope) << 16);                 \
    *(uint32_t *)((char *)(maddr) + 8) |=               \
        htonl((uint32_t)(pkey) << 16);                  \
}

#define IBD_CLEAR_SCOPE_PKEY(maddr)                     \
{                                                       \
    *(uint32_t *)((char *)(maddr) + 4) &=               \
        htonl(~((uint32_t)0xF << 16));                  \
    *(uint32_t *)((char *)(maddr) + 8) &=               \
        htonl(~((uint32_t)0xFFFF << 16));               \
}

/*
 * Rudimentary debugging support
 */
#ifdef DEBUG
int ibd_debuglevel = 100;
static void
debug_print(int l, char *fmt, ...)
{
    va_list ap;

    if (l < ibd_debuglevel)
        return;
    va_start(ap, fmt);
    vcmn_err(CE_CONT, fmt, ap);
    va_end(ap);
}
#define DPRINT          debug_print
#else
#define DPRINT          0 &&
#endif

/*
 * Common routine to print warning messages; adds in hca guid, port number
 * and pkey to be able to identify the IBA interface.
 */
static void
ibd_print_warn(ibd_state_t *state, char *fmt, ...)
{
    ib_guid_t hca_guid;
    char ibd_print_buf[256];
    int len;
    va_list ap;

    hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
        0, "hca-guid", 0);
    len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
        "%s%d: HCA GUID %016llx port %d PKEY %02x ",
        ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
        (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
    va_start(ap, fmt);
    (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
        fmt, ap);
    cmn_err(CE_NOTE, "!%s", ibd_print_buf);
    va_end(ap);
}

/*
 * Warlock directives
 */

/*
 * id_lso_lock
 *
 * state->id_lso->bkt_nfree may be accessed without a lock to
 * determine the threshold at which we have to ask the nw layer
 * to resume transmission (see ibd_resume_transmission()).
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
    ibd_state_t::id_lso))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
_NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))

/*
 * id_scq_poll_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
    ibd_state_t::id_scq_poll_busy))

/*
 * id_txpost_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    ibd_state_t::id_tx_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    ibd_state_t::id_tx_busy))

/*
 * id_acache_req_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    ibd_state_t::id_acache_req_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    ibd_state_t::id_req_list))
_NOTE(SCHEME_PROTECTS_DATA("atomic",
    ibd_acache_s::ac_ref))

/*
 * id_ac_mutex
 *
 * This mutex is actually supposed to protect id_ah_op as well,
 * but this path of the code isn't clean (see update of id_ah_op
 * in ibd_async_acache(), immediately after the call to
 * ibd_async_mcache()). For now, we'll skip this check by
 * declaring that id_ah_op is protected by some internal scheme
 * that warlock isn't aware of.
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_active))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_free))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_addr))
_NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
    ibd_state_t::id_ah_op))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_error))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ac_hot_ace))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))

/*
 * id_mc_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    ibd_state_t::id_mc_full))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    ibd_state_t::id_mc_non))

/*
 * id_trap_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_stop))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_inprog))

/*
 * id_prom_op
 */
_NOTE(SCHEME_PROTECTS_DATA("only by async thread",
    ibd_state_t::id_prom_op))

/*
 * id_sched_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
    ibd_state_t::id_sched_needed))

/*
 * id_link_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
    ibd_state_t::id_link_state))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
    ibd_state_t::id_link_speed))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))

/*
 * id_tx_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_cnt))

/*
 * id_rx_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::id_rx_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::id_rx_list.dl_cnt))


/*
 * Items protected by atomic updates
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic update only",
    ibd_state_s::id_brd_rcv
    ibd_state_s::id_brd_xmt
    ibd_state_s::id_multi_rcv
    ibd_state_s::id_multi_xmt
    ibd_state_s::id_num_intrs
    ibd_state_s::id_rcv_bytes
    ibd_state_s::id_rcv_pkt
    ibd_state_s::id_rx_post_queue_index
    ibd_state_s::id_tx_short
    ibd_state_s::id_xmt_bytes
    ibd_state_s::id_xmt_pkt))

/*
 * Non-mutex protection schemes for data elements. Almost all of
 * these are non-shared items.
 */
_NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
    callb_cpr
    ib_gid_s
    ib_header_info
    ibd_acache_rq
    ibd_acache_s::ac_mce
    ibd_mcache::mc_fullreap
    ibd_mcache::mc_jstate
    ibd_mcache::mc_req
    ibd_rwqe_s
    ibd_swqe_s
    ibd_wqe_s
    ibt_wr_ds_s::ds_va
    ibt_wr_lso_s
    ipoib_mac::ipoib_qpn
    mac_capab_lso_s
    msgb::b_next
    msgb::b_rptr
    msgb::b_wptr
    ibd_state_s::id_bgroup_created
    ibd_state_s::id_mac_state
    ibd_state_s::id_mtu
    ibd_state_s::id_num_rwqe
    ibd_state_s::id_num_swqe
    ibd_state_s::id_qpnum
    ibd_state_s::id_rcq_hdl
    ibd_state_s::id_rx_buf_sz
    ibd_state_s::id_rx_bufs
    ibd_state_s::id_rx_mr_hdl
    ibd_state_s::id_rx_wqes
    ibd_state_s::id_rxwcs
    ibd_state_s::id_rxwcs_size
    ibd_state_s::id_rx_nqueues
    ibd_state_s::id_rx_queues
    ibd_state_s::id_scope
    ibd_state_s::id_scq_hdl
    ibd_state_s::id_tx_buf_sz
    ibd_state_s::id_tx_bufs
    ibd_state_s::id_tx_mr_hdl
    ibd_state_s::id_tx_rel_list.dl_cnt
    ibd_state_s::id_tx_wqes
    ibd_state_s::id_txwcs
    ibd_state_s::id_txwcs_size))

int
_init()
{
    int status;

    status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
        PAGESIZE), 0);
    if (status != 0) {
        DPRINT(10, "_init:failed in ddi_soft_state_init()");
        return (status);
    }

    mac_init_ops(&ibd_dev_ops, "ibd");
    status = mod_install(&ibd_modlinkage);
    if (status != 0) {
        DPRINT(10, "_init:failed in mod_install()");
        ddi_soft_state_fini(&ibd_list);
        mac_fini_ops(&ibd_dev_ops);
        return (status);
    }

#ifdef IBD_LOGGING
    ibd_log_init();
#endif
    return (0);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&ibd_modlinkage, modinfop));
}

int
_fini()
{
    int status;

    status = mod_remove(&ibd_modlinkage);
    if (status != 0)
        return (status);

    mac_fini_ops(&ibd_dev_ops);
    ddi_soft_state_fini(&ibd_list);
#ifdef IBD_LOGGING
    ibd_log_fini();
#endif
    return (0);
}

/*
 * Convert the GID part of the mac address from network byte order
 * to host order.
 */
static void
ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
{
    ib_sn_prefix_t nbopref;
    ib_guid_t nboguid;

    bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
    bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
    dgid->gid_prefix = b2h64(nbopref);
    dgid->gid_guid = b2h64(nboguid);
}

/*
 * Create the IPoIB address in network byte order from host order inputs.
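 *
 * For illustration (hypothetical values): a destination with QPN 0x001234,
 * GID prefix 0xfe80000000000000 and GUID 0x0002c90300012345 would be laid
 * out in the 20-byte ipoib_mac_t, in network byte order, roughly as
 *     00:00:12:34  fe:80:00:00:00:00:00:00  00:02:c9:03:00:01:23:45
 * i.e. the 4-byte QPN followed by the 8-byte GID prefix and 8-byte GUID.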
 */
static void
ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
    ib_guid_t guid)
{
    ib_sn_prefix_t nbopref;
    ib_guid_t nboguid;

    mac->ipoib_qpn = htonl(qpn);
    nbopref = h2b64(prefix);
    nboguid = h2b64(guid);
    bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
    bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
}

/*
 * Send to the appropriate all-routers group when the IBA multicast group
 * does not exist, based on whether the target group is v4 or v6.
 */
static boolean_t
ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
    ipoib_mac_t *rmac)
{
    boolean_t retval = B_TRUE;
    uint32_t adjscope = state->id_scope << 16;
    uint32_t topword;

    /*
     * Copy the first 4 bytes in without assuming any alignment of
     * input mac address; this will have IPoIB signature, flags and
     * scope bits.
     */
    bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
    topword = ntohl(topword);

    /*
     * Generate proper address for IPv4/v6, adding in the Pkey properly.
     */
    if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
        (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
        ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
            ((uint32_t)(state->id_pkey << 16))),
            (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
    else
        /*
         * Does not have proper bits in the mgid address.
         */
        retval = B_FALSE;

    return (retval);
}

/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding by default at the end. The routine that does this is
 * nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t.
 * So when the packet comes down from the IP layer to the IBD driver, it is
 * in the following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * where the option header is 2 bytes, followed by [22 bytes of ipoib_machdr].
 * As a result machdr is not 4 byte aligned and has 2 bytes of padding at
 * the end.
 *
 * The send routine in the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in machdr being 4 byte
 * aligned.
 *
 * On the receiving side, ibd_process_rx takes the above packet, removes
 * the two bytes of front padding and inserts them at the end. This is
 * because the IP layer does not understand padding at the front.
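 *
 * Schematically (A0..A19 is the 20-byte IPoIB address, PP the pad bytes),
 * the shuffle performed by IBD_PAD_NSNA below is roughly:
 *     as handed down by IP:   [nd_opt_hdr][A0 .. A19][PP PP]
 *     as sent on the wire:    [nd_opt_hdr][PP PP][A0 .. A19]
 * (the two pad bytes are zeroed), and the receive path undoes the shuffle.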
 */
#define IBD_PAD_NSNA(ip6h, len, type) {                                 \
    uchar_t         *nd_lla_ptr;                                        \
    icmp6_t         *icmp6;                                             \
    nd_opt_hdr_t    *opt;                                               \
    int             i;                                                  \
                                                                        \
    icmp6 = (icmp6_t *)&ip6h[1];                                        \
    len -= sizeof (nd_neighbor_advert_t);                               \
    if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||                  \
        (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&                   \
        (len != 0)) {                                                   \
        opt = (nd_opt_hdr_t *)((uint8_t *)ip6h                          \
            + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));            \
        ASSERT(opt != NULL);                                            \
        nd_lla_ptr = (uchar_t *)&opt[1];                                \
        if (type == IBD_SEND) {                                         \
            for (i = IPOIB_ADDRL; i > 0; i--)                           \
                *(nd_lla_ptr + i + 1) =                                 \
                    *(nd_lla_ptr + i - 1);                              \
        } else {                                                        \
            for (i = 0; i < IPOIB_ADDRL; i++)                           \
                *(nd_lla_ptr + i) =                                     \
                    *(nd_lla_ptr + i + 2);                              \
        }                                                               \
        *(nd_lla_ptr + i) = 0;                                          \
        *(nd_lla_ptr + i + 1) = 0;                                      \
    }                                                                   \
}

/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (ie the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
 */
#define IBD_ACACHE_INSERT_FREE(state, ce) \
    list_insert_head(&state->id_ah_free, ce)
#define IBD_ACACHE_GET_FREE(state) \
    list_get_head(&state->id_ah_free)
#define IBD_ACACHE_INSERT_ACTIVE(state, ce) {                   \
    int _ret_;                                                  \
    list_insert_head(&state->id_ah_active, ce);                 \
    _ret_ = mod_hash_insert(state->id_ah_active_hash,           \
        (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);       \
    ASSERT(_ret_ == 0);                                         \
    state->id_ac_hot_ace = ce;                                  \
}
#define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {                  \
    list_remove(&state->id_ah_active, ce);                      \
    if (state->id_ac_hot_ace == ce)                             \
        state->id_ac_hot_ace = NULL;                            \
    (void) mod_hash_remove(state->id_ah_active_hash,            \
        (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);       \
}
#define IBD_ACACHE_GET_ACTIVE(state) \
    list_get_head(&state->id_ah_active)

/*
 * Membership states for different mcg's are tracked by two lists:
 * the "non" list is used for promiscuous mode, when all mcg traffic
 * needs to be inspected. This type of membership is never used for
 * transmission, so there can not be an AH in the active list
 * corresponding to a member in this list.
 * This list does not need
 * any protection, since all operations are performed by the async
 * thread.
 *
 * "Full" and "SendOnly" membership is tracked using a single list,
 * the "full" list. This is because this single list can then be
 * searched during transmit to a multicast group (if an AH for the
 * mcg is not found in the active list), since at least one type
 * of membership must be present before initiating the transmit.
 * This list is also emptied during driver detach, since sendonly
 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Inserts/deletes to
 * this list are done only by the async thread, but it is also
 * searched in program context (see multicast disable case), thus
 * the id_mc_mutex protects the list. The driver detach path also
 * deconstructs the "full" list, but it ensures that the async
 * thread will not be accessing the list (by blocking out mcg
 * trap handling and making sure no more Tx reaping will happen).
 *
 * Currently, an IBA attach is done in the SendOnly case too,
 * although this is not required.
 */
#define IBD_MCACHE_INSERT_FULL(state, mce) \
    list_insert_head(&state->id_mc_full, mce)
#define IBD_MCACHE_INSERT_NON(state, mce) \
    list_insert_head(&state->id_mc_non, mce)
#define IBD_MCACHE_FIND_FULL(state, mgid) \
    ibd_mcache_find(mgid, &state->id_mc_full)
#define IBD_MCACHE_FIND_NON(state, mgid) \
    ibd_mcache_find(mgid, &state->id_mc_non)
#define IBD_MCACHE_PULLOUT_FULL(state, mce) \
    list_remove(&state->id_mc_full, mce)
#define IBD_MCACHE_PULLOUT_NON(state, mce) \
    list_remove(&state->id_mc_non, mce)

/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from active to free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take out of active list at multicast disable time.
 * 2. take out of active list only when last pending Tx completes.
 * For the MCE, we can
 * 3. take out of active list at multicast disable time.
 * 4. take out of active list only when last pending Tx completes.
 * 5. move from active list to stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
 * a stale list (#5) that would be checked in the enable code would need
 * to be implemented.
 * Option 2 is used, because otherwise, a Tx attempt
 * after the multicast disable would try to put an AH in the active list,
 * and associate the mce it finds in the active list with this new AH,
 * whereas the mce is already associated with the previous AH (taken off
 * the active list), and will be removed once the pending Tx's complete
 * (unless a reference count on mce's is implemented). One implication of
 * using 2,4 is that new Tx's posted before the pending Tx's complete will
 * grab new references on the AH, further delaying the leave.
 *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes through
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive a mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap.
 * Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that the max reference
 * count on any handle is limited by the number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 */
#define CYCLEVAL                0x80000
#define CLEAR_REFCYCLE(ace)     (ace)->ac_ref = 0
#define CYCLE_SET(ace)          (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define GET_REF(ace)            ((ace)->ac_ref)
#define GET_REF_CYCLE(ace) (                            \
    /*                                                  \
     * Make sure "cycle" bit is set.                    \
     */                                                 \
    ASSERT(CYCLE_SET(ace)),                             \
    ((ace)->ac_ref & ~(CYCLEVAL))                       \
)
#define INC_REF(ace, num) {                             \
    atomic_add_32(&(ace)->ac_ref, num);                 \
}
#define SET_CYCLE_IF_REF(ace) (                         \
    CYCLE_SET(ace) ? B_TRUE :                           \
        atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==     \
            CYCLEVAL ?                                  \
            /*                                          \
             * Clear the "cycle" bit we just set;       \
             * ref count known to be 0 from above.      \
             */                                         \
            CLEAR_REFCYCLE(ace), B_FALSE :              \
            /*                                          \
             * We set "cycle" bit; let caller know.     \
             */                                         \
            B_TRUE                                      \
)
#define DEC_REF_DO_CYCLE(ace) (                         \
    atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?        \
    /*                                                  \
     * Ref count known to be 0 from above.              \
     */                                                 \
    B_TRUE :                                            \
    B_FALSE                                             \
)

static void *
list_get_head(list_t *list)
{
    list_node_t *lhead = list_head(list);

    if (lhead != NULL)
        list_remove(list, lhead);
    return (lhead);
}

/*
 * This is always guaranteed to be able to queue the work.
 */
static void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
    /* Initialize request */
    DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
    ptr->rq_op = op;

    /*
     * Queue provided slot onto request pool.
     */
    mutex_enter(&state->id_acache_req_lock);
    list_insert_tail(&state->id_req_list, ptr);

    /* Go, fetch, async thread */
    cv_signal(&state->id_acache_req_cv);
    mutex_exit(&state->id_acache_req_lock);
}

/*
 * Main body of the per interface async thread.
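 *
 * A typical producer of work is the Tx path: for example (as in
 * ibd_acache_lookup() below), when no cached AH exists it does roughly
 *
 *     req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *     bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *     ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *
 * and this thread then dequeues the request and calls ibd_async_acache().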
 */
static void
ibd_async_work(ibd_state_t *state)
{
    ibd_req_t *ptr;
    callb_cpr_t cprinfo;

    mutex_enter(&state->id_acache_req_lock);
    CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
        callb_generic_cpr, "ibd_async_work");

    for (;;) {
        ptr = list_get_head(&state->id_req_list);
        if (ptr != NULL) {
            mutex_exit(&state->id_acache_req_lock);

            /*
             * Once we have done the operation, there is no
             * guarantee the request slot is going to be valid,
             * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
             * TRAP).
             *
             * Perform the request.
             */
            switch (ptr->rq_op) {
            case IBD_ASYNC_GETAH:
                ibd_async_acache(state, &ptr->rq_mac);
                break;
            case IBD_ASYNC_JOIN:
            case IBD_ASYNC_LEAVE:
                ibd_async_multicast(state,
                    ptr->rq_gid, ptr->rq_op);
                break;
            case IBD_ASYNC_PROMON:
                ibd_async_setprom(state);
                break;
            case IBD_ASYNC_PROMOFF:
                ibd_async_unsetprom(state);
                break;
            case IBD_ASYNC_REAP:
                ibd_async_reap_group(state,
                    ptr->rq_ptr, ptr->rq_gid,
                    IB_MC_JSTATE_FULL);
                /*
                 * the req buf is part of the mce
                 * structure, so we do not need
                 * to free it here.
                 */
                ptr = NULL;
                break;
            case IBD_ASYNC_TRAP:
                ibd_async_trap(state, ptr);
                break;
            case IBD_ASYNC_SCHED:
                ibd_async_txsched(state);
                break;
            case IBD_ASYNC_LINK:
                ibd_async_link(state, ptr);
                break;
            case IBD_ASYNC_EXIT:
                mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
                CALLB_CPR_EXIT(&cprinfo);
#else
                mutex_exit(&state->id_acache_req_lock);
#endif
                return;
            }
            if (ptr != NULL)
                kmem_cache_free(state->id_req_kmc, ptr);

            mutex_enter(&state->id_acache_req_lock);
        } else {
#ifndef __lock_lint
            /*
             * Nothing to do: wait till new request arrives.
             */
            CALLB_CPR_SAFE_BEGIN(&cprinfo);
            cv_wait(&state->id_acache_req_cv,
                &state->id_acache_req_lock);
            CALLB_CPR_SAFE_END(&cprinfo,
                &state->id_acache_req_lock);
#endif
        }
    }

    /*NOTREACHED*/
    _NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
    mutex_enter(&state->id_trap_lock);
    if (state->id_trap_stop) {
        mutex_exit(&state->id_trap_lock);
        return (B_FALSE);
    }
    state->id_trap_inprog++;
    mutex_exit(&state->id_trap_lock);
    return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
    mutex_enter(&state->id_trap_lock);
    if (--state->id_trap_inprog == 0)
        cv_signal(&state->id_trap_cv);
    mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
    ulong_t ptraddr = (ulong_t)key;
    uint_t hval;

    /*
     * If the input address is 4 byte aligned, we can just dereference
     * it. This is most common, since IP will send in a 4 byte aligned
     * IP header, which implies the 24 byte IPoIB pseudo header will be
     * 4 byte aligned too.
     */
    if ((ptraddr & 3) == 0)
        return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);

    bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
    return (hval);
}

static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
    if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
        return (0);
    else
        return (1);
}

/*
 * Initialize all the per interface caches and lists; AH cache,
 * MCG list etc.
 */
static int
ibd_acache_init(ibd_state_t *state)
{
    ibd_ace_t *ce;
    int i;

    mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);

    mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
    mutex_enter(&state->id_ac_mutex);
    list_create(&state->id_ah_free, sizeof (ibd_ace_t),
        offsetof(ibd_ace_t, ac_list));
    list_create(&state->id_ah_active, sizeof (ibd_ace_t),
        offsetof(ibd_ace_t, ac_list));
    state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
        IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
        ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
    list_create(&state->id_mc_full, sizeof (ibd_mce_t),
        offsetof(ibd_mce_t, mc_list));
    list_create(&state->id_mc_non, sizeof (ibd_mce_t),
        offsetof(ibd_mce_t, mc_list));
    list_create(&state->id_req_list, sizeof (ibd_req_t),
        offsetof(ibd_req_t, rq_list));
    state->id_ac_hot_ace = NULL;

    state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
        IBD_NUM_AH, KM_SLEEP);
    for (i = 0; i < IBD_NUM_AH; i++, ce++) {
        if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
            state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
            mutex_exit(&state->id_ac_mutex);
            ibd_acache_fini(state);
            return (DDI_FAILURE);
        } else {
            CLEAR_REFCYCLE(ce);
            ce->ac_mce = NULL;
            IBD_ACACHE_INSERT_FREE(state, ce);
        }
    }
    mutex_exit(&state->id_ac_mutex);
    return (DDI_SUCCESS);
}

static void
ibd_acache_fini(ibd_state_t *state)
{
    ibd_ace_t *ptr;

    mutex_enter(&state->id_ac_mutex);

    while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
        ASSERT(GET_REF(ptr) == 0);
        (void) ibt_free_ud_dest(ptr->ac_dest);
    }

    while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
        ASSERT(GET_REF(ptr) == 0);
        (void) ibt_free_ud_dest(ptr->ac_dest);
    }

    list_destroy(&state->id_ah_free);
    list_destroy(&state->id_ah_active);
    list_destroy(&state->id_mc_full);
    list_destroy(&state->id_mc_non);
    list_destroy(&state->id_req_list);
    kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
    mutex_exit(&state->id_ac_mutex);
    mutex_destroy(&state->id_ac_mutex);
    mutex_destroy(&state->id_mc_mutex);
    mutex_destroy(&state->id_acache_req_lock);
    cv_destroy(&state->id_acache_req_cv);
}

/*
 * Search the AH active hash
 * list for a cached path to the input destination.
 * If we are "just looking", hold == F. When we are in the Tx path,
 * we set hold == T to grab a reference on the AH so that it can not
 * be recycled to a new destination while the Tx request is posted.
 */
static ibd_ace_t *
ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
{
    ibd_ace_t *ptr;

    ASSERT(mutex_owned(&state->id_ac_mutex));

    /*
     * Do hash search.
     */
    if (mod_hash_find(state->id_ah_active_hash,
        (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
        if (hold)
            INC_REF(ptr, num);
        return (ptr);
    }
    return (NULL);
}

/*
 * This is called by the tx side; if an initialized AH is found in
 * the active list, it is locked down and can be used; if no entry
 * is found, an async request is queued to do path resolution.
 */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
    ibd_ace_t *ptr;
    ibd_req_t *req;

    /*
     * Only attempt to print when we can; in the mdt pattr case, the
     * address is not aligned properly.
     */
    if (((ulong_t)mac & 3) == 0) {
        DPRINT(4,
            "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
            htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
            htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
            htonl(mac->ipoib_gidsuff[1]));
    }

    mutex_enter(&state->id_ac_mutex);

    if (((ptr = state->id_ac_hot_ace) != NULL) &&
        (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
        INC_REF(ptr, numwqe);
        mutex_exit(&state->id_ac_mutex);
        return (ptr);
    }
    if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
        state->id_ac_hot_ace = ptr;
        mutex_exit(&state->id_ac_mutex);
        return (ptr);
    }

    /*
     * Implementation of a single outstanding async request; if
     * the operation is not started yet, queue a request and move
     * to ongoing state. Remember in id_ah_addr for which address
     * we are queueing the request, in case we need to flag an error;
     * any further requests, for the same or a different address, until
     * the operation completes, are sent back to GLDv3 to be retried.
     * The async thread will update id_ah_op with an error indication
     * or will set it to indicate the next look up can start; either
     * way, it will mac_tx_update() so that all blocked requests come
     * back here.
     */
    *err = EAGAIN;
    if (state->id_ah_op == IBD_OP_NOTSTARTED) {
        req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
        if (req != NULL) {
            /*
             * We did not even find the entry; queue a request
             * for it.
             */
            bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
            ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
            state->id_ah_op = IBD_OP_ONGOING;
            bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
        }
    } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
        (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
        /*
         * Check the status of the pathrecord lookup request
         * we had queued before.
         */
        if (state->id_ah_op == IBD_OP_ERRORED) {
            *err = EFAULT;
            state->id_ah_error++;
        } else {
            /*
             * IBD_OP_ROUTERED case: We need to send to the
             * all-router MCG. If we can find the AH for
             * the mcg, the Tx will be attempted. If we
             * do not find the AH, we return NORESOURCES
             * to retry.
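             * (In that case ptr stays NULL and *err remains
             * EAGAIN, so GLDv3 will retry the packet later.)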
             */
            ipoib_mac_t routermac;

            (void) ibd_get_allroutergroup(state, mac, &routermac);
            ptr = ibd_acache_find(state, &routermac, B_TRUE,
                numwqe);
        }
        state->id_ah_op = IBD_OP_NOTSTARTED;
    } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
        (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
        /*
         * This case can happen when we get a higher band
         * packet. The easiest way is to reset the state machine
         * to accommodate the higher priority packet.
         */
        state->id_ah_op = IBD_OP_NOTSTARTED;
    }
    mutex_exit(&state->id_ac_mutex);

    return (ptr);
}

/*
 * Grab a not-currently-in-use AH/PathRecord from the active
 * list to recycle to a new destination. Only the async thread
 * executes this code.
 */
static ibd_ace_t *
ibd_acache_get_unref(ibd_state_t *state)
{
    ibd_ace_t *ptr = list_head(&state->id_ah_active);

    ASSERT(mutex_owned(&state->id_ac_mutex));

    /*
     * Do plain linear search.
     */
    while (ptr != NULL) {
        /*
         * Note that it is possible that the "cycle" bit
         * is set on the AH w/o any reference count. The
         * mcg must have been deleted, and the tx cleanup
         * just decremented the reference count to 0, but
         * hasn't gotten around to grabbing the id_ac_mutex
         * to move the AH into the free list.
         */
        if (GET_REF(ptr) == 0) {
            IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
            break;
        }
        ptr = list_next(&state->id_ah_active, ptr);
    }
    return (ptr);
}

/*
 * Invoked to clean up AH from active list in case of multicast
 * disable and to handle sendonly memberships during mcg traps.
 * And for port up processing for multicast and unicast AHs.
 * Normally, the AH is taken off the active list, and put into
 * the free list to be recycled for a new destination. In case
 * Tx requests on the AH have not completed yet, the AH is marked
 * for reaping (which will put the AH on the free list) once the Tx's
 * complete; in this case, depending on the "force" input, we take
 * out the AH from the active list right now, or leave it also for
 * the reap operation. Returns TRUE if the AH is taken off the active
 * list (and either put into the free list right now, or arranged for
 * later), FALSE otherwise.
 */
static boolean_t
ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
{
    ibd_ace_t *acactive;
    boolean_t ret = B_TRUE;

    ASSERT(mutex_owned(&state->id_ac_mutex));

    if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {

        /*
         * Note that the AH might already have the cycle bit set
         * on it; this might happen if sequences of multicast
         * enables and disables are coming so fast, that posted
         * Tx's to the mcg have not completed yet, and the cycle
         * bit is set successively by each multicast disable.
         */
        if (SET_CYCLE_IF_REF(acactive)) {
            if (!force) {
                /*
                 * The ace is kept on the active list, further
                 * Tx's can still grab a reference on it; the
                 * ace is reaped when all pending Tx's
                 * referencing the AH complete.
                 */
                ret = B_FALSE;
            } else {
                /*
                 * In the mcg trap case, we always pull the
                 * AH from the active list. And also the port
                 * up multi/unicast case.
                 */
                IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
                acactive->ac_mce = NULL;
            }
        } else {
            /*
             * Determined the ref count is 0, thus reclaim
             * immediately after pulling out the ace from
             * the active list.
             */
            IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
            acactive->ac_mce = NULL;
            IBD_ACACHE_INSERT_FREE(state, acactive);
        }

    }
    return (ret);
}

/*
 * Helper function for async path record lookup. If we are trying to
 * Tx to a MCG, check our membership, possibly trying to join the
 * group if required. If that fails, try to send the packet to the
 * all router group (indicated by the redirect output), pointing
 * the input mac address to the router mcg address.
 */
static ibd_mce_t *
ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
{
    ib_gid_t mgid;
    ibd_mce_t *mce;
    ipoib_mac_t routermac;

    *redirect = B_FALSE;
    ibd_n2h_gid(mac, &mgid);

    /*
     * Check the FullMember+SendOnlyNonMember list.
     * Since we are the only one who manipulates the
     * id_mc_full list, no locks are needed.
     */
    mce = IBD_MCACHE_FIND_FULL(state, mgid);
    if (mce != NULL) {
        DPRINT(4, "ibd_async_mcache : already joined to group");
        return (mce);
    }

    /*
     * Not found; try to join(SendOnlyNonMember) and attach.
     */
    DPRINT(4, "ibd_async_mcache : not joined to group");
    if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
        NULL) {
        DPRINT(4, "ibd_async_mcache : nonmem joined to group");
        return (mce);
    }

    /*
     * MCGroup not present; try to join the all-router group. If
     * any of the following steps succeed, we will be redirecting
     * to the all router group.
     */
    DPRINT(4, "ibd_async_mcache : nonmem join failed");
    if (!ibd_get_allroutergroup(state, mac, &routermac))
        return (NULL);
    *redirect = B_TRUE;
    ibd_n2h_gid(&routermac, &mgid);
    bcopy(&routermac, mac, IPOIB_ADDRL);
    DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
        mgid.gid_prefix, mgid.gid_guid);

    /*
     * Are we already joined to the router group?
     */
    if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
        DPRINT(4, "ibd_async_mcache : using already joined router"
            "group\n");
        return (mce);
    }

    /*
     * Can we join(SendOnlyNonMember) the router group?
     */
    DPRINT(4, "ibd_async_mcache : attempting join to router grp");
    if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
        NULL) {
        DPRINT(4, "ibd_async_mcache : joined to router grp");
        return (mce);
    }

    return (NULL);
}

/*
 * Async path record lookup code.
 */
static void
ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
{
    ibd_ace_t *ce;
    ibd_mce_t *mce = NULL;
    ibt_path_attr_t path_attr;
    ibt_path_info_t path_info;
    ib_gid_t destgid;
    char ret = IBD_OP_NOTSTARTED;

    DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
        htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
        htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
        htonl(mac->ipoib_gidsuff[1]));

    /*
     * Check whether we are trying to transmit to a MCG.
     * In that case, we need to make sure we are a member of
     * the MCG.
1741 */ 1742 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1743 boolean_t redirected; 1744 1745 /* 1746 * If we can not find or join the group or even 1747 * redirect, error out. 1748 */ 1749 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1750 NULL) { 1751 state->id_ah_op = IBD_OP_ERRORED; 1752 return; 1753 } 1754 1755 /* 1756 * If we got redirected, we need to determine whether 1757 * the AH for the new mcg is in the cache already, and 1758 * not pull it in then; otherwise proceed to get the 1759 * path for the new mcg. There is no guarantee that 1760 * if the AH is currently in the cache, it will still be 1761 * there when we look in ibd_acache_lookup(), but that's 1762 * okay, we will come back here. 1763 */ 1764 if (redirected) { 1765 ret = IBD_OP_ROUTERED; 1766 DPRINT(4, "ibd_async_acache : redirected to " 1767 "%08X:%08X:%08X:%08X:%08X", 1768 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1769 htonl(mac->ipoib_gidpref[1]), 1770 htonl(mac->ipoib_gidsuff[0]), 1771 htonl(mac->ipoib_gidsuff[1])); 1772 1773 mutex_enter(&state->id_ac_mutex); 1774 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1775 state->id_ah_op = IBD_OP_ROUTERED; 1776 mutex_exit(&state->id_ac_mutex); 1777 DPRINT(4, "ibd_async_acache : router AH found"); 1778 return; 1779 } 1780 mutex_exit(&state->id_ac_mutex); 1781 } 1782 } 1783 1784 /* 1785 * Get an AH from the free list. 1786 */ 1787 mutex_enter(&state->id_ac_mutex); 1788 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1789 /* 1790 * No free ones; try to grab an unreferenced active 1791 * one. Maybe we need to make the active list LRU, 1792 * but that will create more work for Tx callbacks. 1793 * Is there a way of not having to pull out the 1794 * entry from the active list, but just indicate it 1795 * is being recycled? Yes, but that creates one more 1796 * check in the fast lookup path. 1797 */ 1798 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1799 /* 1800 * Pretty serious shortage now. 1801 */ 1802 state->id_ah_op = IBD_OP_NOTSTARTED; 1803 mutex_exit(&state->id_ac_mutex); 1804 DPRINT(10, "ibd_async_acache : failed to find AH " 1805 "slot\n"); 1806 return; 1807 } 1808 /* 1809 * We could check whether ac_mce points to a SendOnly 1810 * member and drop that membership now. Or do it lazily 1811 * at detach time. 1812 */ 1813 ce->ac_mce = NULL; 1814 } 1815 mutex_exit(&state->id_ac_mutex); 1816 ASSERT(ce->ac_mce == NULL); 1817 1818 /* 1819 * Update the entry. 1820 */ 1821 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1822 1823 bzero(&path_info, sizeof (path_info)); 1824 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1825 path_attr.pa_sgid = state->id_sgid; 1826 path_attr.pa_num_dgids = 1; 1827 ibd_n2h_gid(&ce->ac_mac, &destgid); 1828 path_attr.pa_dgids = &destgid; 1829 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1830 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1831 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1832 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1833 goto error; 1834 } 1835 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1836 ntohl(ce->ac_mac.ipoib_qpn), 1837 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1838 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1839 goto error; 1840 } 1841 1842 /* 1843 * mce is set whenever an AH is being associated with a 1844 * MCG; this will come in handy when we leave the MCG. The 1845 * lock protects Tx fastpath from scanning the active list. 
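 * (The Tx fastpath, ibd_acache_lookup(), walks id_ah_active
 * while holding id_ac_mutex, so the insertion below and the
 * final id_ah_op update are also done under that mutex.)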
1846 */ 1847 if (mce != NULL) 1848 ce->ac_mce = mce; 1849 mutex_enter(&state->id_ac_mutex); 1850 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1851 state->id_ah_op = ret; 1852 mutex_exit(&state->id_ac_mutex); 1853 return; 1854 error: 1855 /* 1856 * We might want to drop SendOnly membership here if we 1857 * joined above. The lock protects Tx callbacks inserting 1858 * into the free list. 1859 */ 1860 mutex_enter(&state->id_ac_mutex); 1861 state->id_ah_op = IBD_OP_ERRORED; 1862 IBD_ACACHE_INSERT_FREE(state, ce); 1863 mutex_exit(&state->id_ac_mutex); 1864 } 1865 1866 /* 1867 * While restoring port's presence on the subnet on a port up, it is possible 1868 * that the port goes down again. 1869 */ 1870 static void 1871 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1872 { 1873 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1874 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1875 LINK_STATE_UP; 1876 ibd_mce_t *mce, *pmce; 1877 ibd_ace_t *ace, *pace; 1878 1879 DPRINT(10, "ibd_async_link(): %d", opcode); 1880 1881 /* 1882 * On a link up, revalidate the link speed/width. No point doing 1883 * this on a link down, since we will be unable to do SA operations, 1884 * defaulting to the lowest speed. Also notice that we update our 1885 * notion of speed before calling mac_link_update(), which will do 1886 * necessary higher level notifications for speed changes. 1887 */ 1888 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1889 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1890 state->id_link_speed = ibd_get_portspeed(state); 1891 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1892 } 1893 1894 /* 1895 * Do all the work required to establish our presence on 1896 * the subnet. 1897 */ 1898 if (opcode == IBD_LINK_UP_ABSENT) { 1899 /* 1900 * If in promiscuous mode ... 1901 */ 1902 if (state->id_prom_op == IBD_OP_COMPLETED) { 1903 /* 1904 * Drop all nonmembership. 1905 */ 1906 ibd_async_unsetprom(state); 1907 1908 /* 1909 * Then, try to regain nonmembership to all mcg's. 1910 */ 1911 ibd_async_setprom(state); 1912 1913 } 1914 1915 /* 1916 * Drop all sendonly membership (which also gets rid of the 1917 * AHs); try to reacquire all full membership. 1918 */ 1919 mce = list_head(&state->id_mc_full); 1920 while ((pmce = mce) != NULL) { 1921 mce = list_next(&state->id_mc_full, mce); 1922 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1923 ibd_leave_group(state, 1924 pmce->mc_info.mc_adds_vect.av_dgid, 1925 IB_MC_JSTATE_SEND_ONLY_NON); 1926 else 1927 ibd_reacquire_group(state, pmce); 1928 } 1929 1930 /* 1931 * Recycle all active AHs to free list (and if there are 1932 * pending posts, make sure they will go into the free list 1933 * once the Tx's complete). Grab the lock to prevent 1934 * concurrent Tx's as well as Tx cleanups. 1935 */ 1936 mutex_enter(&state->id_ac_mutex); 1937 ace = list_head(&state->id_ah_active); 1938 while ((pace = ace) != NULL) { 1939 boolean_t cycled; 1940 1941 ace = list_next(&state->id_ah_active, ace); 1942 mce = pace->ac_mce; 1943 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1944 B_TRUE); 1945 /* 1946 * If this is for an mcg, it must be for a fullmember, 1947 * since we got rid of send-only members above when 1948 * processing the mce list. 1949 */ 1950 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1951 IB_MC_JSTATE_FULL))); 1952 1953 /* 1954 * Check if the fullmember mce needs to be torn down, 1955 * ie whether the DLPI disable has already been done. 
1956 * If so, do some of the work of tx_cleanup, namely 1957 * causing leave (which will fail), detach and 1958 * mce-freeing. tx_cleanup will put the AH into free 1959 * list. The reason to duplicate some of this 1960 * tx_cleanup work is because we want to delete the 1961 * AH right now instead of waiting for tx_cleanup, to 1962 * force subsequent Tx's to reacquire an AH. 1963 */ 1964 if ((mce != NULL) && (mce->mc_fullreap)) 1965 ibd_async_reap_group(state, mce, 1966 mce->mc_info.mc_adds_vect.av_dgid, 1967 mce->mc_jstate); 1968 } 1969 mutex_exit(&state->id_ac_mutex); 1970 } 1971 1972 /* 1973 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1974 * (which stops further events from being delivered) before 1975 * mac_unregister(). At this point, it is guaranteed that mac_register 1976 * has already been done. 1977 */ 1978 mutex_enter(&state->id_link_mutex); 1979 state->id_link_state = lstate; 1980 mac_link_update(state->id_mh, lstate); 1981 mutex_exit(&state->id_link_mutex); 1982 1983 ibd_async_done(state); 1984 } 1985 1986 /* 1987 * Check the pkey table to see if we can find the pkey we're looking for. 1988 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1989 * failure. 1990 */ 1991 static int 1992 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1993 uint16_t *pkix) 1994 { 1995 uint16_t ndx; 1996 1997 ASSERT(pkix != NULL); 1998 1999 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 2000 if (pkey_tbl[ndx] == pkey) { 2001 *pkix = ndx; 2002 return (0); 2003 } 2004 } 2005 return (-1); 2006 } 2007 2008 /* 2009 * When the link is notified up, we need to do a few things, based 2010 * on the port's current p_init_type_reply claiming a reinit has been 2011 * done or not. The reinit steps are: 2012 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2013 * the old Pkey and GID0 are correct. 2014 * 2. Register for mcg traps (already done by ibmf). 2015 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2016 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2017 * 4. Give up all sendonly memberships. 2018 * 5. Acquire all full memberships. 2019 * 6. In promiscuous mode, acquire all non memberships. 2020 * 7. Recycle all AHs to free list. 2021 */ 2022 static void 2023 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2024 { 2025 ibt_hca_portinfo_t *port_infop = NULL; 2026 ibt_status_t ibt_status; 2027 uint_t psize, port_infosz; 2028 ibd_link_op_t opcode; 2029 ibd_req_t *req; 2030 link_state_t new_link_state = LINK_STATE_UP; 2031 uint8_t itreply; 2032 uint16_t pkix; 2033 int ret; 2034 2035 /* 2036 * Let's not race with a plumb or an unplumb; if we detect a 2037 * pkey relocation event later on here, we may have to restart. 2038 */ 2039 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2040 2041 mutex_enter(&state->id_link_mutex); 2042 2043 /* 2044 * If the init code in ibd_m_start hasn't yet set up the 2045 * pkey/gid, nothing to do; that code will set the link state. 2046 */ 2047 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2048 mutex_exit(&state->id_link_mutex); 2049 goto link_mod_return; 2050 } 2051 2052 /* 2053 * If this routine was called in response to a port down event, 2054 * we just need to see if this should be informed. 
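 * ("Informed" here means reported to the mac layer; the
 * update_link_state path below drops the notification when the
 * new state matches what was last reported.)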
2055 */ 2056 if (code == IBT_ERROR_PORT_DOWN) { 2057 new_link_state = LINK_STATE_DOWN; 2058 goto update_link_state; 2059 } 2060 2061 /* 2062 * If it's not a port down event we've received, try to get the port 2063 * attributes first. If we fail here, the port is as good as down. 2064 * Otherwise, if the link went down by the time the handler gets 2065 * here, give up - we cannot even validate the pkey/gid since those 2066 * are not valid and this is as bad as a port down anyway. 2067 */ 2068 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2069 &port_infop, &psize, &port_infosz); 2070 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2071 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2072 new_link_state = LINK_STATE_DOWN; 2073 goto update_link_state; 2074 } 2075 2076 /* 2077 * Check the SM InitTypeReply flags. If both NoLoadReply and 2078 * PreserveContentReply are 0, we don't know anything about the 2079 * data loaded into the port attributes, so we need to verify 2080 * if gid0 and pkey are still valid. 2081 */ 2082 itreply = port_infop->p_init_type_reply; 2083 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2084 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2085 /* 2086 * Check to see if the subnet part of GID0 has changed. If 2087 * not, check the simple case first to see if the pkey 2088 * index is the same as before; finally check to see if the 2089 * pkey has been relocated to a different index in the table. 2090 */ 2091 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2092 if (bcmp(port_infop->p_sgid_tbl, 2093 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2094 2095 new_link_state = LINK_STATE_DOWN; 2096 2097 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2098 state->id_pkey) { 2099 2100 new_link_state = LINK_STATE_UP; 2101 2102 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2103 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2104 2105 ibt_free_portinfo(port_infop, port_infosz); 2106 mutex_exit(&state->id_link_mutex); 2107 2108 /* 2109 * Currently a restart is required if our pkey has moved 2110 * in the pkey table. If we get the ibt_recycle_ud() to 2111 * work as documented (expected), we may be able to 2112 * avoid a complete restart. Note that we've already 2113 * marked both the start and stop 'in-progress' flags, 2114 * so it is ok to go ahead and do this restart. 2115 */ 2116 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2117 if ((ret = ibd_start(state)) != 0) { 2118 DPRINT(10, "ibd_restart: cannot restart, " 2119 "ret=%d", ret); 2120 } 2121 2122 goto link_mod_return; 2123 } else { 2124 new_link_state = LINK_STATE_DOWN; 2125 } 2126 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2127 } 2128 2129 update_link_state: 2130 if (port_infop) { 2131 ibt_free_portinfo(port_infop, port_infosz); 2132 } 2133 2134 /* 2135 * If the old state is the same as the new state, nothing to do 2136 */ 2137 if (state->id_link_state == new_link_state) { 2138 mutex_exit(&state->id_link_mutex); 2139 goto link_mod_return; 2140 } 2141 2142 /* 2143 * Ok, so there was a link state change; see if it's safe to ask 2144 * the async thread to do the work 2145 */ 2146 if (!ibd_async_safe(state)) { 2147 state->id_link_state = new_link_state; 2148 mutex_exit(&state->id_link_mutex); 2149 goto link_mod_return; 2150 } 2151 2152 mutex_exit(&state->id_link_mutex); 2153 2154 /* 2155 * If we're reporting a link up, check InitTypeReply to see if 2156 * the SM has ensured that the port's presence in mcg, traps, 2157 * etc. is intact. 
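 * If PreservePresenceReply is set, a plain IBD_LINK_UP is
 * queued; otherwise IBD_LINK_UP_ABSENT makes ibd_async_link()
 * rebuild our memberships and recycle the active AHs.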
2158 */ 2159 if (new_link_state == LINK_STATE_DOWN) { 2160 opcode = IBD_LINK_DOWN; 2161 } else { 2162 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2163 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2164 opcode = IBD_LINK_UP; 2165 } else { 2166 opcode = IBD_LINK_UP_ABSENT; 2167 } 2168 } 2169 2170 /* 2171 * Queue up a request for ibd_async_link() to handle this link 2172 * state change event 2173 */ 2174 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2175 req->rq_ptr = (void *)opcode; 2176 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2177 2178 link_mod_return: 2179 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2180 } 2181 2182 /* 2183 * For the port up/down events, IBTL guarantees there will not be concurrent 2184 * invocations of the handler. IBTL might coalesce link transition events, 2185 * and not invoke the handler for _each_ up/down transition, but it will 2186 * invoke the handler with last known state 2187 */ 2188 static void 2189 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2190 ibt_async_code_t code, ibt_async_event_t *event) 2191 { 2192 ibd_state_t *state = (ibd_state_t *)clnt_private; 2193 2194 switch (code) { 2195 case IBT_ERROR_CATASTROPHIC_CHAN: 2196 ibd_print_warn(state, "catastrophic channel error"); 2197 break; 2198 case IBT_ERROR_CQ: 2199 ibd_print_warn(state, "completion queue error"); 2200 break; 2201 case IBT_PORT_CHANGE_EVENT: 2202 /* 2203 * Events will be delivered to all instances that have 2204 * done ibt_open_hca() but not yet done ibt_close_hca(). 2205 * Only need to do work for our port; IBTF will deliver 2206 * events for other ports on the hca we have ibt_open_hca'ed 2207 * too. Note that id_port is initialized in ibd_attach() 2208 * before we do an ibt_open_hca() in ibd_attach(). 2209 */ 2210 ASSERT(state->id_hca_hdl == hca_hdl); 2211 if (state->id_port != event->ev_port) 2212 break; 2213 2214 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2215 IBT_PORT_CHANGE_PKEY) { 2216 ibd_link_mod(state, code); 2217 } 2218 break; 2219 case IBT_ERROR_PORT_DOWN: 2220 case IBT_CLNT_REREG_EVENT: 2221 case IBT_EVENT_PORT_UP: 2222 /* 2223 * Events will be delivered to all instances that have 2224 * done ibt_open_hca() but not yet done ibt_close_hca(). 2225 * Only need to do work for our port; IBTF will deliver 2226 * events for other ports on the hca we have ibt_open_hca'ed 2227 * too. Note that id_port is initialized in ibd_attach() 2228 * before we do an ibt_open_hca() in ibd_attach(). 2229 */ 2230 ASSERT(state->id_hca_hdl == hca_hdl); 2231 if (state->id_port != event->ev_port) 2232 break; 2233 2234 ibd_link_mod(state, code); 2235 break; 2236 2237 case IBT_HCA_ATTACH_EVENT: 2238 case IBT_HCA_DETACH_EVENT: 2239 /* 2240 * When a new card is plugged to the system, attach_event is 2241 * invoked. Additionally, a cfgadm needs to be run to make the 2242 * card known to the system, and an ifconfig needs to be run to 2243 * plumb up any ibd interfaces on the card. In the case of card 2244 * unplug, a cfgadm is run that will trigger any RCM scripts to 2245 * unplumb the ibd interfaces on the card; when the card is 2246 * actually unplugged, the detach_event is invoked; 2247 * additionally, if any ibd instances are still active on the 2248 * card (eg there were no associated RCM scripts), driver's 2249 * detach routine is invoked. 
2250 */ 2251 break; 2252 default: 2253 break; 2254 } 2255 } 2256 2257 static int 2258 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2259 { 2260 mac_register_t *macp; 2261 int ret; 2262 2263 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2264 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2265 return (DDI_FAILURE); 2266 } 2267 2268 /* 2269 * Note that when we register with mac during attach, we don't 2270 * have the id_macaddr yet, so we'll simply be registering a 2271 * zero macaddr that we'll overwrite later during plumb (in 2272 * ibd_m_start()). Similar is the case with id_mtu - we'll 2273 * update the mac layer with the correct mtu during plumb. 2274 */ 2275 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2276 macp->m_driver = state; 2277 macp->m_dip = dip; 2278 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2279 macp->m_callbacks = &ibd_m_callbacks; 2280 macp->m_min_sdu = 0; 2281 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2282 2283 /* 2284 * Register ourselves with the GLDv3 interface 2285 */ 2286 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2287 mac_free(macp); 2288 DPRINT(10, 2289 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2290 return (DDI_FAILURE); 2291 } 2292 2293 mac_free(macp); 2294 return (DDI_SUCCESS); 2295 } 2296 2297 static int 2298 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2299 { 2300 ibt_hca_attr_t hca_attrs; 2301 ibt_status_t ibt_status; 2302 2303 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2304 2305 /* 2306 * Query the HCA and fetch its attributes 2307 */ 2308 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2309 ASSERT(ibt_status == IBT_SUCCESS); 2310 2311 /* 2312 * 1. Set the Hardware Checksum capability. Currently we only consider 2313 * full checksum offload. 2314 */ 2315 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2316 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2317 } 2318 2319 /* 2320 * 2. Set LSO policy, capability and maximum length 2321 */ 2322 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2323 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2324 state->id_lso_policy = B_TRUE; 2325 } else { 2326 state->id_lso_policy = B_FALSE; 2327 } 2328 2329 if (hca_attrs.hca_max_lso_size > 0) { 2330 state->id_lso_capable = B_TRUE; 2331 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2332 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2333 else 2334 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2335 } else { 2336 state->id_lso_capable = B_FALSE; 2337 state->id_lso_maxlen = 0; 2338 } 2339 2340 /* 2341 * 3. Set Reserved L_Key capability 2342 */ 2343 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2344 state->id_hca_res_lkey_capab = 1; 2345 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2346 } 2347 2348 /* 2349 * 4. Set maximum sqseg value after checking to see if extended sgl 2350 * size information is provided by the hca 2351 */ 2352 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2353 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2354 } else { 2355 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2356 } 2357 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2358 state->id_max_sqseg = IBD_MAX_SQSEG; 2359 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2360 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2361 state->id_max_sqseg, IBD_MAX_SQSEG); 2362 } 2363 2364 /* 2365 * Translating the virtual address regions into physical regions 2366 * for using the Reserved LKey feature results in a wr sgl that 2367 * is a little longer. 
Since failing ibt_map_mem_iov() is costly, 2368 * we'll fix a high-water mark (65%) for when we should stop. 2369 */ 2370 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2371 2372 /* 2373 * 5. Set number of recv and send wqes after checking hca maximum 2374 * channel size 2375 */ 2376 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2377 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2378 } else { 2379 state->id_num_rwqe = IBD_NUM_RWQE; 2380 } 2381 state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN; 2382 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2383 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2384 } else { 2385 state->id_num_swqe = IBD_NUM_SWQE; 2386 } 2387 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2388 2389 return (DDI_SUCCESS); 2390 } 2391 2392 static int 2393 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2394 { 2395 int instance; 2396 uint32_t progress = state->id_mac_state; 2397 ibt_status_t ret; 2398 2399 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2400 cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n"); 2401 return (DDI_FAILURE); 2402 } 2403 2404 /* make sure rx resources are freed */ 2405 ibd_free_rx_rsrcs(state); 2406 2407 if (progress & IBD_DRV_MAC_REGISTERED) { 2408 (void) mac_unregister(state->id_mh); 2409 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2410 } 2411 2412 if (progress & IBD_DRV_PD_ALLOCD) { 2413 if ((ret = ibt_free_pd(state->id_hca_hdl, 2414 state->id_pd_hdl)) != IBT_SUCCESS) { 2415 ibd_print_warn(state, "failed to free " 2416 "protection domain, ret=%d", ret); 2417 } 2418 state->id_pd_hdl = NULL; 2419 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2420 } 2421 2422 if (progress & IBD_DRV_HCA_OPENED) { 2423 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2424 IBT_SUCCESS) { 2425 ibd_print_warn(state, "failed to close " 2426 "HCA device, ret=%d", ret); 2427 } 2428 state->id_hca_hdl = NULL; 2429 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2430 } 2431 2432 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2433 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2434 ibd_print_warn(state, 2435 "ibt_detach() failed, ret=%d", ret); 2436 } 2437 state->id_ibt_hdl = NULL; 2438 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2439 } 2440 2441 if (progress & IBD_DRV_TXINTR_ADDED) { 2442 ddi_remove_softintr(state->id_tx); 2443 state->id_tx = NULL; 2444 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2445 } 2446 2447 if (progress & IBD_DRV_RXINTR_ADDED) { 2448 ddi_remove_softintr(state->id_rx); 2449 state->id_rx = NULL; 2450 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2451 } 2452 2453 if (progress & IBD_DRV_STATE_INITIALIZED) { 2454 ibd_state_fini(state); 2455 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2456 } 2457 2458 instance = ddi_get_instance(dip); 2459 ddi_soft_state_free(ibd_list, instance); 2460 2461 return (DDI_SUCCESS); 2462 } 2463 2464 /* 2465 * Attach device to the IO framework. 
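 * Each successful step below sets a progress bit in
 * id_mac_state (IBD_DRV_STATE_INITIALIZED through
 * IBD_DRV_MAC_REGISTERED), so that ibd_unattach() can tear
 * down exactly what was set up if a later step fails.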
2466 */ 2467 static int 2468 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2469 { 2470 ibd_state_t *state = NULL; 2471 ib_guid_t hca_guid; 2472 int instance; 2473 ibt_status_t ret; 2474 int rv; 2475 2476 /* 2477 * IBD doesn't support suspend/resume 2478 */ 2479 if (cmd != DDI_ATTACH) 2480 return (DDI_FAILURE); 2481 2482 /* 2483 * Allocate softstate structure 2484 */ 2485 instance = ddi_get_instance(dip); 2486 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2487 return (DDI_FAILURE); 2488 state = ddi_get_soft_state(ibd_list, instance); 2489 2490 /* 2491 * Initialize mutexes and condition variables 2492 */ 2493 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2494 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2495 goto attach_fail; 2496 } 2497 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2498 2499 /* 2500 * Allocate rx,tx softintr 2501 */ 2502 if (ibd_rx_softintr == 1) { 2503 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2504 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2505 DPRINT(10, "ibd_attach: failed in " 2506 "ddi_add_softintr(id_rx), ret=%d", rv); 2507 goto attach_fail; 2508 } 2509 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2510 } 2511 if (ibd_tx_softintr == 1) { 2512 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2513 NULL, NULL, ibd_tx_recycle, 2514 (caddr_t)state)) != DDI_SUCCESS) { 2515 DPRINT(10, "ibd_attach: failed in " 2516 "ddi_add_softintr(id_tx), ret=%d", rv); 2517 goto attach_fail; 2518 } 2519 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2520 } 2521 2522 /* 2523 * Obtain IBA P_Key, port number and HCA guid and validate 2524 * them (for P_Key, only full members are allowed as per 2525 * IPoIB specification; neither port number nor HCA guid 2526 * can be zero) 2527 */ 2528 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2529 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2530 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2531 state->id_pkey); 2532 goto attach_fail; 2533 } 2534 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2535 "port-number", 0)) == 0) { 2536 DPRINT(10, "ibd_attach: invalid port number (%d)", 2537 state->id_port); 2538 goto attach_fail; 2539 } 2540 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2541 "hca-guid", 0)) == 0) { 2542 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2543 hca_guid); 2544 goto attach_fail; 2545 } 2546 2547 /* 2548 * Attach to IBTL 2549 */ 2550 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2551 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2552 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2553 goto attach_fail; 2554 } 2555 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2556 2557 /* 2558 * Open the HCA 2559 */ 2560 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2561 &state->id_hca_hdl)) != IBT_SUCCESS) { 2562 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2563 goto attach_fail; 2564 } 2565 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2566 2567 /* 2568 * Record capabilities 2569 */ 2570 (void) ibd_record_capab(state, dip); 2571 2572 /* 2573 * Allocate a protection domain on the HCA 2574 */ 2575 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2576 &state->id_pd_hdl)) != IBT_SUCCESS) { 2577 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2578 goto attach_fail; 2579 } 2580 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2581 2582 2583 /* 2584 * Register ibd interfaces with the Nemo framework 2585 */ 2586 if 
(ibd_register_mac(state, dip) != IBT_SUCCESS) { 2587 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2588 goto attach_fail; 2589 } 2590 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2591 2592 /* 2593 * We're done with everything we could to make the attach 2594 * succeed. All the buffer allocations and IPoIB broadcast 2595 * group joins are deferred to when the interface instance 2596 * is actually plumbed to avoid wasting memory. 2597 */ 2598 return (DDI_SUCCESS); 2599 2600 attach_fail: 2601 (void) ibd_unattach(state, dip); 2602 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2603 return (DDI_FAILURE); 2604 } 2605 2606 /* 2607 * Detach device from the IO framework. 2608 */ 2609 static int 2610 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2611 { 2612 ibd_state_t *state; 2613 int instance; 2614 2615 /* 2616 * IBD doesn't support suspend/resume 2617 */ 2618 if (cmd != DDI_DETACH) 2619 return (DDI_FAILURE); 2620 2621 /* 2622 * Get the instance softstate 2623 */ 2624 instance = ddi_get_instance(dip); 2625 state = ddi_get_soft_state(ibd_list, instance); 2626 2627 /* 2628 * Release all resources we're holding still. Note that if we'd 2629 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2630 * so far, we should find all the flags we need in id_mac_state. 2631 */ 2632 return (ibd_unattach(state, dip)); 2633 } 2634 2635 /* 2636 * Pre ibt_attach() driver initialization 2637 */ 2638 static int 2639 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2640 { 2641 char buf[64]; 2642 2643 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2644 state->id_link_state = LINK_STATE_UNKNOWN; 2645 2646 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2647 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2648 state->id_trap_stop = B_TRUE; 2649 state->id_trap_inprog = 0; 2650 2651 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2652 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2653 state->id_dip = dip; 2654 2655 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2656 2657 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2658 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2659 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2660 state->id_tx_busy = 0; 2661 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2662 2663 state->id_rx_list.dl_bufs_outstanding = 0; 2664 state->id_rx_list.dl_cnt = 0; 2665 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2666 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2667 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2668 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2669 0, NULL, NULL, NULL, NULL, NULL, 0); 2670 2671 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2672 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2673 2674 return (DDI_SUCCESS); 2675 } 2676 2677 /* 2678 * Post ibt_detach() driver deconstruction 2679 */ 2680 static void 2681 ibd_state_fini(ibd_state_t *state) 2682 { 2683 cv_destroy(&state->id_macst_cv); 2684 mutex_destroy(&state->id_macst_lock); 2685 2686 kmem_cache_destroy(state->id_req_kmc); 2687 2688 mutex_destroy(&state->id_rx_list.dl_mutex); 2689 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2690 2691 mutex_destroy(&state->id_txpost_lock); 2692 mutex_destroy(&state->id_tx_list.dl_mutex); 2693 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2694 mutex_destroy(&state->id_lso_lock); 2695 2696 
mutex_destroy(&state->id_sched_lock); 2697 mutex_destroy(&state->id_scq_poll_lock); 2698 mutex_destroy(&state->id_rcq_poll_lock); 2699 2700 cv_destroy(&state->id_trap_cv); 2701 mutex_destroy(&state->id_trap_lock); 2702 mutex_destroy(&state->id_link_mutex); 2703 } 2704 2705 /* 2706 * Fetch link speed from SA for snmp ifspeed reporting. 2707 */ 2708 static uint64_t 2709 ibd_get_portspeed(ibd_state_t *state) 2710 { 2711 int ret; 2712 ibt_path_info_t path; 2713 ibt_path_attr_t path_attr; 2714 uint8_t num_paths; 2715 uint64_t ifspeed; 2716 2717 /* 2718 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2719 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2720 * 2000000000. Start with that as default. 2721 */ 2722 ifspeed = 2000000000; 2723 2724 bzero(&path_attr, sizeof (path_attr)); 2725 2726 /* 2727 * Get the port speed from Loopback path information. 2728 */ 2729 path_attr.pa_dgids = &state->id_sgid; 2730 path_attr.pa_num_dgids = 1; 2731 path_attr.pa_sgid = state->id_sgid; 2732 2733 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2734 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2735 goto earlydone; 2736 2737 if (num_paths < 1) 2738 goto earlydone; 2739 2740 /* 2741 * In case SA does not return an expected value, report the default 2742 * speed as 1X. 2743 */ 2744 ret = 1; 2745 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2746 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2747 ret = 1; 2748 break; 2749 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2750 ret = 4; 2751 break; 2752 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2753 ret = 12; 2754 break; 2755 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2756 ret = 2; 2757 break; 2758 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2759 ret = 8; 2760 break; 2761 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2762 ret = 16; 2763 break; 2764 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2765 ret = 24; 2766 break; 2767 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2768 ret = 32; 2769 break; 2770 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2771 ret = 48; 2772 break; 2773 } 2774 2775 ifspeed *= ret; 2776 2777 earlydone: 2778 return (ifspeed); 2779 } 2780 2781 /* 2782 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2783 * representing the input mcg mgid. 2784 */ 2785 static ibd_mce_t * 2786 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2787 { 2788 ibd_mce_t *ptr = list_head(mlist); 2789 2790 /* 2791 * Do plain linear search. 2792 */ 2793 while (ptr != NULL) { 2794 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2795 sizeof (ib_gid_t)) == 0) 2796 return (ptr); 2797 ptr = list_next(mlist, ptr); 2798 } 2799 return (NULL); 2800 } 2801 2802 /* 2803 * Execute IBA JOIN. 
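 * The join attributes (qkey, SL, flow label, tclass) are taken
 * from the broadcast group info in id_mcinfo, so every group
 * we join uses the link parameters discovered by
 * ibd_find_bgroup().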
2804 */ 2805 static ibt_status_t 2806 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2807 { 2808 ibt_mcg_attr_t mcg_attr; 2809 2810 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2811 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2812 mcg_attr.mc_mgid = mgid; 2813 mcg_attr.mc_join_state = mce->mc_jstate; 2814 mcg_attr.mc_scope = state->id_scope; 2815 mcg_attr.mc_pkey = state->id_pkey; 2816 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2817 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2818 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2819 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2820 NULL, NULL)); 2821 } 2822 2823 /* 2824 * This code JOINs the port in the proper way (depending on the join 2825 * state) so that IBA fabric will forward mcg packets to/from the port. 2826 * It also attaches the QPN to the mcg so it can receive those mcg 2827 * packets. This code makes sure not to attach the mcg to the QP if 2828 * that has been previously done due to the mcg being joined with a 2829 * different join state, even though this is not required by SWG_0216, 2830 * refid 3610. 2831 */ 2832 static ibd_mce_t * 2833 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2834 { 2835 ibt_status_t ibt_status; 2836 ibd_mce_t *mce, *tmce, *omce = NULL; 2837 boolean_t do_attach = B_TRUE; 2838 2839 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2840 jstate, mgid.gid_prefix, mgid.gid_guid); 2841 2842 /* 2843 * For enable_multicast Full member joins, we need to do some 2844 * extra work. If there is already an mce on the list that 2845 * indicates full membership, that means the membership has 2846 * not yet been dropped (since the disable_multicast was issued) 2847 * because there are pending Tx's to the mcg; in that case, just 2848 * mark the mce not to be reaped when the Tx completion queues 2849 * an async reap operation. 2850 * 2851 * If there is already an mce on the list indicating sendonly 2852 * membership, try to promote to full membership. Be careful 2853 * not to deallocate the old mce, since there might be an AH 2854 * pointing to it; instead, update the old mce with new data 2855 * that tracks the full membership. 2856 */ 2857 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2858 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2859 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2860 ASSERT(omce->mc_fullreap); 2861 omce->mc_fullreap = B_FALSE; 2862 return (omce); 2863 } else { 2864 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2865 } 2866 } 2867 2868 /* 2869 * Allocate the ibd_mce_t to track this JOIN. 2870 */ 2871 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2872 mce->mc_fullreap = B_FALSE; 2873 mce->mc_jstate = jstate; 2874 2875 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2876 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2877 ibt_status); 2878 kmem_free(mce, sizeof (ibd_mce_t)); 2879 return (NULL); 2880 } 2881 2882 /* 2883 * Is an IBA attach required? Not if the interface is already joined 2884 * to the mcg in a different appropriate join state. 
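 * For example, a non-member join when a full-member attach
 * already exists (or a full-member join over an existing
 * non-member attach) reuses that attachment, and send-only
 * joins never attach the QP.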
2885 */ 2886 if (jstate == IB_MC_JSTATE_NON) { 2887 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2888 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2889 do_attach = B_FALSE; 2890 } else if (jstate == IB_MC_JSTATE_FULL) { 2891 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2892 do_attach = B_FALSE; 2893 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2894 do_attach = B_FALSE; 2895 } 2896 2897 if (do_attach) { 2898 /* 2899 * Do the IBA attach. 2900 */ 2901 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2902 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2903 &mce->mc_info)) != IBT_SUCCESS) { 2904 DPRINT(10, "ibd_join_group : failed qp attachment " 2905 "%d\n", ibt_status); 2906 /* 2907 * NOTE that we should probably preserve the join info 2908 * in the list and later try to leave again at detach 2909 * time. 2910 */ 2911 (void) ibt_leave_mcg(state->id_sgid, mgid, 2912 state->id_sgid, jstate); 2913 kmem_free(mce, sizeof (ibd_mce_t)); 2914 return (NULL); 2915 } 2916 } 2917 2918 /* 2919 * Insert the ibd_mce_t in the proper list. 2920 */ 2921 if (jstate == IB_MC_JSTATE_NON) { 2922 IBD_MCACHE_INSERT_NON(state, mce); 2923 } else { 2924 /* 2925 * Set up the mc_req fields used for reaping the 2926 * mcg in case of delayed tx completion (see 2927 * ibd_tx_cleanup()). Also done for sendonly join in 2928 * case we are promoted to fullmembership later and 2929 * keep using the same mce. 2930 */ 2931 mce->mc_req.rq_gid = mgid; 2932 mce->mc_req.rq_ptr = mce; 2933 /* 2934 * Check whether this is the case of trying to join 2935 * full member, and we were already joined send only. 2936 * We try to drop our SendOnly membership, but it is 2937 * possible that the mcg does not exist anymore (and 2938 * the subnet trap never reached us), so the leave 2939 * operation might fail. 2940 */ 2941 if (omce != NULL) { 2942 (void) ibt_leave_mcg(state->id_sgid, mgid, 2943 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2944 omce->mc_jstate = IB_MC_JSTATE_FULL; 2945 bcopy(&mce->mc_info, &omce->mc_info, 2946 sizeof (ibt_mcg_info_t)); 2947 kmem_free(mce, sizeof (ibd_mce_t)); 2948 return (omce); 2949 } 2950 mutex_enter(&state->id_mc_mutex); 2951 IBD_MCACHE_INSERT_FULL(state, mce); 2952 mutex_exit(&state->id_mc_mutex); 2953 } 2954 2955 return (mce); 2956 } 2957 2958 /* 2959 * Called during port up event handling to attempt to reacquire full 2960 * membership to an mcg. Stripped down version of ibd_join_group(). 2961 * Note that it is possible that the mcg might have gone away, and 2962 * gets recreated at this point. 2963 */ 2964 static void 2965 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2966 { 2967 ib_gid_t mgid; 2968 2969 /* 2970 * If the mc_fullreap flag is set, or this join fails, a subsequent 2971 * reap/leave is going to try to leave the group. We could prevent 2972 * that by adding a boolean flag into ibd_mce_t, if required. 
2973 */ 2974 if (mce->mc_fullreap) 2975 return; 2976 2977 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2978 2979 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2980 mgid.gid_guid); 2981 2982 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2983 ibd_print_warn(state, "Failure on port up to rejoin " 2984 "multicast gid %016llx:%016llx", 2985 (u_longlong_t)mgid.gid_prefix, 2986 (u_longlong_t)mgid.gid_guid); 2987 } 2988 2989 /* 2990 * This code handles delayed Tx completion cleanups for mcg's to which 2991 * disable_multicast has been issued, regular mcg related cleanups during 2992 * disable_multicast, disable_promiscuous and mcg traps, as well as 2993 * cleanups during driver detach time. Depending on the join state, 2994 * it deletes the mce from the appropriate list and issues the IBA 2995 * leave/detach; except in the disable_multicast case when the mce 2996 * is left on the active list for a subsequent Tx completion cleanup. 2997 */ 2998 static void 2999 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3000 uint8_t jstate) 3001 { 3002 ibd_mce_t *tmce; 3003 boolean_t do_detach = B_TRUE; 3004 3005 /* 3006 * Before detaching, we must check whether the other list 3007 * contains the mcg; if we detach blindly, the consumer 3008 * who set up the other list will also stop receiving 3009 * traffic. 3010 */ 3011 if (jstate == IB_MC_JSTATE_FULL) { 3012 /* 3013 * The following check is only relevant while coming 3014 * from the Tx completion path in the reap case. 3015 */ 3016 if (!mce->mc_fullreap) 3017 return; 3018 mutex_enter(&state->id_mc_mutex); 3019 IBD_MCACHE_PULLOUT_FULL(state, mce); 3020 mutex_exit(&state->id_mc_mutex); 3021 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3022 do_detach = B_FALSE; 3023 } else if (jstate == IB_MC_JSTATE_NON) { 3024 IBD_MCACHE_PULLOUT_NON(state, mce); 3025 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3026 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3027 do_detach = B_FALSE; 3028 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3029 mutex_enter(&state->id_mc_mutex); 3030 IBD_MCACHE_PULLOUT_FULL(state, mce); 3031 mutex_exit(&state->id_mc_mutex); 3032 do_detach = B_FALSE; 3033 } 3034 3035 /* 3036 * If we are reacting to a mcg trap and leaving our sendonly or 3037 * non membership, the mcg is possibly already gone, so attempting 3038 * to leave might fail. On the other hand, we must try to leave 3039 * anyway, since this might be a trap from long ago, and we could 3040 * have potentially sendonly joined to a recent incarnation of 3041 * the mcg and are about to lose track of this information. 3042 */ 3043 if (do_detach) { 3044 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3045 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3046 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3047 } 3048 3049 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3050 kmem_free(mce, sizeof (ibd_mce_t)); 3051 } 3052 3053 /* 3054 * Async code executed due to multicast and promiscuous disable requests 3055 * and mcg trap handling; also executed during driver detach. Mostly, a 3056 * leave and detach is done; except for the fullmember case when Tx 3057 * requests are pending, whence arrangements are made for subsequent 3058 * cleanup on Tx completion.
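 * ibd_leave_group() below relies on ibd_acache_recycle() to
 * decide whether the AH can be reaped right away, and on
 * ibd_async_reap_group() above for the actual list removal,
 * detach and leave.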
3059 */ 3060 static void 3061 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3062 { 3063 ipoib_mac_t mcmac; 3064 boolean_t recycled; 3065 ibd_mce_t *mce; 3066 3067 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3068 jstate, mgid.gid_prefix, mgid.gid_guid); 3069 3070 if (jstate == IB_MC_JSTATE_NON) { 3071 recycled = B_TRUE; 3072 mce = IBD_MCACHE_FIND_NON(state, mgid); 3073 /* 3074 * In case we are handling a mcg trap, we might not find 3075 * the mcg in the non list. 3076 */ 3077 if (mce == NULL) { 3078 return; 3079 } 3080 } else { 3081 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3082 3083 /* 3084 * In case we are handling a mcg trap, make sure the trap 3085 * is not arriving late; if we have an mce that indicates 3086 * that we are already a fullmember, that would be a clear 3087 * indication that the trap arrived late (ie, is for a 3088 * previous incarnation of the mcg). 3089 */ 3090 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3091 if ((mce == NULL) || (mce->mc_jstate == 3092 IB_MC_JSTATE_FULL)) { 3093 return; 3094 } 3095 } else { 3096 ASSERT(jstate == IB_MC_JSTATE_FULL); 3097 3098 /* 3099 * If join group failed, mce will be NULL here. 3100 * This is because in GLDv3 driver, set multicast 3101 * will always return success. 3102 */ 3103 if (mce == NULL) { 3104 return; 3105 } 3106 3107 mce->mc_fullreap = B_TRUE; 3108 } 3109 3110 /* 3111 * If no pending Tx's remain that reference the AH 3112 * for the mcg, recycle it from active to free list. 3113 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3114 * so the last completing Tx will cause an async reap 3115 * operation to be invoked, at which time we will drop our 3116 * membership to the mcg so that the pending Tx's complete 3117 * successfully. Refer to comments on "AH and MCE active 3118 * list manipulation" at top of this file. The lock protects 3119 * against Tx fast path and Tx cleanup code. 3120 */ 3121 mutex_enter(&state->id_ac_mutex); 3122 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3123 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3124 IB_MC_JSTATE_SEND_ONLY_NON)); 3125 mutex_exit(&state->id_ac_mutex); 3126 } 3127 3128 if (recycled) { 3129 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3130 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3131 ibd_async_reap_group(state, mce, mgid, jstate); 3132 } 3133 } 3134 3135 /* 3136 * Find the broadcast address as defined by IPoIB; implicitly 3137 * determines the IBA scope, mtu, tclass etc of the link the 3138 * interface is going to be a member of. 3139 */ 3140 static ibt_status_t 3141 ibd_find_bgroup(ibd_state_t *state) 3142 { 3143 ibt_mcg_attr_t mcg_attr; 3144 uint_t numg; 3145 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3146 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3147 IB_MC_SCOPE_GLOBAL }; 3148 int i, mcgmtu; 3149 boolean_t found = B_FALSE; 3150 int ret; 3151 ibt_mcg_info_t mcg_info; 3152 3153 state->id_bgroup_created = B_FALSE; 3154 3155 query_bcast_grp: 3156 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3157 mcg_attr.mc_pkey = state->id_pkey; 3158 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3159 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3160 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3161 3162 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3163 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3164 3165 /* 3166 * Look for the IPoIB broadcast group. 
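 * The MGID is built in pieces: the guid was set above to
 * IB_MGID_IPV4_LOWGRP_MASK, and the prefix below packs the
 * IPoIB IPv4 signature (IB_MCGID_IPV4_PREFIX), the scope being
 * tried and our P_Key.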
3167 */ 3168 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3169 state->id_mgid.gid_prefix = 3170 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3171 ((uint64_t)state->id_scope << 48) | 3172 ((uint32_t)(state->id_pkey << 16))); 3173 mcg_attr.mc_mgid = state->id_mgid; 3174 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3175 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3176 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3177 found = B_TRUE; 3178 break; 3179 } 3180 } 3181 3182 if (!found) { 3183 if (ibd_create_broadcast_group) { 3184 /* 3185 * If we created the broadcast group, but failed to 3186 * find it, we can't do anything except leave the 3187 * one we created and return failure. 3188 */ 3189 if (state->id_bgroup_created) { 3190 ibd_print_warn(state, "IPoIB broadcast group " 3191 "absent. Unable to query after create."); 3192 goto find_bgroup_fail; 3193 } 3194 3195 /* 3196 * Create the ipoib broadcast group if it didn't exist 3197 */ 3198 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3199 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3200 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3201 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3202 mcg_attr.mc_pkey = state->id_pkey; 3203 mcg_attr.mc_flow = 0; 3204 mcg_attr.mc_sl = 0; 3205 mcg_attr.mc_tclass = 0; 3206 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3207 state->id_mgid.gid_prefix = 3208 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3209 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3210 ((uint32_t)(state->id_pkey << 16))); 3211 mcg_attr.mc_mgid = state->id_mgid; 3212 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3213 3214 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3215 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3216 ibd_print_warn(state, "IPoIB broadcast group " 3217 "absent, create failed: ret = %d\n", ret); 3218 state->id_bgroup_created = B_FALSE; 3219 return (IBT_FAILURE); 3220 } 3221 state->id_bgroup_created = B_TRUE; 3222 goto query_bcast_grp; 3223 } else { 3224 ibd_print_warn(state, "IPoIB broadcast group absent"); 3225 return (IBT_FAILURE); 3226 } 3227 } 3228 3229 /* 3230 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 
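 * mc_mtu is an IB MTU code, so the byte value is computed as
 * 128 << mc_mtu (for instance, a code of 5 corresponds to 4096
 * bytes); the port MTU must be at least as large as the
 * group's MTU for us to use this broadcast group.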
3231 */ 3232 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3233 if (state->id_mtu < mcgmtu) { 3234 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3235 "greater than port's maximum MTU %d", mcgmtu, 3236 state->id_mtu); 3237 ibt_free_mcg_info(state->id_mcinfo, 1); 3238 goto find_bgroup_fail; 3239 } 3240 state->id_mtu = mcgmtu; 3241 3242 return (IBT_SUCCESS); 3243 3244 find_bgroup_fail: 3245 if (state->id_bgroup_created) { 3246 (void) ibt_leave_mcg(state->id_sgid, 3247 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3248 IB_MC_JSTATE_FULL); 3249 } 3250 3251 return (IBT_FAILURE); 3252 } 3253 3254 static int 3255 ibd_alloc_tx_copybufs(ibd_state_t *state) 3256 { 3257 ibt_mr_attr_t mem_attr; 3258 3259 /* 3260 * Allocate one big chunk for all regular tx copy bufs 3261 */ 3262 state->id_tx_buf_sz = state->id_mtu; 3263 if (state->id_lso_policy && state->id_lso_capable && 3264 (IBD_TX_BUF_SZ > state->id_mtu)) { 3265 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3266 } 3267 3268 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3269 state->id_tx_buf_sz, KM_SLEEP); 3270 3271 state->id_tx_wqes = kmem_zalloc(state->id_num_swqe * 3272 sizeof (ibd_swqe_t), KM_SLEEP); 3273 3274 /* 3275 * Do one memory registration on the entire txbuf area 3276 */ 3277 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3278 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3279 mem_attr.mr_as = NULL; 3280 mem_attr.mr_flags = IBT_MR_SLEEP; 3281 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3282 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3283 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3284 kmem_free(state->id_tx_wqes, 3285 state->id_num_swqe * sizeof (ibd_swqe_t)); 3286 kmem_free(state->id_tx_bufs, 3287 state->id_num_swqe * state->id_tx_buf_sz); 3288 state->id_tx_bufs = NULL; 3289 return (DDI_FAILURE); 3290 } 3291 3292 return (DDI_SUCCESS); 3293 } 3294 3295 static int 3296 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3297 { 3298 ibt_mr_attr_t mem_attr; 3299 ibd_lsobuf_t *buflist; 3300 ibd_lsobuf_t *lbufp; 3301 ibd_lsobuf_t *tail; 3302 ibd_lsobkt_t *bktp; 3303 uint8_t *membase; 3304 uint8_t *memp; 3305 uint_t memsz; 3306 int i; 3307 3308 /* 3309 * Allocate the lso bucket 3310 */ 3311 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3312 3313 /* 3314 * Allocate the entire lso memory and register it 3315 */ 3316 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3317 membase = kmem_zalloc(memsz, KM_SLEEP); 3318 3319 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3320 mem_attr.mr_len = memsz; 3321 mem_attr.mr_as = NULL; 3322 mem_attr.mr_flags = IBT_MR_SLEEP; 3323 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3324 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3325 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3326 kmem_free(membase, memsz); 3327 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3328 return (DDI_FAILURE); 3329 } 3330 3331 mutex_enter(&state->id_lso_lock); 3332 3333 /* 3334 * Now allocate the buflist. Note that the elements in the buflist and 3335 * the buffers in the lso memory have a permanent 1-1 relation, so we 3336 * can always derive the address of a buflist entry from the address of 3337 * an lso buffer. 
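 * (ibd_release_lsobufs() relies on this: it recovers the
 * buflist element for a returned buffer as
 * bkt_bufl + (va - bkt_mem) / IBD_LSO_BUFSZ.)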
3338 */ 3339 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3340 KM_SLEEP); 3341 3342 /* 3343 * Set up the lso buf chain 3344 */ 3345 memp = membase; 3346 lbufp = buflist; 3347 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3348 lbufp->lb_isfree = 1; 3349 lbufp->lb_buf = memp; 3350 lbufp->lb_next = lbufp + 1; 3351 3352 tail = lbufp; 3353 3354 memp += IBD_LSO_BUFSZ; 3355 lbufp++; 3356 } 3357 tail->lb_next = NULL; 3358 3359 /* 3360 * Set up the LSO buffer information in ibd state 3361 */ 3362 bktp->bkt_bufl = buflist; 3363 bktp->bkt_free_head = buflist; 3364 bktp->bkt_mem = membase; 3365 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3366 bktp->bkt_nfree = bktp->bkt_nelem; 3367 3368 state->id_lso = bktp; 3369 mutex_exit(&state->id_lso_lock); 3370 3371 return (DDI_SUCCESS); 3372 } 3373 3374 /* 3375 * Statically allocate Tx buffer list(s). 3376 */ 3377 static int 3378 ibd_init_txlist(ibd_state_t *state) 3379 { 3380 ibd_swqe_t *swqe; 3381 ibt_lkey_t lkey; 3382 int i; 3383 uint_t len; 3384 uint8_t *bufaddr; 3385 3386 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3387 return (DDI_FAILURE); 3388 3389 if (state->id_lso_policy && state->id_lso_capable) { 3390 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3391 state->id_lso_policy = B_FALSE; 3392 } 3393 3394 mutex_enter(&state->id_tx_list.dl_mutex); 3395 state->id_tx_list.dl_head = NULL; 3396 state->id_tx_list.dl_pending_sends = B_FALSE; 3397 state->id_tx_list.dl_cnt = 0; 3398 mutex_exit(&state->id_tx_list.dl_mutex); 3399 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3400 state->id_tx_rel_list.dl_head = NULL; 3401 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3402 state->id_tx_rel_list.dl_cnt = 0; 3403 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3404 3405 /* 3406 * Allocate and setup the swqe list 3407 */ 3408 lkey = state->id_tx_mr_desc.md_lkey; 3409 bufaddr = state->id_tx_bufs; 3410 len = state->id_tx_buf_sz; 3411 swqe = state->id_tx_wqes; 3412 mutex_enter(&state->id_tx_list.dl_mutex); 3413 for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) { 3414 swqe->swqe_next = NULL; 3415 swqe->swqe_im_mblk = NULL; 3416 3417 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3418 bufaddr; 3419 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3420 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3421 3422 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3423 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3424 swqe->w_swr.wr_trans = IBT_UD_SRV; 3425 3426 /* These are set in send */ 3427 swqe->w_swr.wr_nds = 0; 3428 swqe->w_swr.wr_sgl = NULL; 3429 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3430 3431 /* add to list */ 3432 state->id_tx_list.dl_cnt++; 3433 swqe->swqe_next = state->id_tx_list.dl_head; 3434 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3435 } 3436 mutex_exit(&state->id_tx_list.dl_mutex); 3437 3438 return (DDI_SUCCESS); 3439 } 3440 3441 static int 3442 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3443 uint32_t *nds_p) 3444 { 3445 ibd_lsobkt_t *bktp; 3446 ibd_lsobuf_t *lbufp; 3447 ibd_lsobuf_t *nextp; 3448 ibt_lkey_t lso_lkey; 3449 uint_t frag_sz; 3450 uint_t num_needed; 3451 int i; 3452 3453 ASSERT(sgl_p != NULL); 3454 ASSERT(nds_p != NULL); 3455 ASSERT(req_sz != 0); 3456 3457 /* 3458 * Determine how many bufs we'd need for the size requested 3459 */ 3460 num_needed = req_sz / IBD_LSO_BUFSZ; 3461 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3462 num_needed++; 3463 3464 mutex_enter(&state->id_lso_lock); 3465 3466 /* 3467 * If we don't have enough lso bufs, return failure 3468 */ 3469 ASSERT(state->id_lso 
!= NULL); 3470 bktp = state->id_lso; 3471 if (bktp->bkt_nfree < num_needed) { 3472 mutex_exit(&state->id_lso_lock); 3473 return (-1); 3474 } 3475 3476 /* 3477 * Pick the first 'num_needed' bufs from the free list 3478 */ 3479 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3480 lbufp = bktp->bkt_free_head; 3481 for (i = 0; i < num_needed; i++) { 3482 ASSERT(lbufp->lb_isfree != 0); 3483 ASSERT(lbufp->lb_buf != NULL); 3484 3485 nextp = lbufp->lb_next; 3486 3487 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3488 sgl_p[i].ds_key = lso_lkey; 3489 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3490 3491 lbufp->lb_isfree = 0; 3492 lbufp->lb_next = NULL; 3493 3494 lbufp = nextp; 3495 } 3496 bktp->bkt_free_head = lbufp; 3497 3498 /* 3499 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3500 * to adjust the last sgl entry's length. Since we know we need atleast 3501 * one, the i-1 use below is ok. 3502 */ 3503 if (frag_sz) { 3504 sgl_p[i-1].ds_len = frag_sz; 3505 } 3506 3507 /* 3508 * Update nfree count and return 3509 */ 3510 bktp->bkt_nfree -= num_needed; 3511 3512 mutex_exit(&state->id_lso_lock); 3513 3514 *nds_p = num_needed; 3515 3516 return (0); 3517 } 3518 3519 static void 3520 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3521 { 3522 ibd_lsobkt_t *bktp; 3523 ibd_lsobuf_t *lbufp; 3524 uint8_t *lso_mem_end; 3525 uint_t ndx; 3526 int i; 3527 3528 mutex_enter(&state->id_lso_lock); 3529 3530 bktp = state->id_lso; 3531 ASSERT(bktp != NULL); 3532 3533 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3534 for (i = 0; i < nds; i++) { 3535 uint8_t *va; 3536 3537 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3538 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3539 3540 /* 3541 * Figure out the buflist element this sgl buffer corresponds 3542 * to and put it back at the head 3543 */ 3544 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3545 lbufp = bktp->bkt_bufl + ndx; 3546 3547 ASSERT(lbufp->lb_isfree == 0); 3548 ASSERT(lbufp->lb_buf == va); 3549 3550 lbufp->lb_isfree = 1; 3551 lbufp->lb_next = bktp->bkt_free_head; 3552 bktp->bkt_free_head = lbufp; 3553 } 3554 bktp->bkt_nfree += nds; 3555 3556 mutex_exit(&state->id_lso_lock); 3557 } 3558 3559 static void 3560 ibd_free_tx_copybufs(ibd_state_t *state) 3561 { 3562 /* 3563 * Unregister txbuf mr 3564 */ 3565 if (ibt_deregister_mr(state->id_hca_hdl, 3566 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3567 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3568 } 3569 state->id_tx_mr_hdl = NULL; 3570 3571 /* 3572 * Free txbuf memory 3573 */ 3574 kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t)); 3575 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3576 state->id_tx_wqes = NULL; 3577 state->id_tx_bufs = NULL; 3578 } 3579 3580 static void 3581 ibd_free_tx_lsobufs(ibd_state_t *state) 3582 { 3583 ibd_lsobkt_t *bktp; 3584 3585 mutex_enter(&state->id_lso_lock); 3586 3587 if ((bktp = state->id_lso) == NULL) { 3588 mutex_exit(&state->id_lso_lock); 3589 return; 3590 } 3591 3592 /* 3593 * First, free the buflist 3594 */ 3595 ASSERT(bktp->bkt_bufl != NULL); 3596 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3597 3598 /* 3599 * Unregister the LSO memory and free it 3600 */ 3601 ASSERT(bktp->bkt_mr_hdl != NULL); 3602 if (ibt_deregister_mr(state->id_hca_hdl, 3603 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3604 DPRINT(10, 3605 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3606 } 3607 ASSERT(bktp->bkt_mem); 3608 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 
3609 3610 /* 3611 * Finally free the bucket 3612 */ 3613 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3614 state->id_lso = NULL; 3615 3616 mutex_exit(&state->id_lso_lock); 3617 } 3618 3619 /* 3620 * Free the statically allocated Tx buffer list. 3621 */ 3622 static void 3623 ibd_fini_txlist(ibd_state_t *state) 3624 { 3625 /* 3626 * Free the allocated swqes 3627 */ 3628 mutex_enter(&state->id_tx_list.dl_mutex); 3629 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3630 state->id_tx_list.dl_head = NULL; 3631 state->id_tx_list.dl_pending_sends = B_FALSE; 3632 state->id_tx_list.dl_cnt = 0; 3633 state->id_tx_rel_list.dl_head = NULL; 3634 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3635 state->id_tx_rel_list.dl_cnt = 0; 3636 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3637 mutex_exit(&state->id_tx_list.dl_mutex); 3638 3639 ibd_free_tx_lsobufs(state); 3640 ibd_free_tx_copybufs(state); 3641 } 3642 3643 /* 3644 * post a list of rwqes, NULL terminated. 3645 */ 3646 static void 3647 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3648 { 3649 uint_t i; 3650 uint_t num_posted; 3651 ibt_status_t ibt_status; 3652 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3653 3654 while (rwqe) { 3655 /* Post up to IBD_RX_POST_CNT receive work requests */ 3656 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3657 wrs[i] = rwqe->w_rwr; 3658 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3659 if (rwqe == NULL) { 3660 i++; 3661 break; 3662 } 3663 } 3664 3665 /* 3666 * If posting fails for some reason, we'll never receive 3667 * completion intimation, so we'll need to cleanup. But 3668 * we need to make sure we don't clean up nodes whose 3669 * wrs have been successfully posted. We assume that the 3670 * hca driver returns on the first failure to post and 3671 * therefore the first 'num_posted' entries don't need 3672 * cleanup here. 3673 */ 3674 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3675 3676 num_posted = 0; 3677 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3678 &num_posted); 3679 if (ibt_status != IBT_SUCCESS) { 3680 /* This cannot happen unless the device has an error. */ 3681 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3682 "posting multiple wrs failed: " 3683 "requested=%d, done=%d, ret=%d", 3684 IBD_RX_POST_CNT, num_posted, ibt_status); 3685 atomic_add_32(&state->id_rx_list.dl_cnt, 3686 num_posted - i); 3687 } 3688 } 3689 } 3690 3691 /* 3692 * Grab a list of rwqes from the array of lists, and post the list. 3693 */ 3694 static void 3695 ibd_post_recv_intr(ibd_state_t *state) 3696 { 3697 ibd_rx_queue_t *rxp; 3698 ibd_rwqe_t *list; 3699 3700 /* rotate through the rx_queue array, expecting an adequate number */ 3701 state->id_rx_post_queue_index = 3702 (state->id_rx_post_queue_index + 1) & 3703 (state->id_rx_nqueues - 1); 3704 3705 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3706 mutex_enter(&rxp->rx_post_lock); 3707 list = WQE_TO_RWQE(rxp->rx_head); 3708 rxp->rx_head = NULL; 3709 rxp->rx_cnt = 0; 3710 mutex_exit(&rxp->rx_post_lock); 3711 ibd_post_recv_list(state, list); 3712 } 3713 3714 /* macro explained below */ 3715 #define RX_QUEUE_HASH(rwqe) \ 3716 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3717 3718 /* 3719 * Add a rwqe to one of the the Rx lists. If the list is large enough 3720 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3721 * 3722 * Note: one of 2^N lists is chosen via a hash. This is done 3723 * because using one list is contentious. If the first list is busy 3724 * (mutex_tryenter fails), use a second list (just call mutex_enter). 
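 *
 * Purely as an illustration (the address is made up): with id_rx_nqueues
 * set to 16 (IBD_LOG_RX_POST of 4), a rwqe at address 0x30001200 hashes
 * to ((0x30001200 >> 8) & 0xf) == 0x2, i.e. rx post queue 2.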
3725 * 3726 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3727 * even distribution of mapping rwqes to the 2^N queues. 3728 */ 3729 static void 3730 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3731 { 3732 ibd_rx_queue_t *rxp; 3733 3734 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3735 3736 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3737 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3738 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3739 mutex_enter(&rxp->rx_post_lock); 3740 } 3741 rwqe->rwqe_next = rxp->rx_head; 3742 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 3743 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 3744 3745 /* only call ibt_post_recv() every Nth time through here */ 3746 if ((active & (state->id_rx_nqueues - 1)) == 0) { 3747 rxp->rx_head = NULL; 3748 rxp->rx_cnt = 0; 3749 mutex_exit(&rxp->rx_post_lock); 3750 ibd_post_recv_list(state, rwqe); 3751 return; 3752 } 3753 } 3754 rxp->rx_head = RWQE_TO_WQE(rwqe); 3755 mutex_exit(&rxp->rx_post_lock); 3756 } 3757 3758 static int 3759 ibd_alloc_rx_copybufs(ibd_state_t *state) 3760 { 3761 ibt_mr_attr_t mem_attr; 3762 int i; 3763 3764 /* 3765 * Allocate one big chunk for all regular rx copy bufs 3766 */ 3767 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 3768 3769 state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe * 3770 state->id_rx_buf_sz, KM_SLEEP); 3771 3772 state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe * 3773 sizeof (ibd_rwqe_t), KM_SLEEP); 3774 3775 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 3776 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 3777 sizeof (ibd_rx_queue_t), KM_SLEEP); 3778 for (i = 0; i < state->id_rx_nqueues; i++) { 3779 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3780 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 3781 } 3782 3783 /* 3784 * Do one memory registration on the entire rxbuf area 3785 */ 3786 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 3787 mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz; 3788 mem_attr.mr_as = NULL; 3789 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3790 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3791 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 3792 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 3793 kmem_free(state->id_rx_wqes, 3794 state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3795 kmem_free(state->id_rx_bufs, 3796 state->id_num_rwqe * state->id_rx_buf_sz); 3797 state->id_rx_bufs = NULL; 3798 state->id_rx_wqes = NULL; 3799 return (DDI_FAILURE); 3800 } 3801 3802 return (DDI_SUCCESS); 3803 } 3804 3805 /* 3806 * Allocate the statically allocated Rx buffer list. 3807 */ 3808 static int 3809 ibd_init_rxlist(ibd_state_t *state) 3810 { 3811 ibd_rwqe_t *rwqe, *next; 3812 ibd_wqe_t *list; 3813 ibt_lkey_t lkey; 3814 int i; 3815 uint_t len; 3816 uint8_t *bufaddr; 3817 3818 mutex_enter(&state->id_rx_free_list.dl_mutex); 3819 if (state->id_rx_free_list.dl_head != NULL) { 3820 /* rx rsrcs were never freed. 
Just repost them */ 3821 len = state->id_rx_buf_sz; 3822 list = state->id_rx_free_list.dl_head; 3823 state->id_rx_free_list.dl_head = NULL; 3824 state->id_rx_free_list.dl_cnt = 0; 3825 mutex_exit(&state->id_rx_free_list.dl_mutex); 3826 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3827 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3828 if ((rwqe->rwqe_im_mblk = desballoc( 3829 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 3830 &rwqe->w_freemsg_cb)) == NULL) { 3831 /* allow freemsg_cb to free the rwqes */ 3832 if (atomic_dec_32_nv(&state->id_running) != 0) { 3833 cmn_err(CE_WARN, "ibd_init_rxlist: " 3834 "id_running was not 1\n"); 3835 } 3836 DPRINT(10, "ibd_init_rxlist : " 3837 "failed in desballoc()"); 3838 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3839 rwqe = next) { 3840 next = WQE_TO_RWQE(rwqe->rwqe_next); 3841 freemsg(rwqe->rwqe_im_mblk); 3842 } 3843 atomic_inc_32(&state->id_running); 3844 return (DDI_FAILURE); 3845 } 3846 } 3847 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3848 return (DDI_SUCCESS); 3849 } 3850 mutex_exit(&state->id_rx_free_list.dl_mutex); 3851 3852 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 3853 return (DDI_FAILURE); 3854 3855 /* 3856 * Allocate and setup the rwqe list 3857 */ 3858 len = state->id_rx_buf_sz; 3859 lkey = state->id_rx_mr_desc.md_lkey; 3860 rwqe = state->id_rx_wqes; 3861 bufaddr = state->id_rx_bufs; 3862 list = NULL; 3863 for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) { 3864 rwqe->w_state = state; 3865 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3866 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3867 3868 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 3869 3870 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 3871 &rwqe->w_freemsg_cb)) == NULL) { 3872 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 3873 /* allow freemsg_cb to free the rwqes */ 3874 if (atomic_dec_32_nv(&state->id_running) != 0) { 3875 cmn_err(CE_WARN, "ibd_init_rxlist: " 3876 "id_running was not 1\n"); 3877 } 3878 DPRINT(10, "ibd_init_rxlist : " 3879 "failed in desballoc()"); 3880 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3881 rwqe = next) { 3882 next = WQE_TO_RWQE(rwqe->rwqe_next); 3883 freemsg(rwqe->rwqe_im_mblk); 3884 } 3885 atomic_inc_32(&state->id_running); 3886 return (DDI_FAILURE); 3887 } 3888 3889 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 3890 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3891 (ib_vaddr_t)(uintptr_t)bufaddr; 3892 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 3893 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3894 rwqe->w_rwr.wr_nds = 1; 3895 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3896 3897 rwqe->rwqe_next = list; 3898 list = RWQE_TO_WQE(rwqe); 3899 } 3900 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3901 3902 return (DDI_SUCCESS); 3903 } 3904 3905 static void 3906 ibd_free_rx_copybufs(ibd_state_t *state) 3907 { 3908 int i; 3909 3910 /* 3911 * Unregister rxbuf mr 3912 */ 3913 if (ibt_deregister_mr(state->id_hca_hdl, 3914 state->id_rx_mr_hdl) != IBT_SUCCESS) { 3915 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 3916 } 3917 state->id_rx_mr_hdl = NULL; 3918 3919 /* 3920 * Free rxbuf memory 3921 */ 3922 for (i = 0; i < state->id_rx_nqueues; i++) { 3923 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3924 mutex_destroy(&rxp->rx_post_lock); 3925 } 3926 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 3927 sizeof (ibd_rx_queue_t)); 3928 kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3929 kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz); 3930 state->id_rx_queues = NULL; 3931 
state->id_rx_wqes = NULL; 3932 state->id_rx_bufs = NULL; 3933 } 3934 3935 static void 3936 ibd_free_rx_rsrcs(ibd_state_t *state) 3937 { 3938 mutex_enter(&state->id_rx_free_list.dl_mutex); 3939 if (state->id_rx_free_list.dl_head == NULL) { 3940 /* already freed */ 3941 mutex_exit(&state->id_rx_free_list.dl_mutex); 3942 return; 3943 } 3944 ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe); 3945 ibd_free_rx_copybufs(state); 3946 state->id_rx_free_list.dl_cnt = 0; 3947 state->id_rx_free_list.dl_head = NULL; 3948 mutex_exit(&state->id_rx_free_list.dl_mutex); 3949 } 3950 3951 /* 3952 * Free the statically allocated Rx buffer list. 3953 * 3954 */ 3955 static void 3956 ibd_fini_rxlist(ibd_state_t *state) 3957 { 3958 ibd_rwqe_t *rwqe; 3959 int i; 3960 3961 /* run through the rx_queue's, calling freemsg() */ 3962 for (i = 0; i < state->id_rx_nqueues; i++) { 3963 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3964 mutex_enter(&rxp->rx_post_lock); 3965 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 3966 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3967 freemsg(rwqe->rwqe_im_mblk); 3968 rxp->rx_cnt--; 3969 } 3970 rxp->rx_head = NULL; 3971 mutex_exit(&rxp->rx_post_lock); 3972 } 3973 3974 /* cannot free rx resources unless gld returned everything */ 3975 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 3976 ibd_free_rx_rsrcs(state); 3977 } 3978 3979 /* 3980 * Free an allocated recv wqe. 3981 */ 3982 /* ARGSUSED */ 3983 static void 3984 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3985 { 3986 /* 3987 * desballoc() failed (no memory). 3988 * 3989 * This rwqe is placed on a free list so that it 3990 * can be reinstated when memory is available. 3991 * 3992 * NOTE: no code currently exists to reinstate 3993 * these "lost" rwqes. 3994 */ 3995 mutex_enter(&state->id_rx_free_list.dl_mutex); 3996 state->id_rx_free_list.dl_cnt++; 3997 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 3998 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 3999 mutex_exit(&state->id_rx_free_list.dl_mutex); 4000 } 4001 4002 /* 4003 * IBA Rx completion queue handler. Guaranteed to be single 4004 * threaded and nonreentrant for this CQ. 4005 */ 4006 /* ARGSUSED */ 4007 static void 4008 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4009 { 4010 ibd_state_t *state = (ibd_state_t *)arg; 4011 4012 atomic_inc_64(&state->id_num_intrs); 4013 4014 if (ibd_rx_softintr == 1) { 4015 mutex_enter(&state->id_rcq_poll_lock); 4016 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4017 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4018 mutex_exit(&state->id_rcq_poll_lock); 4019 return; 4020 } else { 4021 mutex_exit(&state->id_rcq_poll_lock); 4022 ddi_trigger_softintr(state->id_rx); 4023 } 4024 } else 4025 (void) ibd_intr((caddr_t)state); 4026 } 4027 4028 /* 4029 * CQ handler for Tx completions, when the Tx CQ is in 4030 * interrupt driven mode. 4031 */ 4032 /* ARGSUSED */ 4033 static void 4034 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4035 { 4036 ibd_state_t *state = (ibd_state_t *)arg; 4037 4038 atomic_inc_64(&state->id_num_intrs); 4039 4040 if (ibd_tx_softintr == 1) { 4041 mutex_enter(&state->id_scq_poll_lock); 4042 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4043 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4044 mutex_exit(&state->id_scq_poll_lock); 4045 return; 4046 } else { 4047 mutex_exit(&state->id_scq_poll_lock); 4048 ddi_trigger_softintr(state->id_tx); 4049 } 4050 } else 4051 (void) ibd_tx_recycle((caddr_t)state); 4052 } 4053 4054 /* 4055 * Multicast group create/delete trap handler. 
These will be delivered 4056 * on a kernel thread (handling can thus block) and can be invoked 4057 * concurrently. The handler can be invoked anytime after it is 4058 * registered and before ibt_detach(). 4059 */ 4060 /* ARGSUSED */ 4061 static void 4062 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4063 ibt_subnet_event_t *event) 4064 { 4065 ibd_state_t *state = (ibd_state_t *)arg; 4066 ibd_req_t *req; 4067 4068 /* 4069 * The trap handler will get invoked once for every event for 4070 * every port. The input "gid" is the GID0 of the port the 4071 * trap came in on; we just need to act on traps that came 4072 * to our port, meaning the port on which the ipoib interface 4073 * resides. Since ipoib uses GID0 of the port, we just match 4074 * the gids to check whether we need to handle the trap. 4075 */ 4076 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4077 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4078 return; 4079 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4080 4081 DPRINT(10, "ibd_notices_handler : %d\n", code); 4082 4083 switch (code) { 4084 case IBT_SM_EVENT_UNAVAILABLE: 4085 /* 4086 * If we are in promiscuous mode or have 4087 * sendnonmembers, we need to print a warning 4088 * message right now. Else, just store the 4089 * information, print when we enter promiscuous 4090 * mode or attempt nonmember send. We might 4091 * also want to stop caching sendnonmember. 4092 */ 4093 ibd_print_warn(state, "IBA multicast support " 4094 "degraded due to unavailability of multicast " 4095 "traps"); 4096 break; 4097 case IBT_SM_EVENT_AVAILABLE: 4098 /* 4099 * If we printed a warning message above or 4100 * while trying to nonmember send or get into 4101 * promiscuous mode, print an okay message. 4102 */ 4103 ibd_print_warn(state, "IBA multicast support " 4104 "restored due to availability of multicast " 4105 "traps"); 4106 break; 4107 case IBT_SM_EVENT_MCG_CREATED: 4108 case IBT_SM_EVENT_MCG_DELETED: 4109 /* 4110 * Common processing of creation/deletion traps. 4111 * First check if the instance is being 4112 * [de]initialized; back off then, without doing 4113 * anything more, since we are not sure if the 4114 * async thread is around, or whether we might 4115 * be racing with the detach code in ibd_m_stop() 4116 * that scans the mcg list. 4117 */ 4118 if (!ibd_async_safe(state)) 4119 return; 4120 4121 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4122 req->rq_gid = event->sm_notice_gid; 4123 req->rq_ptr = (void *)code; 4124 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4125 break; 4126 } 4127 } 4128 4129 static void 4130 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4131 { 4132 ib_gid_t mgid = req->rq_gid; 4133 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4134 4135 DPRINT(10, "ibd_async_trap : %d\n", code); 4136 4137 /* 4138 * Atomically search the nonmember and sendonlymember lists and 4139 * delete. 4140 */ 4141 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4142 4143 if (state->id_prom_op == IBD_OP_COMPLETED) { 4144 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4145 4146 /* 4147 * If in promiscuous mode, try to join/attach to the new 4148 * mcg. Given the unreliable out-of-order mode of trap 4149 * delivery, we can never be sure whether it is a problem 4150 * if the join fails. Thus, we warn the admin of a failure 4151 * if this was a creation trap. 
Note that the trap might 4152 * actually be reporting a long past event, and the mcg 4153 * might already have been deleted, thus we might be warning 4154 * in vain. 4155 */ 4156 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4157 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4158 ibd_print_warn(state, "IBA promiscuous mode missed " 4159 "new multicast gid %016llx:%016llx", 4160 (u_longlong_t)mgid.gid_prefix, 4161 (u_longlong_t)mgid.gid_guid); 4162 } 4163 4164 /* 4165 * Free the request slot allocated by the subnet event thread. 4166 */ 4167 ibd_async_done(state); 4168 } 4169 4170 /* 4171 * GLDv3 entry point to get capabilities. 4172 */ 4173 static boolean_t 4174 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4175 { 4176 ibd_state_t *state = arg; 4177 4178 switch (cap) { 4179 case MAC_CAPAB_HCKSUM: { 4180 uint32_t *txflags = cap_data; 4181 4182 /* 4183 * We either do full checksum or not do it at all 4184 */ 4185 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4186 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4187 else 4188 return (B_FALSE); 4189 break; 4190 } 4191 4192 case MAC_CAPAB_LSO: { 4193 mac_capab_lso_t *cap_lso = cap_data; 4194 4195 /* 4196 * In addition to the capability and policy, since LSO 4197 * relies on hw checksum, we'll not enable LSO if we 4198 * don't have hw checksum. Of course, if the HCA doesn't 4199 * provide the reserved lkey capability, enabling LSO will 4200 * actually affect performance adversely, so we'll disable 4201 * LSO even for that case. 4202 */ 4203 if (!state->id_lso_policy || !state->id_lso_capable) 4204 return (B_FALSE); 4205 4206 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4207 return (B_FALSE); 4208 4209 if (state->id_hca_res_lkey_capab == 0) { 4210 ibd_print_warn(state, "no reserved-lkey capability, " 4211 "disabling LSO"); 4212 return (B_FALSE); 4213 } 4214 4215 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4216 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4217 break; 4218 } 4219 4220 default: 4221 return (B_FALSE); 4222 } 4223 4224 return (B_TRUE); 4225 } 4226 4227 static int 4228 ibd_get_port_details(ibd_state_t *state) 4229 { 4230 ibt_hca_portinfo_t *port_infop; 4231 ibt_status_t ret; 4232 uint_t psize, port_infosz; 4233 4234 mutex_enter(&state->id_link_mutex); 4235 4236 /* 4237 * Query for port information 4238 */ 4239 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4240 &port_infop, &psize, &port_infosz); 4241 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4242 mutex_exit(&state->id_link_mutex); 4243 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4244 "failed, ret=%d", ret); 4245 return (ENETDOWN); 4246 } 4247 4248 /* 4249 * If the link already went down by the time we get here, 4250 * give up 4251 */ 4252 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4253 mutex_exit(&state->id_link_mutex); 4254 ibt_free_portinfo(port_infop, port_infosz); 4255 DPRINT(10, "ibd_get_port_details: port is not active"); 4256 return (ENETDOWN); 4257 } 4258 4259 /* 4260 * If the link is active, verify the pkey 4261 */ 4262 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4263 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4264 mutex_exit(&state->id_link_mutex); 4265 ibt_free_portinfo(port_infop, port_infosz); 4266 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4267 "failed, ret=%d", ret); 4268 return (ENONET); 4269 } 4270 4271 state->id_mtu = (128 << port_infop->p_mtu); 4272 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4273 state->id_sgid = 
*port_infop->p_sgid_tbl; 4274 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4275 state->id_link_state = LINK_STATE_UP; 4276 4277 mutex_exit(&state->id_link_mutex); 4278 ibt_free_portinfo(port_infop, port_infosz); 4279 4280 /* 4281 * Now that the port is active, record the port speed 4282 */ 4283 state->id_link_speed = ibd_get_portspeed(state); 4284 4285 return (0); 4286 } 4287 4288 static int 4289 ibd_alloc_cqs(ibd_state_t *state) 4290 { 4291 ibt_hca_attr_t hca_attrs; 4292 ibt_cq_attr_t cq_attr; 4293 ibt_status_t ret; 4294 uint32_t real_size; 4295 4296 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4297 ASSERT(ret == IBT_SUCCESS); 4298 4299 /* 4300 * Allocate Rx/combined CQ: 4301 * Theoretically, there is no point in having more than #rwqe 4302 * plus #swqe cqe's, except that the CQ will be signaled for 4303 * overflow when the last wqe completes, if none of the previous 4304 * cqe's have been polled. Thus, we allocate just a few less wqe's 4305 * to make sure such overflow does not occur. 4306 */ 4307 cq_attr.cq_sched = NULL; 4308 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4309 4310 /* 4311 * Allocate Receive CQ. 4312 */ 4313 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4314 cq_attr.cq_size = state->id_num_rwqe + 1; 4315 } else { 4316 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4317 state->id_num_rwqe = cq_attr.cq_size - 1; 4318 } 4319 4320 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4321 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4322 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4323 "failed, ret=%d\n", ret); 4324 return (DDI_FAILURE); 4325 } 4326 4327 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4328 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4329 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4330 "moderation failed, ret=%d\n", ret); 4331 } 4332 4333 /* make the #rx wc's the same as max rx chain size */ 4334 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 4335 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4336 state->id_rxwcs_size, KM_SLEEP); 4337 4338 /* 4339 * Allocate Send CQ. 4340 */ 4341 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4342 cq_attr.cq_size = state->id_num_swqe + 1; 4343 } else { 4344 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4345 state->id_num_swqe = cq_attr.cq_size - 1; 4346 } 4347 4348 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4349 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4350 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4351 "failed, ret=%d\n", ret); 4352 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4353 state->id_rxwcs_size); 4354 (void) ibt_free_cq(state->id_rcq_hdl); 4355 return (DDI_FAILURE); 4356 } 4357 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4358 ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) { 4359 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4360 "moderation failed, ret=%d\n", ret); 4361 } 4362 4363 state->id_txwcs_size = IBD_TX_POLL_THRESH; 4364 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4365 state->id_txwcs_size, KM_SLEEP); 4366 4367 /* 4368 * Print message in case we could not allocate as many wqe's 4369 * as was requested. 
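 * (That can only happen if hca_max_cq_sz clipped id_num_rwqe or
 * id_num_swqe during the CQ sizing above.)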
4370 */ 4371 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4372 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4373 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4374 } 4375 if (state->id_num_swqe != IBD_NUM_SWQE) { 4376 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4377 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4378 } 4379 4380 return (DDI_SUCCESS); 4381 } 4382 4383 static int 4384 ibd_setup_ud_channel(ibd_state_t *state) 4385 { 4386 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4387 ibt_ud_chan_query_attr_t ud_chan_attr; 4388 ibt_status_t ret; 4389 4390 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 4391 if (state->id_hca_res_lkey_capab) 4392 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4393 if (state->id_lso_policy && state->id_lso_capable) 4394 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4395 4396 ud_alloc_attr.ud_hca_port_num = state->id_port; 4397 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4398 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4399 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4400 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4401 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4402 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4403 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4404 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4405 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4406 ud_alloc_attr.ud_clone_chan = NULL; 4407 4408 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4409 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4410 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4411 "failed, ret=%d\n", ret); 4412 return (DDI_FAILURE); 4413 } 4414 4415 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4416 &ud_chan_attr)) != IBT_SUCCESS) { 4417 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4418 "failed, ret=%d\n", ret); 4419 (void) ibt_free_channel(state->id_chnl_hdl); 4420 return (DDI_FAILURE); 4421 } 4422 4423 state->id_qpnum = ud_chan_attr.ud_qpn; 4424 4425 return (DDI_SUCCESS); 4426 } 4427 4428 static int 4429 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4430 { 4431 uint32_t progress = state->id_mac_state; 4432 uint_t attempts; 4433 ibt_status_t ret; 4434 ib_gid_t mgid; 4435 ibd_mce_t *mce; 4436 uint8_t jstate; 4437 4438 if (atomic_dec_32_nv(&state->id_running) != 0) 4439 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 4440 4441 /* 4442 * Before we try to stop/undo whatever we did in ibd_start(), 4443 * we need to mark the link state appropriately to prevent the 4444 * ip layer from using this instance for any new transfers. Note 4445 * that if the original state of the link was "up" when we're 4446 * here, we'll set the final link state to "unknown", to behave 4447 * in the same fashion as other ethernet drivers. 4448 */ 4449 mutex_enter(&state->id_link_mutex); 4450 if (cur_link_state == LINK_STATE_DOWN) { 4451 state->id_link_state = cur_link_state; 4452 } else { 4453 state->id_link_state = LINK_STATE_UNKNOWN; 4454 } 4455 mutex_exit(&state->id_link_mutex); 4456 mac_link_update(state->id_mh, state->id_link_state); 4457 4458 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4459 if (progress & IBD_DRV_STARTED) { 4460 state->id_mac_state &= (~IBD_DRV_STARTED); 4461 } 4462 4463 /* 4464 * First, stop receive interrupts; this stops the driver from 4465 * handing up buffers to higher layers. Wait for receive buffers 4466 * to be returned and give up after 1 second. 
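 * The loop below implements that wait: it re-checks
 * id_rx_list.dl_bufs_outstanding up to 10 times with a 100ms delay
 * (drv_usectohz(100000)) between checks, i.e. roughly 1 second in all.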
4467 */ 4468 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4469 attempts = 10; 4470 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 4471 0) > 0) { 4472 delay(drv_usectohz(100000)); 4473 if (--attempts == 0) { 4474 /* 4475 * There are pending bufs with the network 4476 * layer and we have no choice but to wait 4477 * for them to be done with. Reap all the 4478 * Tx/Rx completions that were posted since 4479 * we turned off the notification and 4480 * return failure. 4481 */ 4482 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 4483 DPRINT(2, "ibd_undo_start: " 4484 "reclaiming failed"); 4485 break; 4486 } 4487 } 4488 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4489 } 4490 4491 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4492 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4493 4494 mutex_enter(&state->id_trap_lock); 4495 state->id_trap_stop = B_TRUE; 4496 while (state->id_trap_inprog > 0) 4497 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4498 mutex_exit(&state->id_trap_lock); 4499 4500 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4501 } 4502 4503 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4504 /* 4505 * Flushing the channel ensures that all pending WQE's 4506 * are marked with flush_error and handed to the CQ. It 4507 * does not guarantee the invocation of the CQ handler. 4508 * This call is guaranteed to return successfully for 4509 * UD QPNs. 4510 */ 4511 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4512 IBT_SUCCESS) { 4513 DPRINT(10, "ibd_undo_start: flush_channel " 4514 "failed, ret=%d", ret); 4515 } 4516 4517 /* 4518 * Give some time for the TX CQ handler to process the 4519 * completions. 4520 */ 4521 mutex_enter(&state->id_tx_list.dl_mutex); 4522 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4523 attempts = 10; 4524 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 4525 != state->id_num_swqe) { 4526 if (--attempts == 0) 4527 break; 4528 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4529 mutex_exit(&state->id_tx_list.dl_mutex); 4530 delay(drv_usectohz(100000)); 4531 mutex_enter(&state->id_tx_list.dl_mutex); 4532 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4533 } 4534 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4535 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 4536 state->id_num_swqe) { 4537 cmn_err(CE_WARN, "tx resources not freed\n"); 4538 } 4539 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4540 mutex_exit(&state->id_tx_list.dl_mutex); 4541 4542 attempts = 10; 4543 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4544 if (--attempts == 0) 4545 break; 4546 delay(drv_usectohz(100000)); 4547 } 4548 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4549 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4550 cmn_err(CE_WARN, "rx resources not freed\n"); 4551 } 4552 4553 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4554 } 4555 4556 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4557 /* 4558 * No new async requests will be posted since the device 4559 * link state has been marked as unknown; completion handlers 4560 * have been turned off, so Tx handler will not cause any 4561 * more IBD_ASYNC_REAP requests. 4562 * 4563 * Queue a request for the async thread to exit, which will 4564 * be serviced after any pending ones. This can take a while, 4565 * specially if the SM is unreachable, since IBMF will slowly 4566 * timeout each SM request issued by the async thread. 
Reap 4567 * the thread before continuing on, we do not want it to be 4568 * lingering in modunloaded code (or we could move the reap 4569 * to ibd_detach(), provided we keep track of the current 4570 * id_async_thrid somewhere safe). 4571 */ 4572 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4573 thread_join(state->id_async_thrid); 4574 4575 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4576 } 4577 4578 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4579 /* 4580 * Drop all residual full/non membership. This includes full 4581 * membership to the broadcast group, and any nonmembership 4582 * acquired during transmits. We do this after the Tx completion 4583 * handlers are done, since those might result in some late 4584 * leaves; this also eliminates a potential race with that 4585 * path wrt the mc full list insert/delete. Trap handling 4586 * has also been suppressed at this point. Thus, no locks 4587 * are required while traversing the mc full list. 4588 */ 4589 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4590 mce = list_head(&state->id_mc_full); 4591 while (mce != NULL) { 4592 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4593 jstate = mce->mc_jstate; 4594 mce = list_next(&state->id_mc_full, mce); 4595 ibd_leave_group(state, mgid, jstate); 4596 } 4597 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4598 } 4599 4600 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4601 ibd_fini_rxlist(state); 4602 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4603 } 4604 4605 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4606 ibd_fini_txlist(state); 4607 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4608 } 4609 4610 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4611 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4612 IBT_SUCCESS) { 4613 DPRINT(10, "ibd_undo_start: free_channel " 4614 "failed, ret=%d", ret); 4615 } 4616 4617 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4618 } 4619 4620 if (progress & IBD_DRV_CQS_ALLOCD) { 4621 kmem_free(state->id_txwcs, 4622 sizeof (ibt_wc_t) * state->id_txwcs_size); 4623 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4624 IBT_SUCCESS) { 4625 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4626 "failed, ret=%d", ret); 4627 } 4628 4629 kmem_free(state->id_rxwcs, 4630 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4631 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4632 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4633 "ret=%d", ret); 4634 } 4635 4636 state->id_txwcs = NULL; 4637 state->id_rxwcs = NULL; 4638 state->id_scq_hdl = NULL; 4639 state->id_rcq_hdl = NULL; 4640 4641 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4642 } 4643 4644 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4645 mutex_enter(&state->id_ac_mutex); 4646 mod_hash_destroy_hash(state->id_ah_active_hash); 4647 mutex_exit(&state->id_ac_mutex); 4648 ibd_acache_fini(state); 4649 4650 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4651 } 4652 4653 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4654 /* 4655 * If we'd created the ipoib broadcast group and had 4656 * successfully joined it, leave it now 4657 */ 4658 if (state->id_bgroup_created) { 4659 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4660 jstate = IB_MC_JSTATE_FULL; 4661 (void) ibt_leave_mcg(state->id_sgid, mgid, 4662 state->id_sgid, jstate); 4663 } 4664 ibt_free_mcg_info(state->id_mcinfo, 1); 4665 4666 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4667 } 4668 4669 return (DDI_SUCCESS); 4670 } 4671 4672 /* 4673 * These pair of routines are used to set/clear the condition that 4674 * the caller 
is likely to do something to change the id_mac_state. 4675 * If there's already someone doing either a start or a stop (possibly 4676 * due to the async handler detecting a pkey relocation event, a plumb 4677 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4678 * that's done. 4679 */ 4680 static void 4681 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4682 { 4683 mutex_enter(&state->id_macst_lock); 4684 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4685 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4686 4687 state->id_mac_state |= flag; 4688 mutex_exit(&state->id_macst_lock); 4689 } 4690 4691 static void 4692 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4693 { 4694 mutex_enter(&state->id_macst_lock); 4695 state->id_mac_state &= (~flag); 4696 cv_signal(&state->id_macst_cv); 4697 mutex_exit(&state->id_macst_lock); 4698 } 4699 4700 /* 4701 * GLDv3 entry point to start hardware. 4702 */ 4703 /*ARGSUSED*/ 4704 static int 4705 ibd_m_start(void *arg) 4706 { 4707 ibd_state_t *state = arg; 4708 int ret; 4709 4710 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4711 4712 ret = ibd_start(state); 4713 4714 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4715 4716 return (ret); 4717 } 4718 4719 static int 4720 ibd_start(ibd_state_t *state) 4721 { 4722 kthread_t *kht; 4723 int err; 4724 ibt_status_t ret; 4725 4726 if (state->id_mac_state & IBD_DRV_STARTED) 4727 return (DDI_SUCCESS); 4728 4729 if (atomic_inc_32_nv(&state->id_running) != 1) { 4730 DPRINT(10, "ibd_start: id_running is non-zero"); 4731 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 4732 atomic_dec_32(&state->id_running); 4733 return (EINVAL); 4734 } 4735 4736 /* 4737 * Get port details; if we fail here, very likely the port 4738 * state is inactive or the pkey can't be verified. 
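 * (ibd_get_port_details() returns ENETDOWN if the link is not active
 * and ENONET if the pkey-to-index lookup fails.)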
4739 */ 4740 if ((err = ibd_get_port_details(state)) != 0) { 4741 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4742 goto start_fail; 4743 } 4744 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4745 4746 /* 4747 * Find the IPoIB broadcast group 4748 */ 4749 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4750 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4751 err = ENOTACTIVE; 4752 goto start_fail; 4753 } 4754 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4755 4756 /* 4757 * Initialize per-interface caches and lists; if we fail here, 4758 * it is most likely due to a lack of resources 4759 */ 4760 if (ibd_acache_init(state) != DDI_SUCCESS) { 4761 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4762 err = ENOMEM; 4763 goto start_fail; 4764 } 4765 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4766 4767 /* 4768 * Allocate send and receive completion queues 4769 */ 4770 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4771 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4772 err = ENOMEM; 4773 goto start_fail; 4774 } 4775 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4776 4777 /* 4778 * Setup a UD channel 4779 */ 4780 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4781 err = ENOMEM; 4782 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4783 goto start_fail; 4784 } 4785 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4786 4787 /* 4788 * Allocate and initialize the tx buffer list 4789 */ 4790 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4791 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4792 err = ENOMEM; 4793 goto start_fail; 4794 } 4795 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4796 4797 /* 4798 * Create the send cq handler here 4799 */ 4800 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4801 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4802 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4803 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4804 "failed, ret=%d", ret); 4805 err = EINVAL; 4806 goto start_fail; 4807 } 4808 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4809 4810 /* 4811 * Allocate and initialize the rx buffer list 4812 */ 4813 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4814 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4815 err = ENOMEM; 4816 goto start_fail; 4817 } 4818 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4819 4820 /* 4821 * Join IPoIB broadcast group 4822 */ 4823 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4824 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4825 err = ENOTACTIVE; 4826 goto start_fail; 4827 } 4828 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4829 4830 /* 4831 * Create the async thread; thread_create never fails. 4832 */ 4833 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4834 TS_RUN, minclsyspri); 4835 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4836 state->id_async_thrid = kht->t_did; 4837 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4838 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4839 4840 /* 4841 * When we did mac_register() in ibd_attach(), we didn't register 4842 * the real macaddr and we didn't have the true port mtu. Now that 4843 * we're almost ready, set the local mac address and broadcast 4844 * addresses and update gldv3 about the real values of these 4845 * parameters. 
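 * The IPoIB hardware address handed to GLDv3 below is composed from
 * the UD channel's QPN and the port GID (prefix plus GUID); the
 * broadcast address is composed from IB_QPN_MASK and the broadcast
 * MGID.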
4846 */ 4847 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4848 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4849 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4850 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4851 4852 (void) mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4853 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4854 4855 /* 4856 * Setup the receive cq handler 4857 */ 4858 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4859 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4860 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4861 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4862 "failed, ret=%d", ret); 4863 err = EINVAL; 4864 goto start_fail; 4865 } 4866 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4867 4868 /* 4869 * Setup the subnet notices handler after we've initialized the acache/ 4870 * mcache and started the async thread, both of which are required for 4871 * the trap handler to function properly. 4872 * 4873 * Now that the async thread has been started (and we've already done 4874 * a mac_register() during attach so mac_tx_update() can be called 4875 * if necessary without any problem), we can enable the trap handler 4876 * to queue requests to the async thread. 4877 */ 4878 ibt_register_subnet_notices(state->id_ibt_hdl, 4879 ibd_snet_notices_handler, state); 4880 mutex_enter(&state->id_trap_lock); 4881 state->id_trap_stop = B_FALSE; 4882 mutex_exit(&state->id_trap_lock); 4883 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4884 4885 /* 4886 * Indicate link status to GLDv3 and higher layers. By default, 4887 * we assume we are in up state (which must have been true at 4888 * least at the time the broadcast mcg's were probed); if there 4889 * were any up/down transitions till the time we come here, the 4890 * async handler will have updated last known state, which we 4891 * use to tell GLDv3. The async handler will not send any 4892 * notifications to GLDv3 till we reach here in the initialization 4893 * sequence. 4894 */ 4895 state->id_mac_state |= IBD_DRV_STARTED; 4896 mac_link_update(state->id_mh, state->id_link_state); 4897 4898 return (DDI_SUCCESS); 4899 4900 start_fail: 4901 /* 4902 * If we ran into a problem during ibd_start() and ran into 4903 * some other problem during undoing our partial work, we can't 4904 * do anything about it. Ignore any errors we might get from 4905 * ibd_undo_start() and just return the original error we got. 4906 */ 4907 (void) ibd_undo_start(state, LINK_STATE_DOWN); 4908 return (err); 4909 } 4910 4911 /* 4912 * GLDv3 entry point to stop hardware from receiving packets. 4913 */ 4914 /*ARGSUSED*/ 4915 static void 4916 ibd_m_stop(void *arg) 4917 { 4918 ibd_state_t *state = (ibd_state_t *)arg; 4919 4920 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4921 4922 (void) ibd_undo_start(state, state->id_link_state); 4923 4924 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4925 } 4926 4927 /* 4928 * GLDv3 entry point to modify device's mac address. We do not 4929 * allow address modifications. 4930 */ 4931 static int 4932 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4933 { 4934 ibd_state_t *state = arg; 4935 4936 /* 4937 * Don't bother even comparing the macaddr if we haven't 4938 * completed ibd_m_start(). 
4939 */ 4940 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4941 return (0); 4942 4943 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4944 return (0); 4945 else 4946 return (EINVAL); 4947 } 4948 4949 /* 4950 * The blocking part of the IBA join/leave operations is done out 4951 * of here on the async thread. 4952 */ 4953 static void 4954 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4955 { 4956 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4957 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4958 4959 if (op == IBD_ASYNC_JOIN) { 4960 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4961 ibd_print_warn(state, "Join multicast group failed :" 4962 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4963 } 4964 } else { 4965 /* 4966 * Here, we must search for the proper mcg_info and 4967 * use that to leave the group. 4968 */ 4969 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4970 } 4971 } 4972 4973 /* 4974 * GLDv3 entry point for multicast enable/disable requests. 4975 * This function queues the operation to the async thread and 4976 * returns success for a valid multicast address. 4977 */ 4978 static int 4979 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4980 { 4981 ibd_state_t *state = (ibd_state_t *)arg; 4982 ipoib_mac_t maddr, *mcast; 4983 ib_gid_t mgid; 4984 ibd_req_t *req; 4985 4986 /* 4987 * If we haven't completed ibd_m_start(), the async thread wouldn't 4988 * have been started and id_bcaddr wouldn't be set, so there's 4989 * no point in continuing. 4990 */ 4991 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4992 return (0); 4993 4994 /* 4995 * The incoming multicast address might not be aligned properly 4996 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4997 * it to look like one though, to get the offsets of the mc gid, 4998 * since we know we are not going to dereference any values with 4999 * the ipoib_mac_t pointer. 5000 */ 5001 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 5002 mcast = &maddr; 5003 5004 /* 5005 * Check validity of MCG address. We could additionally check 5006 * that an enable/disable is not being issued on the "broadcast" 5007 * mcg, but since this operation is only invokable by privileged 5008 * programs anyway, we allow the flexibility to those dlpi apps. 5009 * Note that we do not validate the "scope" of the IBA mcg. 5010 */ 5011 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 5012 return (EINVAL); 5013 5014 /* 5015 * fill in multicast pkey and scope 5016 */ 5017 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 5018 5019 /* 5020 * If someone is trying to JOIN/LEAVE the broadcast group, we do 5021 * nothing (i.e. we stay JOINed to the broadcast group done in 5022 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 5023 * requires being joined to broadcast groups at all times. 5024 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 5025 * depends on this.
5026 */ 5027 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5028 return (0); 5029 5030 ibd_n2h_gid(mcast, &mgid); 5031 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5032 if (req == NULL) 5033 return (ENOMEM); 5034 5035 req->rq_gid = mgid; 5036 5037 if (add) { 5038 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 5039 mgid.gid_prefix, mgid.gid_guid); 5040 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 5041 } else { 5042 DPRINT(1, "ibd_m_multicst : unset_multicast : " 5043 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5044 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 5045 } 5046 return (0); 5047 } 5048 5049 /* 5050 * The blocking part of the IBA promiscuous operations are done 5051 * out of here on the async thread. The dlpireq parameter indicates 5052 * whether this invocation is due to a dlpi request or due to 5053 * a port up/down event. 5054 */ 5055 static void 5056 ibd_async_unsetprom(ibd_state_t *state) 5057 { 5058 ibd_mce_t *mce = list_head(&state->id_mc_non); 5059 ib_gid_t mgid; 5060 5061 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 5062 5063 while (mce != NULL) { 5064 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5065 mce = list_next(&state->id_mc_non, mce); 5066 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 5067 } 5068 state->id_prom_op = IBD_OP_NOTSTARTED; 5069 } 5070 5071 /* 5072 * The blocking part of the IBA promiscuous operations are done 5073 * out of here on the async thread. The dlpireq parameter indicates 5074 * whether this invocation is due to a dlpi request or due to 5075 * a port up/down event. 5076 */ 5077 static void 5078 ibd_async_setprom(ibd_state_t *state) 5079 { 5080 ibt_mcg_attr_t mcg_attr; 5081 ibt_mcg_info_t *mcg_info; 5082 ib_gid_t mgid; 5083 uint_t numg; 5084 int i; 5085 char ret = IBD_OP_COMPLETED; 5086 5087 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 5088 5089 /* 5090 * Obtain all active MC groups on the IB fabric with 5091 * specified criteria (scope + Pkey + Qkey + mtu). 5092 */ 5093 bzero(&mcg_attr, sizeof (mcg_attr)); 5094 mcg_attr.mc_pkey = state->id_pkey; 5095 mcg_attr.mc_scope = state->id_scope; 5096 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 5097 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 5098 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 5099 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 5100 IBT_SUCCESS) { 5101 ibd_print_warn(state, "Could not get list of IBA multicast " 5102 "groups"); 5103 ret = IBD_OP_ERRORED; 5104 goto done; 5105 } 5106 5107 /* 5108 * Iterate over the returned mcg's and join as NonMember 5109 * to the IP mcg's. 5110 */ 5111 for (i = 0; i < numg; i++) { 5112 /* 5113 * Do a NonMember JOIN on the MC group. 5114 */ 5115 mgid = mcg_info[i].mc_adds_vect.av_dgid; 5116 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 5117 ibd_print_warn(state, "IBA promiscuous mode missed " 5118 "multicast gid %016llx:%016llx", 5119 (u_longlong_t)mgid.gid_prefix, 5120 (u_longlong_t)mgid.gid_guid); 5121 } 5122 5123 ibt_free_mcg_info(mcg_info, numg); 5124 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 5125 done: 5126 state->id_prom_op = ret; 5127 } 5128 5129 /* 5130 * GLDv3 entry point for multicast promiscuous enable/disable requests. 5131 * GLDv3 assumes phys state receives more packets than multi state, 5132 * which is not true for IPoIB. Thus, treat the multi and phys 5133 * promiscuous states the same way to work with GLDv3's assumption. 
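 * Both the enable and the disable request are simply queued as
 * IBD_ASYNC_PROMON/IBD_ASYNC_PROMOFF work items; the blocking joins
 * and leaves are then performed on the async thread by
 * ibd_async_setprom() and ibd_async_unsetprom() above.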
5134 */ 5135 static int 5136 ibd_m_promisc(void *arg, boolean_t on) 5137 { 5138 ibd_state_t *state = (ibd_state_t *)arg; 5139 ibd_req_t *req; 5140 5141 /* 5142 * Async thread wouldn't have been started if we haven't 5143 * passed ibd_m_start() 5144 */ 5145 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5146 return (0); 5147 5148 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5149 if (req == NULL) 5150 return (ENOMEM); 5151 if (on) { 5152 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 5153 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 5154 } else { 5155 DPRINT(1, "ibd_m_promisc : unset_promisc"); 5156 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 5157 } 5158 5159 return (0); 5160 } 5161 5162 /* 5163 * GLDv3 entry point for gathering statistics. 5164 */ 5165 static int 5166 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5167 { 5168 ibd_state_t *state = (ibd_state_t *)arg; 5169 5170 switch (stat) { 5171 case MAC_STAT_IFSPEED: 5172 *val = state->id_link_speed; 5173 break; 5174 case MAC_STAT_MULTIRCV: 5175 *val = state->id_multi_rcv; 5176 break; 5177 case MAC_STAT_BRDCSTRCV: 5178 *val = state->id_brd_rcv; 5179 break; 5180 case MAC_STAT_MULTIXMT: 5181 *val = state->id_multi_xmt; 5182 break; 5183 case MAC_STAT_BRDCSTXMT: 5184 *val = state->id_brd_xmt; 5185 break; 5186 case MAC_STAT_RBYTES: 5187 *val = state->id_rcv_bytes; 5188 break; 5189 case MAC_STAT_IPACKETS: 5190 *val = state->id_rcv_pkt; 5191 break; 5192 case MAC_STAT_OBYTES: 5193 *val = state->id_xmt_bytes; 5194 break; 5195 case MAC_STAT_OPACKETS: 5196 *val = state->id_xmt_pkt; 5197 break; 5198 case MAC_STAT_OERRORS: 5199 *val = state->id_ah_error; /* failed AH translation */ 5200 break; 5201 case MAC_STAT_IERRORS: 5202 *val = 0; 5203 break; 5204 case MAC_STAT_NOXMTBUF: 5205 *val = state->id_tx_short; 5206 break; 5207 case MAC_STAT_NORCVBUF: 5208 default: 5209 return (ENOTSUP); 5210 } 5211 5212 return (0); 5213 } 5214 5215 static void 5216 ibd_async_txsched(ibd_state_t *state) 5217 { 5218 ibd_resume_transmission(state); 5219 } 5220 5221 static void 5222 ibd_resume_transmission(ibd_state_t *state) 5223 { 5224 int flag; 5225 int met_thresh = 0; 5226 int thresh = 0; 5227 int ret = -1; 5228 5229 mutex_enter(&state->id_sched_lock); 5230 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5231 mutex_enter(&state->id_tx_list.dl_mutex); 5232 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5233 met_thresh = state->id_tx_list.dl_cnt + 5234 state->id_tx_rel_list.dl_cnt; 5235 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5236 mutex_exit(&state->id_tx_list.dl_mutex); 5237 thresh = IBD_FREE_SWQES_THRESH; 5238 flag = IBD_RSRC_SWQE; 5239 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5240 ASSERT(state->id_lso != NULL); 5241 mutex_enter(&state->id_lso_lock); 5242 met_thresh = state->id_lso->bkt_nfree; 5243 thresh = IBD_FREE_LSOS_THRESH; 5244 mutex_exit(&state->id_lso_lock); 5245 flag = IBD_RSRC_LSOBUF; 5246 if (met_thresh > thresh) 5247 state->id_sched_lso_cnt++; 5248 } 5249 if (met_thresh > thresh) { 5250 state->id_sched_needed &= ~flag; 5251 state->id_sched_cnt++; 5252 ret = 0; 5253 } 5254 mutex_exit(&state->id_sched_lock); 5255 5256 if (ret == 0) 5257 mac_tx_update(state->id_mh); 5258 } 5259 5260 /* 5261 * Release the send wqe back into free list. 5262 */ 5263 static void 5264 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 5265 { 5266 /* 5267 * Add back on Tx list for reuse. 
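 * Released swqes are chained onto id_tx_rel_list rather than directly
 * onto id_tx_list; ibd_acquire_swqe() later transfers the whole
 * release list back into id_tx_list.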
5268 */ 5269 ASSERT(tail->swqe_next == NULL); 5270 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5271 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 5272 tail->swqe_next = state->id_tx_rel_list.dl_head; 5273 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 5274 state->id_tx_rel_list.dl_cnt += n; 5275 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5276 } 5277 5278 /* 5279 * Acquire a send wqe from free list. 5280 * Returns error number and send wqe pointer. 5281 */ 5282 static ibd_swqe_t * 5283 ibd_acquire_swqe(ibd_state_t *state) 5284 { 5285 ibd_swqe_t *wqe; 5286 5287 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5288 if (state->id_tx_rel_list.dl_head != NULL) { 5289 /* transfer id_tx_rel_list to id_tx_list */ 5290 state->id_tx_list.dl_head = 5291 state->id_tx_rel_list.dl_head; 5292 state->id_tx_list.dl_cnt = 5293 state->id_tx_rel_list.dl_cnt; 5294 state->id_tx_list.dl_pending_sends = B_FALSE; 5295 5296 /* clear id_tx_rel_list */ 5297 state->id_tx_rel_list.dl_head = NULL; 5298 state->id_tx_rel_list.dl_cnt = 0; 5299 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5300 5301 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5302 state->id_tx_list.dl_cnt -= 1; 5303 state->id_tx_list.dl_head = wqe->swqe_next; 5304 } else { /* no free swqe */ 5305 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5306 state->id_tx_list.dl_pending_sends = B_TRUE; 5307 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5308 state->id_tx_short++; 5309 wqe = NULL; 5310 } 5311 return (wqe); 5312 } 5313 5314 static int 5315 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5316 ibt_ud_dest_hdl_t ud_dest) 5317 { 5318 mblk_t *nmp; 5319 int iph_len, tcph_len; 5320 ibt_wr_lso_t *lso; 5321 uintptr_t ip_start, tcp_start; 5322 uint8_t *dst; 5323 uint_t pending, mblen; 5324 5325 /* 5326 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5327 * we need to adjust it here for lso. 5328 */ 5329 lso = &(node->w_swr.wr.ud_lso); 5330 lso->lso_ud_dest = ud_dest; 5331 lso->lso_mss = mss; 5332 5333 /* 5334 * Calculate the LSO header size and set it in the UD LSO structure. 5335 * Note that the only assumption we make is that each of the IPoIB, 5336 * IP and TCP headers will be contained in a single mblk fragment; 5337 * together, the headers may span multiple mblk fragments. 5338 */ 5339 nmp = mp; 5340 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5341 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5342 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5343 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5344 nmp = nmp->b_cont; 5345 5346 } 5347 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5348 5349 tcp_start = ip_start + iph_len; 5350 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5351 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5352 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5353 nmp = nmp->b_cont; 5354 } 5355 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5356 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5357 5358 /* 5359 * If the lso header fits entirely within a single mblk fragment, 5360 * we'll avoid an additional copy of the lso header here and just 5361 * pass the b_rptr of the mblk directly. 5362 * 5363 * If this isn't true, we'd have to allocate for it explicitly. 
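 * When the header does not fit in one fragment, the allocated buffer
 * is filled from the leading mblk fragments in the copy loop below and
 * is freed again by ibd_free_lsohdr() once the work request completes.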
5364 */ 5365 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5366 lso->lso_hdr = mp->b_rptr; 5367 } else { 5368 /* On work completion, remember to free this allocated hdr */ 5369 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5370 if (lso->lso_hdr == NULL) { 5371 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5372 "sz = %d", lso->lso_hdr_sz); 5373 lso->lso_hdr_sz = 0; 5374 lso->lso_mss = 0; 5375 return (-1); 5376 } 5377 } 5378 5379 /* 5380 * Copy in the lso header only if we need to 5381 */ 5382 if (lso->lso_hdr != mp->b_rptr) { 5383 dst = lso->lso_hdr; 5384 pending = lso->lso_hdr_sz; 5385 5386 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5387 mblen = MBLKL(nmp); 5388 if (pending > mblen) { 5389 bcopy(nmp->b_rptr, dst, mblen); 5390 dst += mblen; 5391 pending -= mblen; 5392 } else { 5393 bcopy(nmp->b_rptr, dst, pending); 5394 break; 5395 } 5396 } 5397 } 5398 5399 return (0); 5400 } 5401 5402 static void 5403 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5404 { 5405 ibt_wr_lso_t *lso; 5406 5407 if ((!node) || (!mp)) 5408 return; 5409 5410 /* 5411 * Free any header space that we might've allocated if we 5412 * did an LSO 5413 */ 5414 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5415 lso = &(node->w_swr.wr.ud_lso); 5416 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5417 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5418 lso->lso_hdr = NULL; 5419 lso->lso_hdr_sz = 0; 5420 } 5421 } 5422 } 5423 5424 static void 5425 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5426 { 5427 uint_t i; 5428 uint_t num_posted; 5429 uint_t n_wrs; 5430 ibt_status_t ibt_status; 5431 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 5432 ibd_swqe_t *tx_head, *elem; 5433 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 5434 5435 /* post the one request, then check for more */ 5436 ibt_status = ibt_post_send(state->id_chnl_hdl, 5437 &node->w_swr, 1, NULL); 5438 if (ibt_status != IBT_SUCCESS) { 5439 ibd_print_warn(state, "ibd_post_send: " 5440 "posting one wr failed: ret=%d", ibt_status); 5441 ibd_tx_cleanup(state, node); 5442 } 5443 5444 tx_head = NULL; 5445 for (;;) { 5446 if (tx_head == NULL) { 5447 mutex_enter(&state->id_txpost_lock); 5448 tx_head = state->id_tx_head; 5449 if (tx_head == NULL) { 5450 state->id_tx_busy = 0; 5451 mutex_exit(&state->id_txpost_lock); 5452 return; 5453 } 5454 state->id_tx_head = NULL; 5455 mutex_exit(&state->id_txpost_lock); 5456 } 5457 5458 /* 5459 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 5460 * at a time if possible, and keep posting them. 5461 */ 5462 for (n_wrs = 0, elem = tx_head; 5463 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 5464 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5465 nodes[n_wrs] = elem; 5466 wrs[n_wrs] = elem->w_swr; 5467 } 5468 tx_head = elem; 5469 5470 ASSERT(n_wrs != 0); 5471 5472 /* 5473 * If posting fails for some reason, we'll never receive 5474 * completion intimation, so we'll need to cleanup. But 5475 * we need to make sure we don't clean up nodes whose 5476 * wrs have been successfully posted. We assume that the 5477 * hca driver returns on the first failure to post and 5478 * therefore the first 'num_posted' entries don't need 5479 * cleanup here. 
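 * That is, on failure only nodes[num_posted .. n_wrs - 1] are handed
 * to ibd_tx_cleanup() below.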
5480 */ 5481 num_posted = 0; 5482 ibt_status = ibt_post_send(state->id_chnl_hdl, 5483 wrs, n_wrs, &num_posted); 5484 if (ibt_status != IBT_SUCCESS) { 5485 ibd_print_warn(state, "ibd_post_send: " 5486 "posting multiple wrs failed: " 5487 "requested=%d, done=%d, ret=%d", 5488 n_wrs, num_posted, ibt_status); 5489 5490 for (i = num_posted; i < n_wrs; i++) 5491 ibd_tx_cleanup(state, nodes[i]); 5492 } 5493 } 5494 } 5495 5496 static int 5497 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5498 uint_t lsohdr_sz) 5499 { 5500 ibt_wr_ds_t *sgl; 5501 ibt_status_t ibt_status; 5502 mblk_t *nmp; 5503 mblk_t *data_mp; 5504 uchar_t *bufp; 5505 size_t blksize; 5506 size_t skip; 5507 size_t avail; 5508 uint_t pktsize; 5509 uint_t frag_len; 5510 uint_t pending_hdr; 5511 int nmblks; 5512 int i; 5513 5514 /* 5515 * Let's skip ahead to the data if this is LSO 5516 */ 5517 data_mp = mp; 5518 pending_hdr = 0; 5519 if (lsohdr_sz) { 5520 pending_hdr = lsohdr_sz; 5521 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5522 frag_len = nmp->b_wptr - nmp->b_rptr; 5523 if (frag_len > pending_hdr) 5524 break; 5525 pending_hdr -= frag_len; 5526 } 5527 data_mp = nmp; /* start of data past lso header */ 5528 ASSERT(data_mp != NULL); 5529 } 5530 5531 /* 5532 * Calculate the size of message data and number of msg blocks 5533 */ 5534 pktsize = 0; 5535 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5536 nmp = nmp->b_cont, nmblks++) { 5537 pktsize += MBLKL(nmp); 5538 } 5539 pktsize -= pending_hdr; 5540 5541 /* 5542 * We only do ibt_map_mem_iov() if the pktsize is above the 5543 * "copy-threshold", and if the number of mp fragments is less than 5544 * the maximum acceptable. 5545 */ 5546 if ((state->id_hca_res_lkey_capab) && 5547 (pktsize > IBD_TX_COPY_THRESH) && 5548 (nmblks < state->id_max_sqseg_hiwm)) { 5549 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5550 ibt_iov_attr_t iov_attr; 5551 5552 iov_attr.iov_as = NULL; 5553 iov_attr.iov = iov_arr; 5554 iov_attr.iov_buf = NULL; 5555 iov_attr.iov_list_len = nmblks; 5556 iov_attr.iov_wr_nds = state->id_max_sqseg; 5557 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5558 iov_attr.iov_flags = IBT_IOV_SLEEP; 5559 5560 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5561 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5562 iov_arr[i].iov_len = MBLKL(nmp); 5563 if (i == 0) { 5564 iov_arr[i].iov_addr += pending_hdr; 5565 iov_arr[i].iov_len -= pending_hdr; 5566 } 5567 } 5568 5569 node->w_buftype = IBD_WQE_MAPPED; 5570 node->w_swr.wr_sgl = node->w_sgl; 5571 5572 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5573 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5574 if (ibt_status != IBT_SUCCESS) { 5575 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5576 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5577 goto ibd_copy_path; 5578 } 5579 5580 return (0); 5581 } 5582 5583 ibd_copy_path: 5584 if (pktsize <= state->id_tx_buf_sz) { 5585 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5586 node->w_swr.wr_nds = 1; 5587 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5588 node->w_buftype = IBD_WQE_TXBUF; 5589 5590 /* 5591 * Even though this is the copy path for transfers less than 5592 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5593 * is possible the first data mblk fragment (data_mp) still 5594 * contains part of the LSO header that we need to skip. 
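		 *
		 * For example (sizes invented for illustration): with an
		 * lsohdr_sz of 54 and a 40-byte first mblk, the header-skip
		 * loop at the top of this routine leaves data_mp pointing
		 * at the second fragment with pending_hdr == 14, so the
		 * copy below starts at data_mp->b_rptr + 14 and pending_hdr
		 * drops to zero for every later fragment.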
5595 */ 5596 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5597 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5598 blksize = MBLKL(nmp) - pending_hdr; 5599 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5600 bufp += blksize; 5601 pending_hdr = 0; 5602 } 5603 5604 return (0); 5605 } 5606 5607 /* 5608 * Copy path for transfers greater than id_tx_buf_sz 5609 */ 5610 node->w_swr.wr_sgl = node->w_sgl; 5611 if (ibd_acquire_lsobufs(state, pktsize, 5612 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5613 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5614 return (-1); 5615 } 5616 node->w_buftype = IBD_WQE_LSOBUF; 5617 5618 /* 5619 * Copy the larger-than-id_tx_buf_sz packet into a set of 5620 * fixed-sized, pre-mapped LSO buffers. Note that we might 5621 * need to skip part of the LSO header in the first fragment 5622 * as before. 5623 */ 5624 nmp = data_mp; 5625 skip = pending_hdr; 5626 for (i = 0; i < node->w_swr.wr_nds; i++) { 5627 sgl = node->w_swr.wr_sgl + i; 5628 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5629 avail = IBD_LSO_BUFSZ; 5630 while (nmp && avail) { 5631 blksize = MBLKL(nmp) - skip; 5632 if (blksize > avail) { 5633 bcopy(nmp->b_rptr + skip, bufp, avail); 5634 skip += avail; 5635 avail = 0; 5636 } else { 5637 bcopy(nmp->b_rptr + skip, bufp, blksize); 5638 skip = 0; 5639 avail -= blksize; 5640 bufp += blksize; 5641 nmp = nmp->b_cont; 5642 } 5643 } 5644 } 5645 5646 return (0); 5647 } 5648 5649 /* 5650 * Schedule a completion queue polling to reap the resource we're 5651 * short on. If we implement the change to reap tx completions 5652 * in a separate thread, we'll need to wake up that thread here. 5653 */ 5654 static int 5655 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5656 { 5657 ibd_req_t *req; 5658 5659 mutex_enter(&state->id_sched_lock); 5660 state->id_sched_needed |= resource_type; 5661 mutex_exit(&state->id_sched_lock); 5662 5663 /* 5664 * If we are asked to queue a work entry, we need to do it 5665 */ 5666 if (q_flag) { 5667 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5668 if (req == NULL) 5669 return (-1); 5670 5671 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5672 } 5673 5674 return (0); 5675 } 5676 5677 /* 5678 * The passed in packet has this format: 5679 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5680 */ 5681 static boolean_t 5682 ibd_send(ibd_state_t *state, mblk_t *mp) 5683 { 5684 ibd_ace_t *ace; 5685 ibd_swqe_t *node; 5686 ipoib_mac_t *dest; 5687 ib_header_info_t *ipibp; 5688 ip6_t *ip6h; 5689 uint_t pktsize; 5690 uint32_t mss; 5691 uint32_t hckflags; 5692 uint32_t lsoflags = 0; 5693 uint_t lsohdr_sz = 0; 5694 int ret, len; 5695 boolean_t dofree = B_FALSE; 5696 boolean_t rc; 5697 5698 /* 5699 * If we aren't done with the device initialization and start, 5700 * we shouldn't be here. 5701 */ 5702 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5703 return (B_FALSE); 5704 5705 /* 5706 * Obtain an address handle for the destination. 
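	 *
	 * The destination is the 20-byte IPoIB hardware address carried in
	 * the header built by the stack (see the format comment above);
	 * schematically, per the ipoib_mac_t layout:
	 *
	 *	bytes 0-3	destination QPN field (IB_QPN_MASK keeps
	 *			the low 24 bits)
	 *	bytes 4-19	destination GID (gidpref[], gidsuff[])
	 *
	 * For multicast destinations the scope and P_Key bits are folded
	 * into the MGID first (IBD_FILL_SCOPE_PKEY) so that the lookup key
	 * matches the form in which the AH cache stores such addresses.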
 */
	ipibp = (ib_header_info_t *)mp->b_rptr;
	dest = (ipoib_mac_t *)&ipibp->ib_dst;
	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);

	ace = ibd_acache_lookup(state, dest, &ret, 1);

	mutex_enter(&state->id_tx_list.dl_mutex);
	node = WQE_TO_SWQE(state->id_tx_list.dl_head);
	if (node != NULL) {
		state->id_tx_list.dl_cnt -= 1;
		state->id_tx_list.dl_head = node->swqe_next;
	} else {
		node = ibd_acquire_swqe(state);
	}
	mutex_exit(&state->id_tx_list.dl_mutex);
	if (node == NULL) {
		/*
		 * If we don't have an swqe available, schedule a transmit
		 * completion queue cleanup and hold off on sending more
		 * packets until we have some free swqes
		 */
		if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
			return (B_FALSE);

		/*
		 * If a poll cannot be scheduled, we have no choice but
		 * to drop this packet
		 */
		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
		return (B_TRUE);
	}

	/*
	 * Initialize the commonly used fields in swqe to NULL to protect
	 * against ibd_tx_cleanup accidentally misinterpreting these on a
	 * failure.
	 */
	node->swqe_im_mblk = NULL;
	node->w_swr.wr_nds = 0;
	node->w_swr.wr_sgl = NULL;
	node->w_swr.wr_opcode = IBT_WRC_SEND;

	pktsize = msgsize(mp);

	atomic_add_64(&state->id_xmt_bytes, pktsize);
	atomic_inc_64(&state->id_xmt_pkt);
	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_xmt);
	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_xmt);

	if (ace != NULL) {
		node->w_ahandle = ace;
		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
	} else {
		DPRINT(5,
		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
		    ((ret == EFAULT) ? "failed" : "queued"),
		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
		    htonl(dest->ipoib_gidpref[1]),
		    htonl(dest->ipoib_gidsuff[0]),
		    htonl(dest->ipoib_gidsuff[1]));
		node->w_ahandle = NULL;

		/*
		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
		 * cannot find a path for the specific dest address. We
		 * should get rid of this kind of packet. We also should get
		 * rid of the packet if we cannot schedule a poll via the
		 * async thread. For the normal case, ibd will return the
		 * packet to the upper layer and wait for AH creation.
		 *
		 * Note that we always queue a work slot entry for the async
		 * thread when we fail AH lookup (even in intr mode); this is
		 * due to the convoluted way the code currently looks for AH.
		 */
		if (ret == EFAULT) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}

	/*
	 * For ND6 packets, padding is at the front of the source lladdr.
	 * Insert the padding at front.
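	 *
	 * In outline, the block below first makes sure the headers it has
	 * to examine are contiguous (pullupmsg), then links a 4-byte pad
	 * mblk to the tail so the message is long enough for the adjusted
	 * option, and finally lets the IBD_PAD_NSNA() macro rearrange the
	 * padding around the link-layer address option of neighbor
	 * solicitation/advertisement packets for the send direction.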
	 */
	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ib_header_info_t))) {
				DPRINT(10, "ibd_send: pullupmsg failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			ipibp = (ib_header_info_t *)mp->b_rptr;
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp +
		    sizeof (ib_header_info_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			mblk_t *pad;

			pad = allocb(4, 0);
			if (pad == NULL) {
				/* no memory for the pad mblk; drop */
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
			linkb(mp, pad);
			if (MBLKL(mp) < sizeof (ib_header_info_t) +
			    IPV6_HDR_LEN + len + 4) {
				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
				    IPV6_HDR_LEN + len + 4)) {
					DPRINT(10, "ibd_send: pullupmsg "
					    "failure ");
					dofree = B_TRUE;
					rc = B_TRUE;
					goto ibd_send_fail;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ib_header_info_t));
			}

			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
		}
	}

	mp->b_rptr += sizeof (ib_addrs_t);

	/*
	 * Do LSO and checksum related work here.  For LSO send, adjust the
	 * ud destination, the opcode and the LSO header information in the
	 * work request.
	 */
	lso_info_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) != HW_LSO) {
		node->w_swr.wr_opcode = IBT_WRC_SEND;
		lsohdr_sz = 0;
	} else {
		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
			/*
			 * The routine can only fail if there's no memory; we
			 * can only drop the packet if this happens
			 */
			ibd_print_warn(state,
			    "ibd_send: no memory, lso posting failed");
			dofree = B_TRUE;
			rc = B_TRUE;
			goto ibd_send_fail;
		}

		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
	}

	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
	else
		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;

	/*
	 * Prepare the sgl for posting; the routine can only fail if there's
	 * no lso buf available for posting. If this is the case, we should
	 * probably resched for lso bufs to become available and then try
	 * again.
	 */
	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}
	node->swqe_im_mblk = mp;

	/*
	 * Queue the wqe to hardware; since we can now simply queue a
	 * post instead of doing it serially, we cannot assume anything
	 * about the 'node' after ibd_post_send() returns.
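	 *
	 * The hand-off below works roughly as follows: under
	 * id_txpost_lock a sender either appends the node to the
	 * id_tx_head/id_tx_tail list (when id_tx_busy shows another thread
	 * is already posting) or marks itself busy and calls
	 * ibd_post_send() directly.  The busy thread keeps re-checking
	 * id_tx_head before it clears id_tx_busy, so nodes queued while it
	 * was posting are always picked up by someone.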
5896 */ 5897 node->swqe_next = NULL; 5898 5899 mutex_enter(&state->id_txpost_lock); 5900 if (state->id_tx_busy) { 5901 if (state->id_tx_head) { 5902 state->id_tx_tail->swqe_next = 5903 SWQE_TO_WQE(node); 5904 } else { 5905 state->id_tx_head = node; 5906 } 5907 state->id_tx_tail = node; 5908 mutex_exit(&state->id_txpost_lock); 5909 } else { 5910 state->id_tx_busy = 1; 5911 mutex_exit(&state->id_txpost_lock); 5912 ibd_post_send(state, node); 5913 } 5914 5915 return (B_TRUE); 5916 5917 ibd_send_fail: 5918 if (node && mp) 5919 ibd_free_lsohdr(node, mp); 5920 5921 if (dofree) 5922 freemsg(mp); 5923 5924 if (node != NULL) 5925 ibd_tx_cleanup(state, node); 5926 5927 return (rc); 5928 } 5929 5930 /* 5931 * GLDv3 entry point for transmitting datagram. 5932 */ 5933 static mblk_t * 5934 ibd_m_tx(void *arg, mblk_t *mp) 5935 { 5936 ibd_state_t *state = (ibd_state_t *)arg; 5937 mblk_t *next; 5938 5939 if (state->id_link_state != LINK_STATE_UP) { 5940 freemsgchain(mp); 5941 mp = NULL; 5942 } 5943 5944 while (mp != NULL) { 5945 next = mp->b_next; 5946 mp->b_next = NULL; 5947 if (ibd_send(state, mp) == B_FALSE) { 5948 /* Send fail */ 5949 mp->b_next = next; 5950 break; 5951 } 5952 mp = next; 5953 } 5954 5955 return (mp); 5956 } 5957 5958 /* 5959 * this handles Tx and Rx completions. With separate CQs, this handles 5960 * only Rx completions. 5961 */ 5962 static uint_t 5963 ibd_intr(caddr_t arg) 5964 { 5965 ibd_state_t *state = (ibd_state_t *)arg; 5966 5967 ibd_poll_rcq(state, state->id_rcq_hdl); 5968 5969 return (DDI_INTR_CLAIMED); 5970 } 5971 5972 /* 5973 * Poll and fully drain the send cq 5974 */ 5975 static void 5976 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5977 { 5978 ibt_wc_t *wcs = state->id_txwcs; 5979 uint_t numwcs = state->id_txwcs_size; 5980 ibd_wqe_t *wqe; 5981 ibd_swqe_t *head, *tail; 5982 ibt_wc_t *wc; 5983 uint_t num_polled; 5984 int i; 5985 5986 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5987 head = tail = NULL; 5988 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5989 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5990 if (wc->wc_status != IBT_WC_SUCCESS) { 5991 /* 5992 * Channel being torn down. 5993 */ 5994 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5995 DPRINT(5, "ibd_drain_scq: flush error"); 5996 DPRINT(10, "ibd_drain_scq: Bad " 5997 "status %d", wc->wc_status); 5998 } else { 5999 DPRINT(10, "ibd_drain_scq: " 6000 "unexpected wc_status %d", 6001 wc->wc_status); 6002 } 6003 /* 6004 * Fallthrough to invoke the Tx handler to 6005 * release held resources, e.g., AH refcount. 6006 */ 6007 } 6008 /* 6009 * Add this swqe to the list to be cleaned up. 
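			 *
			 * The completed swqes are strung together on a
			 * local list and handed to ibd_tx_cleanup_list()
			 * in one batch after the polled set has been
			 * walked; compared with cleaning each swqe
			 * individually, this takes the free-list mutex
			 * only once per poll (ibd_release_swqe() is
			 * called with the whole chain and a count).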
6010 */ 6011 if (head) 6012 tail->swqe_next = wqe; 6013 else 6014 head = WQE_TO_SWQE(wqe); 6015 tail = WQE_TO_SWQE(wqe); 6016 } 6017 tail->swqe_next = NULL; 6018 ibd_tx_cleanup_list(state, head, tail); 6019 6020 /* 6021 * Resume any blocked transmissions if possible 6022 */ 6023 ibd_resume_transmission(state); 6024 } 6025 } 6026 6027 /* 6028 * Poll and fully drain the receive cq 6029 */ 6030 static void 6031 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6032 { 6033 ibt_wc_t *wcs = state->id_rxwcs; 6034 uint_t numwcs = state->id_rxwcs_size; 6035 ibd_rwqe_t *rwqe; 6036 ibt_wc_t *wc; 6037 uint_t num_polled; 6038 int i; 6039 mblk_t *head, *tail, *mp; 6040 6041 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6042 head = tail = NULL; 6043 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6044 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 6045 if (wc->wc_status != IBT_WC_SUCCESS) { 6046 /* 6047 * Channel being torn down. 6048 */ 6049 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6050 DPRINT(5, "ibd_drain_rcq: " 6051 "expected flushed rwqe"); 6052 } else { 6053 DPRINT(5, "ibd_drain_rcq: " 6054 "unexpected wc_status %d", 6055 wc->wc_status); 6056 } 6057 atomic_inc_32( 6058 &state->id_rx_list.dl_bufs_outstanding); 6059 freemsg(rwqe->rwqe_im_mblk); 6060 continue; 6061 } 6062 mp = ibd_process_rx(state, rwqe, wc); 6063 if (mp == NULL) 6064 continue; 6065 6066 /* 6067 * Add this mp to the list to send to the nw layer. 6068 */ 6069 if (head) 6070 tail->b_next = mp; 6071 else 6072 head = mp; 6073 tail = mp; 6074 } 6075 if (head) 6076 mac_rx(state->id_mh, state->id_rh, head); 6077 6078 /* 6079 * Account for #rwqes polled. 6080 * Post more here, if less than one fourth full. 6081 */ 6082 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 6083 (state->id_num_rwqe / 4)) 6084 ibd_post_recv_intr(state); 6085 } 6086 } 6087 6088 /* 6089 * Common code for interrupt handling as well as for polling 6090 * for all completed wqe's while detaching. 6091 */ 6092 static void 6093 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6094 { 6095 int flag, redo_flag; 6096 int redo = 1; 6097 6098 flag = IBD_CQ_POLLING; 6099 redo_flag = IBD_REDO_CQ_POLLING; 6100 6101 mutex_enter(&state->id_scq_poll_lock); 6102 if (state->id_scq_poll_busy & flag) { 6103 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 6104 state->id_scq_poll_busy |= redo_flag; 6105 mutex_exit(&state->id_scq_poll_lock); 6106 return; 6107 } 6108 state->id_scq_poll_busy |= flag; 6109 mutex_exit(&state->id_scq_poll_lock); 6110 6111 /* 6112 * In some cases (eg detaching), this code can be invoked on 6113 * any cpu after disabling cq notification (thus no concurrency 6114 * exists). Apart from that, the following applies normally: 6115 * Transmit completion handling could be from any cpu if 6116 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 6117 * is interrupt driven. 6118 */ 6119 6120 /* 6121 * Poll and drain the CQ 6122 */ 6123 ibd_drain_scq(state, cq_hdl); 6124 6125 /* 6126 * Enable CQ notifications and redrain the cq to catch any 6127 * completions we might have missed after the ibd_drain_scq() 6128 * above and before the ibt_enable_cq_notify() that follows. 6129 * Finally, service any new requests to poll the cq that 6130 * could've come in after the ibt_enable_cq_notify(). 
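	 *
	 * This is the usual "poll, arm, poll again" sequence for event
	 * driven CQs: a completion arriving between the last ibt_poll_cq()
	 * and the ibt_enable_cq_notify() would neither be in the polled
	 * set nor raise an interrupt, so the CQ is drained once more after
	 * arming.  The redo_flag handshake serves the same purpose for
	 * polling requests that arrive while this thread already owns the
	 * CQ.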
6131 */ 6132 do { 6133 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 6134 IBT_SUCCESS) { 6135 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6136 } 6137 6138 ibd_drain_scq(state, cq_hdl); 6139 6140 mutex_enter(&state->id_scq_poll_lock); 6141 if (state->id_scq_poll_busy & redo_flag) 6142 state->id_scq_poll_busy &= ~redo_flag; 6143 else { 6144 state->id_scq_poll_busy &= ~flag; 6145 redo = 0; 6146 } 6147 mutex_exit(&state->id_scq_poll_lock); 6148 6149 } while (redo); 6150 } 6151 6152 /* 6153 * Common code for interrupt handling as well as for polling 6154 * for all completed wqe's while detaching. 6155 */ 6156 static void 6157 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 6158 { 6159 int flag, redo_flag; 6160 int redo = 1; 6161 6162 flag = IBD_CQ_POLLING; 6163 redo_flag = IBD_REDO_CQ_POLLING; 6164 6165 mutex_enter(&state->id_rcq_poll_lock); 6166 if (state->id_rcq_poll_busy & flag) { 6167 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 6168 state->id_rcq_poll_busy |= redo_flag; 6169 mutex_exit(&state->id_rcq_poll_lock); 6170 return; 6171 } 6172 state->id_rcq_poll_busy |= flag; 6173 mutex_exit(&state->id_rcq_poll_lock); 6174 6175 /* 6176 * Poll and drain the CQ 6177 */ 6178 ibd_drain_rcq(state, rcq); 6179 6180 /* 6181 * Enable CQ notifications and redrain the cq to catch any 6182 * completions we might have missed after the ibd_drain_cq() 6183 * above and before the ibt_enable_cq_notify() that follows. 6184 * Finally, service any new requests to poll the cq that 6185 * could've come in after the ibt_enable_cq_notify(). 6186 */ 6187 do { 6188 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 6189 IBT_SUCCESS) { 6190 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6191 } 6192 6193 ibd_drain_rcq(state, rcq); 6194 6195 mutex_enter(&state->id_rcq_poll_lock); 6196 if (state->id_rcq_poll_busy & redo_flag) 6197 state->id_rcq_poll_busy &= ~redo_flag; 6198 else { 6199 state->id_rcq_poll_busy &= ~flag; 6200 redo = 0; 6201 } 6202 mutex_exit(&state->id_rcq_poll_lock); 6203 6204 } while (redo); 6205 } 6206 6207 /* 6208 * Unmap the memory area associated with a given swqe. 6209 */ 6210 static void 6211 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6212 { 6213 ibt_status_t stat; 6214 6215 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6216 6217 if (swqe->w_mi_hdl) { 6218 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6219 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6220 DPRINT(10, 6221 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6222 } 6223 swqe->w_mi_hdl = NULL; 6224 } 6225 swqe->w_swr.wr_nds = 0; 6226 } 6227 6228 static void 6229 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 6230 { 6231 /* 6232 * The recycling logic can be eliminated from here 6233 * and put into the async thread if we create another 6234 * list to hold ACE's for unjoined mcg's. 6235 */ 6236 if (DEC_REF_DO_CYCLE(ace)) { 6237 ibd_mce_t *mce; 6238 6239 /* 6240 * Check with the lock taken: we decremented 6241 * reference count without the lock, and some 6242 * transmitter might already have bumped the 6243 * reference count (possible in case of multicast 6244 * disable when we leave the AH on the active 6245 * list). If not still 0, get out, leaving the 6246 * recycle bit intact. 6247 * 6248 * Atomically transition the AH from active 6249 * to free list, and queue a work request to 6250 * leave the group and destroy the mce. No 6251 * transmitter can be looking at the AH or 6252 * the MCE in between, since we have the 6253 * ac_mutex lock. 
In the SendOnly reap case, 6254 * it is not necessary to hold the ac_mutex 6255 * and recheck the ref count (since the AH was 6256 * taken off the active list), we just do it 6257 * to have uniform processing with the Full 6258 * reap case. 6259 */ 6260 mutex_enter(&state->id_ac_mutex); 6261 mce = ace->ac_mce; 6262 if (GET_REF_CYCLE(ace) == 0) { 6263 CLEAR_REFCYCLE(ace); 6264 /* 6265 * Identify the case of fullmember reap as 6266 * opposed to mcg trap reap. Also, port up 6267 * might set ac_mce to NULL to indicate Tx 6268 * cleanup should do no more than put the 6269 * AH in the free list (see ibd_async_link). 6270 */ 6271 if (mce != NULL) { 6272 ace->ac_mce = NULL; 6273 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6274 /* 6275 * mc_req was initialized at mce 6276 * creation time. 6277 */ 6278 ibd_queue_work_slot(state, 6279 &mce->mc_req, IBD_ASYNC_REAP); 6280 } 6281 IBD_ACACHE_INSERT_FREE(state, ace); 6282 } 6283 mutex_exit(&state->id_ac_mutex); 6284 } 6285 } 6286 6287 /* 6288 * Common code that deals with clean ups after a successful or 6289 * erroneous transmission attempt. 6290 */ 6291 static void 6292 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6293 { 6294 ibd_ace_t *ace = swqe->w_ahandle; 6295 6296 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6297 6298 /* 6299 * If this was a dynamic mapping in ibd_send(), we need to 6300 * unmap here. If this was an lso buffer we'd used for sending, 6301 * we need to release the lso buf to the pool, since the resource 6302 * is scarce. However, if this was simply a normal send using 6303 * the copybuf (present in each swqe), we don't need to release it. 6304 */ 6305 if (swqe->swqe_im_mblk != NULL) { 6306 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6307 ibd_unmap_mem(state, swqe); 6308 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6309 ibd_release_lsobufs(state, 6310 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6311 } 6312 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6313 freemsg(swqe->swqe_im_mblk); 6314 swqe->swqe_im_mblk = NULL; 6315 } 6316 6317 /* 6318 * Drop the reference count on the AH; it can be reused 6319 * now for a different destination if there are no more 6320 * posted sends that will use it. This can be eliminated 6321 * if we can always associate each Tx buffer with an AH. 6322 * The ace can be null if we are cleaning up from the 6323 * ibd_send() error path. 6324 */ 6325 if (ace != NULL) { 6326 ibd_dec_ref_ace(state, ace); 6327 } 6328 6329 /* 6330 * Release the send wqe for reuse. 6331 */ 6332 swqe->swqe_next = NULL; 6333 ibd_release_swqe(state, swqe, swqe, 1); 6334 } 6335 6336 static void 6337 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 6338 { 6339 ibd_ace_t *ace; 6340 ibd_swqe_t *swqe; 6341 int n = 0; 6342 6343 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 6344 6345 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 6346 6347 /* 6348 * If this was a dynamic mapping in ibd_send(), we need to 6349 * unmap here. If this was an lso buffer we'd used for sending, 6350 * we need to release the lso buf to the pool, since the 6351 * resource is scarce. However, if this was simply a normal 6352 * send using the copybuf (present in each swqe), we don't need 6353 * to release it. 
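		 *
		 * Put differently, the cleanup required here is keyed off
		 * w_buftype:
		 *
		 *	IBD_WQE_MAPPED	ibt_unmap_mem_iov() via
		 *			ibd_unmap_mem()
		 *	IBD_WQE_LSOBUF	ibd_release_lsobufs()
		 *	IBD_WQE_TXBUF	nothing; the copybuf stays attached
		 *			to the swqe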
6354 */ 6355 if (swqe->swqe_im_mblk != NULL) { 6356 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6357 ibd_unmap_mem(state, swqe); 6358 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6359 ibd_release_lsobufs(state, 6360 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6361 } 6362 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6363 freemsg(swqe->swqe_im_mblk); 6364 swqe->swqe_im_mblk = NULL; 6365 } 6366 6367 /* 6368 * Drop the reference count on the AH; it can be reused 6369 * now for a different destination if there are no more 6370 * posted sends that will use it. This can be eliminated 6371 * if we can always associate each Tx buffer with an AH. 6372 * The ace can be null if we are cleaning up from the 6373 * ibd_send() error path. 6374 */ 6375 ace = swqe->w_ahandle; 6376 if (ace != NULL) { 6377 ibd_dec_ref_ace(state, ace); 6378 } 6379 n++; 6380 } 6381 6382 /* 6383 * Release the send wqes for reuse. 6384 */ 6385 ibd_release_swqe(state, head, tail, n); 6386 } 6387 6388 /* 6389 * Processing to be done after receipt of a packet; hand off to GLD 6390 * in the format expected by GLD. The received packet has this 6391 * format: 2b sap :: 00 :: data. 6392 */ 6393 static mblk_t * 6394 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6395 { 6396 ib_header_info_t *phdr; 6397 mblk_t *mp; 6398 ipoib_hdr_t *ipibp; 6399 ipha_t *iphap; 6400 ip6_t *ip6h; 6401 int len; 6402 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 6403 uint32_t bufs; 6404 6405 /* 6406 * Track number handed to upper layer that need to be returned. 6407 */ 6408 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 6409 6410 /* Never run out of rwqes, use allocb when running low */ 6411 if (bufs >= state->id_rx_bufs_outstanding_limit) { 6412 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6413 atomic_inc_32(&state->id_rx_allocb); 6414 mp = allocb(pkt_len, BPRI_HI); 6415 if (mp) { 6416 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 6417 ibd_post_recv(state, rwqe); 6418 } else { /* no memory */ 6419 atomic_inc_32(&state->id_rx_allocb_failed); 6420 ibd_post_recv(state, rwqe); 6421 return (NULL); 6422 } 6423 } else { 6424 mp = rwqe->rwqe_im_mblk; 6425 } 6426 6427 6428 /* 6429 * Adjust write pointer depending on how much data came in. 6430 */ 6431 mp->b_wptr = mp->b_rptr + pkt_len; 6432 6433 /* 6434 * Make sure this is NULL or we're in trouble. 6435 */ 6436 if (mp->b_next != NULL) { 6437 ibd_print_warn(state, 6438 "ibd_process_rx: got duplicate mp from rcq?"); 6439 mp->b_next = NULL; 6440 } 6441 6442 /* 6443 * the IB link will deliver one of the IB link layer 6444 * headers called, the Global Routing Header (GRH). 6445 * ibd driver uses the information in GRH to build the 6446 * Header_info structure and pass it with the datagram up 6447 * to GLDv3. 6448 * If the GRH is not valid, indicate to GLDv3 by setting 6449 * the VerTcFlow field to 0. 6450 */ 6451 phdr = (ib_header_info_t *)mp->b_rptr; 6452 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6453 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6454 6455 /* if it is loop back packet, just drop it. 
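		 * (An IB multicast send is normally delivered back to every
		 * QP that joined the group, including the sender's own, so
		 * the source QPN/GID in the GRH is compared against our own
		 * hardware address below and such copies are dropped rather
		 * than handed up a second time.)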
*/ 6456 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6457 IPOIB_ADDRL) == 0) { 6458 freemsg(mp); 6459 return (NULL); 6460 } 6461 6462 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6463 sizeof (ipoib_mac_t)); 6464 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6465 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6466 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6467 } else { 6468 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6469 } 6470 } else { 6471 /* 6472 * It can not be a IBA multicast packet. Must have been 6473 * unicast for us. Just copy the interface address to dst. 6474 */ 6475 phdr->ib_grh.ipoib_vertcflow = 0; 6476 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6477 sizeof (ipoib_mac_t)); 6478 } 6479 6480 /* 6481 * For ND6 packets, padding is at the front of the source/target 6482 * lladdr. However the inet6 layer is not aware of it, hence remove 6483 * the padding from such packets. 6484 */ 6485 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6486 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 6487 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6488 len = ntohs(ip6h->ip6_plen); 6489 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6490 /* LINTED: E_CONSTANT_CONDITION */ 6491 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6492 } 6493 } 6494 6495 /* 6496 * Update statistics 6497 */ 6498 atomic_add_64(&state->id_rcv_bytes, pkt_len); 6499 atomic_inc_64(&state->id_rcv_pkt); 6500 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6501 atomic_inc_64(&state->id_brd_rcv); 6502 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6503 atomic_inc_64(&state->id_multi_rcv); 6504 6505 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6506 /* 6507 * Set receive checksum status in mp 6508 * Hardware checksumming can be considered valid only if: 6509 * 1. CQE.IP_OK bit is set 6510 * 2. CQE.CKSUM = 0xffff 6511 * 3. IPv6 routing header is not present in the packet 6512 * 4. If there are no IP_OPTIONS in the IP HEADER 6513 */ 6514 6515 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6516 (wc->wc_cksum == 0xFFFF) && 6517 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6518 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 6519 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 6520 } 6521 6522 return (mp); 6523 } 6524 6525 /* 6526 * Callback code invoked from STREAMs when the receive data buffer is 6527 * free for recycling. 6528 */ 6529 static void 6530 ibd_freemsg_cb(char *arg) 6531 { 6532 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6533 ibd_state_t *state = rwqe->w_state; 6534 6535 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6536 6537 /* 6538 * If the driver is stopped, just free the rwqe. 
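	 *
	 * Otherwise the buffer is recycled rather than released: the code
	 * below wraps the same ic_bufaddr in a fresh mblk with desballoc()
	 * (re-registering this callback) and posts the rwqe back on the
	 * receive queue, so the pre-mapped receive buffer can be reused
	 * as-is.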
6539 */ 6540 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 6541 DPRINT(6, "ibd_freemsg: wqe being freed"); 6542 rwqe->rwqe_im_mblk = NULL; 6543 ibd_free_rwqe(state, rwqe); 6544 return; 6545 } 6546 6547 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6548 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6549 if (rwqe->rwqe_im_mblk == NULL) { 6550 ibd_free_rwqe(state, rwqe); 6551 DPRINT(6, "ibd_freemsg: desballoc failed"); 6552 return; 6553 } 6554 6555 ibd_post_recv(state, rwqe); 6556 } 6557 6558 static uint_t 6559 ibd_tx_recycle(caddr_t arg) 6560 { 6561 ibd_state_t *state = (ibd_state_t *)arg; 6562 6563 /* 6564 * Poll for completed entries 6565 */ 6566 ibd_poll_scq(state, state->id_scq_hdl); 6567 6568 return (DDI_INTR_CLAIMED); 6569 } 6570 6571 #ifdef IBD_LOGGING 6572 static void 6573 ibd_log_init(void) 6574 { 6575 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 6576 ibd_lbuf_ndx = 0; 6577 6578 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 6579 } 6580 6581 static void 6582 ibd_log_fini(void) 6583 { 6584 if (ibd_lbuf) 6585 kmem_free(ibd_lbuf, IBD_LOG_SZ); 6586 ibd_lbuf_ndx = 0; 6587 ibd_lbuf = NULL; 6588 6589 mutex_destroy(&ibd_lbuf_lock); 6590 } 6591 6592 static void 6593 ibd_log(const char *fmt, ...) 6594 { 6595 va_list ap; 6596 uint32_t off; 6597 uint32_t msglen; 6598 char tmpbuf[IBD_DMAX_LINE]; 6599 6600 if (ibd_lbuf == NULL) 6601 return; 6602 6603 va_start(ap, fmt); 6604 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 6605 va_end(ap); 6606 6607 if (msglen >= IBD_DMAX_LINE) 6608 msglen = IBD_DMAX_LINE - 1; 6609 6610 mutex_enter(&ibd_lbuf_lock); 6611 6612 off = ibd_lbuf_ndx; /* current msg should go here */ 6613 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6614 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6615 6616 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6617 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6618 6619 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6620 ibd_lbuf_ndx = 0; 6621 6622 mutex_exit(&ibd_lbuf_lock); 6623 6624 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6625 } 6626 #endif 6627