/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2022 Garrett D'Amore */ /* * An implementation of the IPoIB standard based on PSARC 2001/289. */ #include #include #include #include #include #include #include #include #include #include #include #include /* for HCK_FULLCKSUM */ #include /* for offsetof */ #include /* for async thread pri */ #include /* for atomic_add*() */ #include /* for ETHERTYPE_IPV6 */ #include /* for netinet/ip.h below */ #include /* for struct ip */ #include /* for struct udphdr */ #include /* for inet/ip.h below */ #include /* for ipha_t */ #include /* for ip6_t */ #include /* for tcph_t */ #include /* for icmp6_t */ #include #include #include #include /* for SM_INIT_TYPE_* */ #include #include /* for ibd_get_portspeed */ #include #include #include #include #include #include /* * The write-up below includes details on the following: * 1. The dladm administrative model. * 2. Late HCA initialization feature. * 3. Brussels support and its implications for the current architecture. * * 1. The dladm administrative model. * ------------------------------------------ * With the dladm model, ibnex will create one ibd instance per port. These * instances will be created independently of the port state. * * The ibd driver is two-faceted: one side works as the port driver and * the other as the partition object driver. * * The port instance is a child of the HCA, and will have an entry in the devfs. * A DDI attach only happens for the port driver, and its attach is * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is * handled in ibd_port_unattach(). * * The partition object is only a registrant to the mac layer via mac_register() * and does not have an entry in the device tree. There is no DDI softstate * managed by the DDI framework for the partition objects. However, the state is * managed inside the ibd driver, and every partition object hangs off the * "ibd_objlist_head". * * The partition object first comes into existence when a user runs the * 'create-part' subcommand of dladm. This is like invoking the attach entry * point of the partition object. The partition object goes away with the * 'delete-part' subcommand of dladm. This is like invoking the detach entry * point of the partition object. * * The create-part and delete-part subcommands result in dld ioctls that end up * calling ibd_create_partition() and ibd_delete_partition() respectively. * These ioctls are registered with the dld layer in _init() via a call to * dld_ioc_register(). * * The port instance by itself cannot be plumbed. Only the partition * objects can be plumbed, and they alone participate in I/O, not the * port driver.
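 *
 * As an illustration (the link and partition names here are hypothetical),
 * a partition object would typically be created and destroyed with dladm
 * commands along these lines:
 *
 *	# dladm create-part -l ibp0 -P 0xFFFF part0
 *	# dladm delete-part part0
 *
 * The create-part invocation reaches the driver as the IBD_CREATE_IBPART
 * dld ioctl and ends up in ibd_create_partition(); delete-part similarly
 * ends up in ibd_delete_partition().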
* * There are some info ioctls supported in ibd which are used by dladm(8) to * display useful information. The info entry point for ibd is * ibd_get_partition_info(). * * 2. Late HCA initialization feature. * ------------------------------------ * As mentioned in section 1, the user creates the partition objects via * dladm(8). It is possible that: * a) The physical port itself is down and the SM cannot be reached. * b) The PKEY specified by the user has not been created in the SM yet. * c) An IPoIB broadcast group for the specified PKEY is not present. * * In all of the above cases, complete initialization of the partition object is * not possible. However, the new model allows the creation of partition * objects even in such cases but will defer the initialization until later. * When such a partition object is plumbed, the link state will be displayed as * "down". * The driver, at this point, is listening to events that herald the * availability of resources - * i) LINK_UP when the link becomes available * ii) PORT_CHANGE when the PKEY has been created * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been * created * via ibd_async_handler() for events i) and ii), and via * ibd_snet_notices_handler() for iii). * The driver handles these events as they arrive, completes the * initialization of the partition object, and transitions it to a usable state. * * 3. Brussels support and its implications for the current architecture. * --------------------------------------------------------------------- * The Brussels support introduces two new interfaces to the ibd driver - * ibd_m_getprop() and ibd_m_setprop(). * These interfaces allow setting and retrieval of certain properties. * Some of them are public properties while most others are private properties * meant to be used by developers. Tuning the latter kind can cause * performance issues and should not be done without understanding the * implications. All properties are specific to an instance of either the * partition object or the port driver. * * The public properties are: mtu and linkmode. * mtu is a read-only property. * linkmode can take two values - UD and CM. * * Changing the linkmode requires some bookkeeping in the driver. The * capabilities need to be re-reported to the mac layer. This is done by * calling mac_capab_update(). The maxsdu is updated by calling * mac_maxsdu_update2(). * The private properties retain their values across the change of linkmode. * NOTE: * - The port driver does not support any property apart from mtu. * - All other properties are only meant for the partition object. * - The properties cannot be set when an instance is plumbed. The * instance has to be unplumbed to effect any setting. */ /* * Driver-wide tunables * * ibd_tx_softintr * ibd_rx_softintr * The softintr mechanism allows ibd to avoid event queue overflows if * the receive/completion handlers turn out to be expensive. These are enabled * by default. * * ibd_log_sz * This specifies the size of the ibd log buffer in bytes. The buffer is * allocated and logging is enabled only when IBD_LOGGING is defined. * */ uint_t ibd_rx_softintr = 1; uint_t ibd_tx_softintr = 1; #ifdef IBD_LOGGING uint_t ibd_log_sz = 0x20000; #endif #ifdef IBD_LOGGING #define IBD_LOG_SZ ibd_log_sz #endif /* Post IBD_RX_POST_CNT receive work requests at a time.
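 *
 * (For scale: with the values defined below, IBD_RX_POST_CNT is 8 and
 * IBD_LOG_RX_POST is 4, giving 1 << 4 = 16 rx post queues, so IBD_RWQE_MIN
 * works out to (8 << 4) * 4 = 512 receive work requests.)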
*/ #define IBD_RX_POST_CNT 8 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ #define IBD_LOG_RX_POST 4 /* Minimum number of receive work requests driver needs to always have */ #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) /* * LSO parameters */ #define IBD_LSO_MAXLEN 65536 #define IBD_LSO_BUFSZ 8192 /* * Async operation states */ #define IBD_OP_NOTSTARTED 0 #define IBD_OP_ONGOING 1 #define IBD_OP_COMPLETED 2 #define IBD_OP_ERRORED 3 #define IBD_OP_ROUTERED 4 /* * Start/stop in-progress flags; note that restart must always remain * the OR of start and stop flag values. */ #define IBD_DRV_START_IN_PROGRESS 0x10000000 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS /* * Miscellaneous constants */ #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF #define IBD_DEF_MAX_SDU 2044 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) #define IBD_DEF_RC_MAX_SDU 65520 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) #define IBD_DEFAULT_QKEY 0xB1B #ifdef IBD_LOGGING #define IBD_DMAX_LINE 100 #endif /* * Enumerations for link states */ typedef enum { IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT } ibd_link_op_t; /* * Driver State Pointer */ void *ibd_list; /* * Driver Global Data */ ibd_global_state_t ibd_gstate; /* * Partition object list */ ibd_state_t *ibd_objlist_head = NULL; kmutex_t ibd_objlist_lock; int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */ /* * Logging */ #ifdef IBD_LOGGING kmutex_t ibd_lbuf_lock; uint8_t *ibd_lbuf; uint32_t ibd_lbuf_ndx; #endif /* * Required system entry points */ static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); /* * Required driver entry points for GLDv3 */ static int ibd_m_stat(void *, uint_t, uint64_t *); static int ibd_m_start(void *); static void ibd_m_stop(void *); static int ibd_m_promisc(void *, boolean_t); static int ibd_m_multicst(void *, boolean_t, const uint8_t *); static int ibd_m_unicst(void *, const uint8_t *); static mblk_t *ibd_m_tx(void *, mblk_t *); static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, mac_prop_info_handle_t); static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, const void *); static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); /* * Private driver entry points for GLDv3 */ /* * Initialization */ static int ibd_state_init(ibd_state_t *, dev_info_t *); static int ibd_init_txlist(ibd_state_t *); static int ibd_init_rxlist(ibd_state_t *); static int ibd_acache_init(ibd_state_t *); #ifdef IBD_LOGGING static void ibd_log_init(void); #endif /* * Termination/cleanup */ static void ibd_state_fini(ibd_state_t *); static void ibd_fini_txlist(ibd_state_t *); static void ibd_fini_rxlist(ibd_state_t *); static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); static void ibd_acache_fini(ibd_state_t *); #ifdef IBD_LOGGING static void ibd_log_fini(void); #endif /* * Allocation/acquire/map routines */ static int ibd_alloc_tx_copybufs(ibd_state_t *); static int ibd_alloc_rx_copybufs(ibd_state_t *); static int ibd_alloc_tx_lsobufs(ibd_state_t *); static ibd_swqe_t 
*ibd_acquire_swqe(ibd_state_t *); static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, uint32_t *); /* * Free/release/unmap routines */ static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); static void ibd_free_tx_copybufs(ibd_state_t *); static void ibd_free_rx_copybufs(ibd_state_t *); static void ibd_free_rx_rsrcs(ibd_state_t *); static void ibd_free_tx_lsobufs(ibd_state_t *); static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); /* * Handlers/callback routines */ static uint_t ibd_intr(caddr_t); static uint_t ibd_tx_recycle(caddr_t); static void ibd_rcq_handler(ibt_cq_hdl_t, void *); static void ibd_scq_handler(ibt_cq_hdl_t, void *); static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); static void ibd_freemsg_cb(char *); static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); static void ibd_snet_notices_handler(void *, ib_gid_t, ibt_subnet_event_code_t, ibt_subnet_event_t *); /* * Send/receive routines */ static boolean_t ibd_send(ibd_state_t *, mblk_t *); static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); /* * Threads */ static void ibd_async_work(ibd_state_t *); /* * Async tasks */ static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); static void ibd_async_setprom(ibd_state_t *); static void ibd_async_unsetprom(ibd_state_t *); static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); static void ibd_async_trap(ibd_state_t *, ibd_req_t *); static void ibd_async_txsched(ibd_state_t *); static void ibd_async_link(ibd_state_t *, ibd_req_t *); /* * Async task helpers */ static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *, ipoib_mac_t *); static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); static ibt_status_t ibd_find_bgroup(ibd_state_t *); static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); static uint64_t ibd_get_portspeed(ibd_state_t *); static boolean_t ibd_async_safe(ibd_state_t *); static void ibd_async_done(ibd_state_t *); static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); /* * Helpers for attach/start routines */ static int ibd_register_mac(ibd_state_t *, dev_info_t *); static int ibd_record_capab(ibd_state_t *); static int ibd_get_port_details(ibd_state_t *); static int ibd_alloc_cqs(ibd_state_t *); static int ibd_setup_ud_channel(ibd_state_t *); 
static int ibd_start(ibd_state_t *); static int ibd_undo_start(ibd_state_t *, link_state_t); static void ibd_set_mac_progress(ibd_state_t *, uint_t); static void ibd_clr_mac_progress(ibd_state_t *, uint_t); static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); static void ibd_part_unattach(ibd_state_t *state); static int ibd_port_attach(dev_info_t *); static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); static int ibd_get_port_state(ibd_state_t *, link_state_t *); static int ibd_part_busy(ibd_state_t *); /* * Miscellaneous helpers */ static int ibd_sched_poll(ibd_state_t *, int, int); static void ibd_resume_transmission(ibd_state_t *); static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); static void *list_get_head(list_t *); static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); static uint_t ibd_hash_by_id(void *, mod_hash_key_t); ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); #ifdef IBD_LOGGING static void ibd_log(const char *, ...); #endif DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); /* Module Driver Info */ static struct modldrv ibd_modldrv = { &mod_driverops, /* This one is a driver */ "InfiniBand GLDv3 Driver", /* short description */ &ibd_dev_ops /* driver specific ops */ }; /* Module Linkage */ static struct modlinkage ibd_modlinkage = { MODREV_1, (void *)&ibd_modldrv, NULL }; /* * Module (static) info passed to IBTL during ibt_attach */ static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { IBTI_V_CURR, IBT_NETWORK, ibd_async_handler, NULL, "IBPART" }; static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { IBTI_V_CURR, IBT_NETWORK, ibdpd_async_handler, NULL, "IPIB" }; /* * GLDv3 entry points */ #define IBD_M_CALLBACK_FLAGS \ (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) static mac_callbacks_t ibd_m_callbacks = { IBD_M_CALLBACK_FLAGS, ibd_m_stat, ibd_m_start, ibd_m_stop, ibd_m_promisc, ibd_m_multicst, ibd_m_unicst, ibd_m_tx, NULL, NULL, ibd_m_getcapab, NULL, NULL, ibd_m_setprop, ibd_m_getprop, ibd_m_propinfo }; /* Private properties */ char *ibd_priv_props[] = { "_ibd_broadcast_group", "_ibd_coalesce_completions", "_ibd_create_broadcast_group", "_ibd_hash_size", "_ibd_lso_enable", "_ibd_num_ah", "_ibd_num_lso_bufs", "_ibd_rc_enable_srq", "_ibd_rc_num_rwqe", "_ibd_rc_num_srq", "_ibd_rc_num_swqe", "_ibd_rc_rx_comp_count", "_ibd_rc_rx_comp_usec", "_ibd_rc_rx_copy_thresh", "_ibd_rc_rx_rwqe_thresh", "_ibd_rc_tx_comp_count", "_ibd_rc_tx_comp_usec", "_ibd_rc_tx_copy_thresh", "_ibd_ud_num_rwqe", "_ibd_ud_num_swqe", "_ibd_ud_rx_comp_count", "_ibd_ud_rx_comp_usec", "_ibd_ud_tx_comp_count", "_ibd_ud_tx_comp_usec", "_ibd_ud_tx_copy_thresh", NULL }; static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *); static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); static dld_ioc_info_t ibd_dld_ioctl_list[] = { {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), ibd_create_partition, secpolicy_dl_config}, {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), ibd_delete_partition, secpolicy_dl_config}, {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), ibd_get_partition_info, NULL} }; /* * Fill/clear the scope and pkey in a multicast/broadcast address */ #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \
{ \ *(uint32_t *)((char *)(maddr) + 4) |= \ htonl((uint32_t)(scope) << 16); \ *(uint32_t *)((char *)(maddr) + 8) |= \ htonl((uint32_t)(pkey) << 16); \ } #define IBD_CLEAR_SCOPE_PKEY(maddr) \ { \ *(uint32_t *)((char *)(maddr) + 4) &= \ htonl(~((uint32_t)0xF << 16)); \ *(uint32_t *)((char *)(maddr) + 8) &= \ htonl(~((uint32_t)0xFFFF << 16)); \ } /* * Rudimentary debugging support */ #ifdef DEBUG int ibd_debuglevel = 100; void debug_print(int l, char *fmt, ...) { va_list ap; if (l < ibd_debuglevel) return; va_start(ap, fmt); vcmn_err(CE_CONT, fmt, ap); va_end(ap); } #endif /* * Common routine to print warning messages; adds in hca guid, port number * and pkey to be able to identify the IBA interface. */ void ibd_print_warn(ibd_state_t *state, char *fmt, ...) { ib_guid_t hca_guid; char ibd_print_buf[MAXNAMELEN + 256]; int len; va_list ap; char part_name[MAXNAMELEN]; datalink_id_t linkid = state->id_plinkid; hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 0, "hca-guid", 0); (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL); len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ", ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), (u_longlong_t)hca_guid, state->id_port, state->id_pkey, part_name); va_start(ap, fmt); (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, fmt, ap); cmn_err(CE_NOTE, "!%s", ibd_print_buf); va_end(ap); } /* * Warlock directives */ /* * id_lso_lock * * state->id_lso->bkt_nfree may be accessed without a lock to * determine the threshold at which we have to ask the nw layer * to resume transmission (see ibd_resume_transmission()). */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, ibd_state_t::id_lso)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) /* * id_scq_poll_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, ibd_state_t::id_scq_poll_busy)) /* * id_txpost_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, ibd_state_t::id_tx_head)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, ibd_state_t::id_tx_busy)) /* * id_acache_req_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_acache_req_cv)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_req_list)) _NOTE(SCHEME_PROTECTS_DATA("atomic", ibd_acache_s::ac_ref)) /* * id_ac_mutex * * This mutex is actually supposed to protect id_ah_op as well, * but this path of the code isn't clean (see update of id_ah_op * in ibd_async_acache(), immediately after the call to * ibd_async_mcache()). For now, we'll skip this check by * declaring that id_ah_op is protected by some internal scheme * that warlock isn't aware of. 
*/ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_active)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_addr)) _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", ibd_state_t::id_ah_op)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_error)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ac_hot_ace)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) /* * id_mc_mutex */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, ibd_state_t::id_mc_full)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, ibd_state_t::id_mc_non)) /* * id_trap_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, ibd_state_t::id_trap_cv)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, ibd_state_t::id_trap_stop)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, ibd_state_t::id_trap_inprog)) /* * id_prom_op */ _NOTE(SCHEME_PROTECTS_DATA("only by async thread", ibd_state_t::id_prom_op)) /* * id_sched_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, ibd_state_t::id_sched_needed)) /* * id_link_mutex */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, ibd_state_t::id_link_state)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", ibd_state_t::id_link_speed)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) /* * id_tx_list.dl_mutex */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, ibd_state_t::id_tx_list.dl_head)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, ibd_state_t::id_tx_list.dl_pending_sends)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, ibd_state_t::id_tx_list.dl_cnt)) /* * id_rx_list.dl_mutex */ _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", ibd_state_t::id_rx_list.dl_bufs_outstanding)) _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", ibd_state_t::id_rx_list.dl_cnt)) /* * rc_timeout_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, ibd_state_t::rc_timeout_start)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, ibd_state_t::rc_timeout)) /* * Items protected by atomic updates */ _NOTE(SCHEME_PROTECTS_DATA("atomic update only", ibd_state_s::id_brd_rcv ibd_state_s::id_brd_xmt ibd_state_s::id_multi_rcv ibd_state_s::id_multi_xmt ibd_state_s::id_num_intrs ibd_state_s::id_rcv_bytes ibd_state_s::id_rcv_pkt ibd_state_s::id_rx_post_queue_index ibd_state_s::id_tx_short ibd_state_s::id_xmt_bytes ibd_state_s::id_xmt_pkt ibd_state_s::rc_rcv_trans_byte ibd_state_s::rc_rcv_trans_pkt ibd_state_s::rc_rcv_copy_byte ibd_state_s::rc_rcv_copy_pkt ibd_state_s::rc_xmt_bytes ibd_state_s::rc_xmt_small_pkt ibd_state_s::rc_xmt_fragmented_pkt ibd_state_s::rc_xmt_map_fail_pkt ibd_state_s::rc_xmt_map_succ_pkt ibd_rc_chan_s::rcq_invoking)) /* * Non-mutex protection schemes for data elements. Almost all of * these are non-shared items. 
*/ _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", callb_cpr ib_gid_s ib_header_info ibd_acache_rq ibd_acache_s::ac_mce ibd_acache_s::ac_chan ibd_mcache::mc_fullreap ibd_mcache::mc_jstate ibd_mcache::mc_req ibd_rwqe_s ibd_swqe_s ibd_wqe_s ibt_wr_ds_s::ds_va ibt_wr_lso_s ipoib_mac::ipoib_qpn mac_capab_lso_s msgb::b_next msgb::b_cont msgb::b_rptr msgb::b_wptr ibd_state_s::id_bgroup_created ibd_state_s::id_mac_state ibd_state_s::id_mtu ibd_state_s::id_ud_num_rwqe ibd_state_s::id_ud_num_swqe ibd_state_s::id_qpnum ibd_state_s::id_rcq_hdl ibd_state_s::id_rx_buf_sz ibd_state_s::id_rx_bufs ibd_state_s::id_rx_mr_hdl ibd_state_s::id_rx_wqes ibd_state_s::id_rxwcs ibd_state_s::id_rxwcs_size ibd_state_s::id_rx_nqueues ibd_state_s::id_rx_queues ibd_state_s::id_scope ibd_state_s::id_scq_hdl ibd_state_s::id_tx_buf_sz ibd_state_s::id_tx_bufs ibd_state_s::id_tx_mr_hdl ibd_state_s::id_tx_rel_list.dl_cnt ibd_state_s::id_tx_wqes ibd_state_s::id_txwcs ibd_state_s::id_txwcs_size ibd_state_s::rc_listen_hdl ibd_state_s::rc_listen_hdl_OFED_interop ibd_state_s::rc_srq_size ibd_state_s::rc_srq_rwqes ibd_state_s::rc_srq_rx_bufs ibd_state_s::rc_srq_rx_mr_hdl ibd_state_s::rc_tx_largebuf_desc_base ibd_state_s::rc_tx_mr_bufs ibd_state_s::rc_tx_mr_hdl ipha_s icmph_s ibt_path_info_s::pi_sid ibd_rc_chan_s::ace ibd_rc_chan_s::chan_hdl ibd_rc_chan_s::state ibd_rc_chan_s::chan_state ibd_rc_chan_s::is_tx_chan ibd_rc_chan_s::rcq_hdl ibd_rc_chan_s::rcq_size ibd_rc_chan_s::scq_hdl ibd_rc_chan_s::scq_size ibd_rc_chan_s::rx_bufs ibd_rc_chan_s::rx_mr_hdl ibd_rc_chan_s::rx_rwqes ibd_rc_chan_s::tx_wqes ibd_rc_chan_s::tx_mr_bufs ibd_rc_chan_s::tx_mr_hdl ibd_rc_chan_s::tx_rel_list.dl_cnt ibd_rc_chan_s::is_used ibd_rc_tx_largebuf_s::lb_buf ibd_rc_msg_hello_s ibt_cm_return_args_s)) /* * ibd_rc_chan_s::next is protected by two mutexes: * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. */ _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes", ibd_rc_chan_s::next)) /* * ibd_state_s.rc_tx_large_bufs_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, ibd_state_s::rc_tx_largebuf_free_head)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, ibd_state_s::rc_tx_largebuf_nfree)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, ibd_rc_tx_largebuf_s::lb_next)) /* * ibd_acache_s.tx_too_big_mutex */ _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex, ibd_acache_s::tx_too_big_ongoing)) /* * tx_wqe_list.dl_mutex */ _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, ibd_rc_chan_s::tx_wqe_list.dl_head)) _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, ibd_rc_chan_s::tx_wqe_list.dl_pending_sends)) _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, ibd_rc_chan_s::tx_wqe_list.dl_cnt)) /* * ibd_state_s.rc_ace_recycle_lock */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock, ibd_state_s::rc_ace_recycle)) /* * rc_srq_rwqe_list.dl_mutex */ _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding)) _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", ibd_state_t::rc_srq_rwqe_list.dl_cnt)) /* * Non-mutex protection schemes for data elements. They are counters * for problem diagnosis. Don't need be protected. 
*/ _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", ibd_state_s::rc_rcv_alloc_fail ibd_state_s::rc_rcq_err ibd_state_s::rc_ace_not_found ibd_state_s::rc_xmt_drop_too_long_pkt ibd_state_s::rc_xmt_icmp_too_long_pkt ibd_state_s::rc_xmt_reenter_too_long_pkt ibd_state_s::rc_swqe_short ibd_state_s::rc_swqe_mac_update ibd_state_s::rc_xmt_buf_short ibd_state_s::rc_xmt_buf_mac_update ibd_state_s::rc_scq_no_swqe ibd_state_s::rc_scq_no_largebuf ibd_state_s::rc_conn_succ ibd_state_s::rc_conn_fail ibd_state_s::rc_null_conn ibd_state_s::rc_no_estab_conn ibd_state_s::rc_act_close ibd_state_s::rc_pas_close ibd_state_s::rc_delay_ace_recycle ibd_state_s::rc_act_close_simultaneous ibd_state_s::rc_act_close_not_clean ibd_state_s::rc_pas_close_rcq_invoking ibd_state_s::rc_reset_cnt ibd_state_s::rc_timeout_act ibd_state_s::rc_timeout_pas ibd_state_s::rc_stop_connect)) #ifdef DEBUG /* * Non-mutex protection schemes for data elements. They are counters * for problem diagnosis. Don't need be protected. */ _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", ibd_state_s::rc_rwqe_short ibd_rc_stat_s::rc_rcv_trans_byte ibd_rc_stat_s::rc_rcv_trans_pkt ibd_rc_stat_s::rc_rcv_copy_byte ibd_rc_stat_s::rc_rcv_copy_pkt ibd_rc_stat_s::rc_rcv_alloc_fail ibd_rc_stat_s::rc_rcq_err ibd_rc_stat_s::rc_rwqe_short ibd_rc_stat_s::rc_xmt_bytes ibd_rc_stat_s::rc_xmt_small_pkt ibd_rc_stat_s::rc_xmt_fragmented_pkt ibd_rc_stat_s::rc_xmt_map_fail_pkt ibd_rc_stat_s::rc_xmt_map_succ_pkt ibd_rc_stat_s::rc_ace_not_found ibd_rc_stat_s::rc_scq_no_swqe ibd_rc_stat_s::rc_scq_no_largebuf ibd_rc_stat_s::rc_swqe_short ibd_rc_stat_s::rc_swqe_mac_update ibd_rc_stat_s::rc_xmt_buf_short ibd_rc_stat_s::rc_xmt_buf_mac_update ibd_rc_stat_s::rc_conn_succ ibd_rc_stat_s::rc_conn_fail ibd_rc_stat_s::rc_null_conn ibd_rc_stat_s::rc_no_estab_conn ibd_rc_stat_s::rc_act_close ibd_rc_stat_s::rc_pas_close ibd_rc_stat_s::rc_delay_ace_recycle ibd_rc_stat_s::rc_act_close_simultaneous ibd_rc_stat_s::rc_reset_cnt ibd_rc_stat_s::rc_timeout_act ibd_rc_stat_s::rc_timeout_pas)) #endif int _init() { int status; status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), PAGESIZE), 0); if (status != 0) { DPRINT(10, "_init:failed in ddi_soft_state_init()"); return (status); } mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); mac_init_ops(&ibd_dev_ops, "ibp"); status = mod_install(&ibd_modlinkage); if (status != 0) { DPRINT(10, "_init:failed in mod_install()"); ddi_soft_state_fini(&ibd_list); mac_fini_ops(&ibd_dev_ops); return (status); } mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); mutex_enter(&ibd_gstate.ig_mutex); ibd_gstate.ig_ibt_hdl = NULL; ibd_gstate.ig_ibt_hdl_ref_cnt = 0; ibd_gstate.ig_service_list = NULL; mutex_exit(&ibd_gstate.ig_mutex); if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { return (EIO); } ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); #ifdef IBD_LOGGING ibd_log_init(); #endif return (0); } int _info(struct modinfo *modinfop) { return (mod_info(&ibd_modlinkage, modinfop)); } int _fini() { int status; status = mod_remove(&ibd_modlinkage); if (status != 0) return (status); ibt_unregister_part_attr_cb(); mac_fini_ops(&ibd_dev_ops); mutex_destroy(&ibd_objlist_lock); ddi_soft_state_fini(&ibd_list); mutex_destroy(&ibd_gstate.ig_mutex); #ifdef IBD_LOGGING ibd_log_fini(); #endif return (0); } /* * Convert the GID part of the mac address from network byte order * to host order. 
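 *
 * (For reference: the IPoIB hardware address handled here is IPOIB_ADDRL
 * (20) bytes long - a 4-byte QPN followed by the 16-byte destination GID,
 * i.e. the 8-byte subnet prefix carried in ipoib_gidpref and the 8-byte
 * port GUID carried in ipoib_gidsuff, all in network byte order on the
 * wire.)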
*/ static void ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) { ib_sn_prefix_t nbopref; ib_guid_t nboguid; bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); dgid->gid_prefix = b2h64(nbopref); dgid->gid_guid = b2h64(nboguid); } /* * Create the IPoIB address in network byte order from host order inputs. */ static void ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, ib_guid_t guid) { ib_sn_prefix_t nbopref; ib_guid_t nboguid; mac->ipoib_qpn = htonl(qpn); nbopref = h2b64(prefix); nboguid = h2b64(guid); bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); } /* * Send to the appropriate all-routers group when the IBA multicast group * does not exist, based on whether the target group is v4 or v6. */ static boolean_t ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, ipoib_mac_t *rmac) { boolean_t retval = B_TRUE; uint32_t adjscope = state->id_scope << 16; uint32_t topword; /* * Copy the first 4 bytes in without assuming any alignment of * input mac address; this will have IPoIB signature, flags and * scope bits. */ bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); topword = ntohl(topword); /* * Generate proper address for IPv4/v6, adding in the Pkey properly. */ if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | ((uint32_t)(state->id_pkey << 16))), (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); else /* * Does not have proper bits in the mgid address. */ retval = B_FALSE; return (retval); } /* * Membership states for different mcg's are tracked by two lists: * the "non" list is used for promiscuous mode, when all mcg traffic * needs to be inspected. This type of membership is never used for * transmission, so there can not be an AH in the active list * corresponding to a member in this list. This list does not need * any protection, since all operations are performed by the async * thread. * * "Full" and "SendOnly" membership is tracked using a single list, * the "full" list. This is because this single list can then be * searched during transmit to a multicast group (if an AH for the * mcg is not found in the active list), since at least one type * of membership must be present before initiating the transmit. * This list is also emptied during driver detach, since sendonly * membership acquired during transmit is dropped at detach time * along with ipv4 broadcast full membership. Insert/deletes to * this list are done only by the async thread, but it is also * searched in program context (see multicast disable case), thus * the id_mc_mutex protects the list. The driver detach path also * deconstructs the "full" list, but it ensures that the async * thread will not be accessing the list (by blocking out mcg * trap handling and making sure no more Tx reaping will happen). * * Currently, an IBA attach is done in the SendOnly case too, * although this is not required. 
*/ #define IBD_MCACHE_INSERT_FULL(state, mce) \ list_insert_head(&state->id_mc_full, mce) #define IBD_MCACHE_INSERT_NON(state, mce) \ list_insert_head(&state->id_mc_non, mce) #define IBD_MCACHE_FIND_FULL(state, mgid) \ ibd_mcache_find(mgid, &state->id_mc_full) #define IBD_MCACHE_FIND_NON(state, mgid) \ ibd_mcache_find(mgid, &state->id_mc_non) #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ list_remove(&state->id_mc_full, mce) #define IBD_MCACHE_PULLOUT_NON(state, mce) \ list_remove(&state->id_mc_non, mce) static void * list_get_head(list_t *list) { list_node_t *lhead = list_head(list); if (lhead != NULL) list_remove(list, lhead); return (lhead); } /* * This is always guaranteed to be able to queue the work. */ void ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) { /* Initialize request */ DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); ptr->rq_op = op; /* * Queue provided slot onto request pool. */ mutex_enter(&state->id_acache_req_lock); list_insert_tail(&state->id_req_list, ptr); /* Go, fetch, async thread */ cv_signal(&state->id_acache_req_cv); mutex_exit(&state->id_acache_req_lock); } /* * Main body of the per interface async thread. */ static void ibd_async_work(ibd_state_t *state) { ibd_req_t *ptr; callb_cpr_t cprinfo; mutex_enter(&state->id_acache_req_lock); CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, callb_generic_cpr, "ibd_async_work"); for (;;) { ptr = list_get_head(&state->id_req_list); if (ptr != NULL) { mutex_exit(&state->id_acache_req_lock); /* * If we are in late hca initialization mode, do not * process any other async request other than TRAP. TRAP * is used for indicating creation of a broadcast group; * in which case, we need to join/create the group. */ if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && (ptr->rq_op != IBD_ASYNC_TRAP)) { goto free_req_and_continue; } /* * Once we have done the operation, there is no * guarantee the request slot is going to be valid, * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, * TRAP). * * Perform the request. */ switch (ptr->rq_op) { case IBD_ASYNC_GETAH: ibd_async_acache(state, &ptr->rq_mac); break; case IBD_ASYNC_JOIN: case IBD_ASYNC_LEAVE: ibd_async_multicast(state, ptr->rq_gid, ptr->rq_op); break; case IBD_ASYNC_PROMON: ibd_async_setprom(state); break; case IBD_ASYNC_PROMOFF: ibd_async_unsetprom(state); break; case IBD_ASYNC_REAP: ibd_async_reap_group(state, ptr->rq_ptr, ptr->rq_gid, IB_MC_JSTATE_FULL); /* * the req buf contains in mce * structure, so we do not need * to free it here. */ ptr = NULL; break; case IBD_ASYNC_TRAP: ibd_async_trap(state, ptr); break; case IBD_ASYNC_SCHED: ibd_async_txsched(state); break; case IBD_ASYNC_LINK: ibd_async_link(state, ptr); break; case IBD_ASYNC_EXIT: mutex_enter(&state->id_acache_req_lock); #ifndef __lock_lint CALLB_CPR_EXIT(&cprinfo); #else mutex_exit(&state->id_acache_req_lock); #endif return; case IBD_ASYNC_RC_TOO_BIG: ibd_async_rc_process_too_big(state, ptr); break; case IBD_ASYNC_RC_CLOSE_ACT_CHAN: ibd_async_rc_close_act_chan(state, ptr); break; case IBD_ASYNC_RC_RECYCLE_ACE: ibd_async_rc_recycle_ace(state, ptr); break; case IBD_ASYNC_RC_CLOSE_PAS_CHAN: (void) ibd_rc_pas_close(ptr->rq_ptr, B_TRUE, B_TRUE); break; } free_req_and_continue: if (ptr != NULL) kmem_cache_free(state->id_req_kmc, ptr); mutex_enter(&state->id_acache_req_lock); } else { #ifndef __lock_lint /* * Nothing to do: wait till new request arrives. 
*/ CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&state->id_acache_req_cv, &state->id_acache_req_lock); CALLB_CPR_SAFE_END(&cprinfo, &state->id_acache_req_lock); #endif } } /*NOTREACHED*/ _NOTE(NOT_REACHED) } /* * Return when it is safe to queue requests to the async daemon; primarily * for subnet trap and async event handling. Disallow requests before the * daemon is created, and when interface deinitialization starts. */ static boolean_t ibd_async_safe(ibd_state_t *state) { mutex_enter(&state->id_trap_lock); if (state->id_trap_stop) { mutex_exit(&state->id_trap_lock); return (B_FALSE); } state->id_trap_inprog++; mutex_exit(&state->id_trap_lock); return (B_TRUE); } /* * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet * trap or event handling to complete, so that it can kill the async thread and * deconstruct the mcg/ace list. */ static void ibd_async_done(ibd_state_t *state) { mutex_enter(&state->id_trap_lock); if (--state->id_trap_inprog == 0) cv_signal(&state->id_trap_cv); mutex_exit(&state->id_trap_lock); } /* * Hash functions: * ibd_hash_by_id: Returns the qpn as the hash entry into the bucket. * ibd_hash_key_cmp: Compares two keys; returns 0 on a match, else 1. * These operate on mac addresses input into ibd_send, but there is no * guarantee on the alignment of the ipoib_mac_t structure. */ /*ARGSUSED*/ static uint_t ibd_hash_by_id(void *hash_data, mod_hash_key_t key) { ulong_t ptraddr = (ulong_t)key; uint_t hval; /* * If the input address is 4 byte aligned, we can just dereference * it. This is most common, since IP will send in a 4 byte aligned * IP header, which implies the 24 byte IPoIB pseudo header will be * 4 byte aligned too. */ if ((ptraddr & 3) == 0) return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); return (hval); } static int ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) { if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) return (0); else return (1); } /* * Initialize all the per-interface caches and lists; AH cache, * MCG list etc.
*/ static int ibd_acache_init(ibd_state_t *state) { ibd_ace_t *ce; int i; mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); mutex_enter(&state->id_ac_mutex); list_create(&state->id_ah_free, sizeof (ibd_ace_t), offsetof(ibd_ace_t, ac_list)); list_create(&state->id_ah_active, sizeof (ibd_ace_t), offsetof(ibd_ace_t, ac_list)); state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); list_create(&state->id_mc_full, sizeof (ibd_mce_t), offsetof(ibd_mce_t, mc_list)); list_create(&state->id_mc_non, sizeof (ibd_mce_t), offsetof(ibd_mce_t, mc_list)); state->id_ac_hot_ace = NULL; state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * state->id_num_ah, KM_SLEEP); for (i = 0; i < state->id_num_ah; i++, ce++) { if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { mutex_exit(&state->id_ac_mutex); ibd_acache_fini(state); return (DDI_FAILURE); } else { CLEAR_REFCYCLE(ce); ce->ac_mce = NULL; mutex_init(&ce->tx_too_big_mutex, NULL, MUTEX_DRIVER, NULL); IBD_ACACHE_INSERT_FREE(state, ce); } } mutex_exit(&state->id_ac_mutex); return (DDI_SUCCESS); } static void ibd_acache_fini(ibd_state_t *state) { ibd_ace_t *ptr; mutex_enter(&state->id_ac_mutex); while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { ASSERT(GET_REF(ptr) == 0); mutex_destroy(&ptr->tx_too_big_mutex); (void) ibt_free_ud_dest(ptr->ac_dest); } while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { ASSERT(GET_REF(ptr) == 0); mutex_destroy(&ptr->tx_too_big_mutex); (void) ibt_free_ud_dest(ptr->ac_dest); } list_destroy(&state->id_ah_free); list_destroy(&state->id_ah_active); list_destroy(&state->id_mc_full); list_destroy(&state->id_mc_non); kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); mutex_exit(&state->id_ac_mutex); mutex_destroy(&state->id_ac_mutex); mutex_destroy(&state->id_mc_mutex); } /* * Search AH active hash list for a cached path to input destination. * If we are "just looking", hold == F. When we are in the Tx path, * we set hold == T to grab a reference on the AH so that it can not * be recycled to a new destination while the Tx request is posted. */ ibd_ace_t * ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) { ibd_ace_t *ptr; ASSERT(mutex_owned(&state->id_ac_mutex)); /* * Do hash search. */ if (mod_hash_find(state->id_ah_active_hash, (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { if (hold) INC_REF(ptr, num); return (ptr); } return (NULL); } /* * This is called by the tx side; if an initialized AH is found in * the active list, it is locked down and can be used; if no entry * is found, an async request is queued to do path resolution. */ static ibd_ace_t * ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) { ibd_ace_t *ptr; ibd_req_t *req; /* * Only attempt to print when we can; in the mdt pattr case, the * address is not aligned properly. 
*/ if (((ulong_t)mac & 3) == 0) { DPRINT(4, "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), htonl(mac->ipoib_gidsuff[1])); } mutex_enter(&state->id_ac_mutex); if (((ptr = state->id_ac_hot_ace) != NULL) && (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { INC_REF(ptr, numwqe); mutex_exit(&state->id_ac_mutex); return (ptr); } if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { state->id_ac_hot_ace = ptr; mutex_exit(&state->id_ac_mutex); return (ptr); } /* * Implementation of a single outstanding async request; if * the operation is not started yet, queue a request and move * to ongoing state. Remember in id_ah_addr for which address * we are queueing the request, in case we need to flag an error; * Any further requests, for the same or different address, until * the operation completes, is sent back to GLDv3 to be retried. * The async thread will update id_ah_op with an error indication * or will set it to indicate the next look up can start; either * way, it will mac_tx_update() so that all blocked requests come * back here. */ *err = EAGAIN; if (state->id_ah_op == IBD_OP_NOTSTARTED) { req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req != NULL) { /* * We did not even find the entry; queue a request * for it. */ bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); state->id_ah_op = IBD_OP_ONGOING; ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); } } else if ((state->id_ah_op != IBD_OP_ONGOING) && (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { /* * Check the status of the pathrecord lookup request * we had queued before. */ if (state->id_ah_op == IBD_OP_ERRORED) { *err = EFAULT; state->id_ah_error++; } else { /* * IBD_OP_ROUTERED case: We need to send to the * all-router MCG. If we can find the AH for * the mcg, the Tx will be attempted. If we * do not find the AH, we return NORESOURCES * to retry. */ ipoib_mac_t routermac; (void) ibd_get_allroutergroup(state, mac, &routermac); ptr = ibd_acache_find(state, &routermac, B_TRUE, numwqe); } state->id_ah_op = IBD_OP_NOTSTARTED; } else if ((state->id_ah_op != IBD_OP_ONGOING) && (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { /* * This case can happen when we get a higher band * packet. The easiest way is to reset the state machine * to accommodate the higher priority packet. */ state->id_ah_op = IBD_OP_NOTSTARTED; } mutex_exit(&state->id_ac_mutex); return (ptr); } /* * Grab a not-currently-in-use AH/PathRecord from the active * list to recycle to a new destination. Only the async thread * executes this code. */ static ibd_ace_t * ibd_acache_get_unref(ibd_state_t *state) { ibd_ace_t *ptr = list_tail(&state->id_ah_active); boolean_t try_rc_chan_recycle = B_FALSE; ASSERT(mutex_owned(&state->id_ac_mutex)); /* * Do plain linear search. */ while (ptr != NULL) { /* * Note that it is possible that the "cycle" bit * is set on the AH w/o any reference count. The * mcg must have been deleted, and the tx cleanup * just decremented the reference count to 0, but * hasn't gotten around to grabbing the id_ac_mutex * to move the AH into the free list. 
*/ if (GET_REF(ptr) == 0) { if (ptr->ac_chan != NULL) { ASSERT(state->id_enable_rc == B_TRUE); if (!try_rc_chan_recycle) { try_rc_chan_recycle = B_TRUE; ibd_rc_signal_ace_recycle(state, ptr); } } else { IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); break; } } ptr = list_prev(&state->id_ah_active, ptr); } return (ptr); } /* * Invoked to clean up AH from active list in case of multicast * disable and to handle sendonly memberships during mcg traps. * And for port up processing for multicast and unicast AHs. * Normally, the AH is taken off the active list, and put into * the free list to be recycled for a new destination. In case * Tx requests on the AH have not completed yet, the AH is marked * for reaping (which will put the AH on the free list) once the Tx's * complete; in this case, depending on the "force" input, we take * out the AH from the active list right now, or leave it also for * the reap operation. Returns TRUE if the AH is taken off the active * list (and either put into the free list right now, or arranged for * later), FALSE otherwise. */ boolean_t ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) { ibd_ace_t *acactive; boolean_t ret = B_TRUE; ASSERT(mutex_owned(&state->id_ac_mutex)); if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { /* * Note that the AH might already have the cycle bit set * on it; this might happen if sequences of multicast * enables and disables are coming so fast, that posted * Tx's to the mcg have not completed yet, and the cycle * bit is set successively by each multicast disable. */ if (SET_CYCLE_IF_REF(acactive)) { if (!force) { /* * The ace is kept on the active list, further * Tx's can still grab a reference on it; the * ace is reaped when all pending Tx's * referencing the AH complete. */ ret = B_FALSE; } else { /* * In the mcg trap case, we always pull the * AH from the active list. And also the port * up multi/unicast case. */ ASSERT(acactive->ac_chan == NULL); IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); acactive->ac_mce = NULL; } } else { /* * Determined the ref count is 0, thus reclaim * immediately after pulling out the ace from * the active list. */ ASSERT(acactive->ac_chan == NULL); IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); acactive->ac_mce = NULL; IBD_ACACHE_INSERT_FREE(state, acactive); } } return (ret); } /* * Helper function for async path record lookup. If we are trying to * Tx to a MCG, check our membership, possibly trying to join the * group if required. If that fails, try to send the packet to the * all router group (indicated by the redirect output), pointing * the input mac address to the router mcg address. */ static ibd_mce_t * ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) { ib_gid_t mgid; ibd_mce_t *mce; ipoib_mac_t routermac; *redirect = B_FALSE; ibd_n2h_gid(mac, &mgid); /* * Check the FullMember+SendOnlyNonMember list. * Since we are the only one who manipulates the * id_mc_full list, no locks are needed. */ mce = IBD_MCACHE_FIND_FULL(state, mgid); if (mce != NULL) { DPRINT(4, "ibd_async_mcache : already joined to group"); return (mce); } /* * Not found; try to join(SendOnlyNonMember) and attach. */ DPRINT(4, "ibd_async_mcache : not joined to group"); if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != NULL) { DPRINT(4, "ibd_async_mcache : nonmem joined to group"); return (mce); } /* * MCGroup not present; try to join the all-router group. If * any of the following steps succeed, we will be redirecting * to the all router group. 
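 *
 * (For reference: ibd_get_allroutergroup() builds the all-router mac by
 * keeping the original group's IPv4/IPv6 prefix, scope and pkey bits and
 * using INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP, i.e.
 * 224.0.0.2 - 224.0.0.0 = 2, as the group number.)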
*/ DPRINT(4, "ibd_async_mcache : nonmem join failed"); if (!ibd_get_allroutergroup(state, mac, &routermac)) return (NULL); *redirect = B_TRUE; ibd_n2h_gid(&routermac, &mgid); bcopy(&routermac, mac, IPOIB_ADDRL); DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); /* * Are we already joined to the router group? */ if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { DPRINT(4, "ibd_async_mcache : using already joined router" "group\n"); return (mce); } /* * Can we join(SendOnlyNonMember) the router group? */ DPRINT(4, "ibd_async_mcache : attempting join to router grp"); if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != NULL) { DPRINT(4, "ibd_async_mcache : joined to router grp"); return (mce); } return (NULL); } /* * Async path record lookup code. */ static void ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) { ibd_ace_t *ce; ibd_mce_t *mce = NULL; ibt_path_attr_t path_attr; ibt_path_info_t path_info; ib_gid_t destgid; char ret = IBD_OP_NOTSTARTED; DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), htonl(mac->ipoib_gidsuff[1])); /* * Check whether we are trying to transmit to a MCG. * In that case, we need to make sure we are a member of * the MCG. */ if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { boolean_t redirected; /* * If we can not find or join the group or even * redirect, error out. */ if ((mce = ibd_async_mcache(state, mac, &redirected)) == NULL) { state->id_ah_op = IBD_OP_ERRORED; return; } /* * If we got redirected, we need to determine whether * the AH for the new mcg is in the cache already, and * not pull it in then; otherwise proceed to get the * path for the new mcg. There is no guarantee that * if the AH is currently in the cache, it will still be * there when we look in ibd_acache_lookup(), but that's * okay, we will come back here. */ if (redirected) { ret = IBD_OP_ROUTERED; DPRINT(4, "ibd_async_acache : redirected to " "%08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), htonl(mac->ipoib_gidsuff[1])); mutex_enter(&state->id_ac_mutex); if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { state->id_ah_op = IBD_OP_ROUTERED; mutex_exit(&state->id_ac_mutex); DPRINT(4, "ibd_async_acache : router AH found"); return; } mutex_exit(&state->id_ac_mutex); } } /* * Get an AH from the free list. */ mutex_enter(&state->id_ac_mutex); if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { /* * No free ones; try to grab an unreferenced active * one. Maybe we need to make the active list LRU, * but that will create more work for Tx callbacks. * Is there a way of not having to pull out the * entry from the active list, but just indicate it * is being recycled? Yes, but that creates one more * check in the fast lookup path. */ if ((ce = ibd_acache_get_unref(state)) == NULL) { /* * Pretty serious shortage now. */ state->id_ah_op = IBD_OP_NOTSTARTED; mutex_exit(&state->id_ac_mutex); DPRINT(10, "ibd_async_acache : failed to find AH " "slot\n"); return; } /* * We could check whether ac_mce points to a SendOnly * member and drop that membership now. Or do it lazily * at detach time. */ ce->ac_mce = NULL; } mutex_exit(&state->id_ac_mutex); ASSERT(ce->ac_mce == NULL); /* * Update the entry. 
*/ bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); bzero(&path_info, sizeof (path_info)); bzero(&path_attr, sizeof (ibt_path_attr_t)); path_attr.pa_sgid = state->id_sgid; path_attr.pa_num_dgids = 1; ibd_n2h_gid(&ce->ac_mac, &destgid); path_attr.pa_dgids = &destgid; path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; path_attr.pa_pkey = state->id_pkey; if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); goto error; } if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, ntohl(ce->ac_mac.ipoib_qpn), &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); goto error; } /* * mce is set whenever an AH is being associated with a * MCG; this will come in handy when we leave the MCG. The * lock protects Tx fastpath from scanning the active list. */ if (mce != NULL) ce->ac_mce = mce; /* * initiate a RC mode connection for unicast address */ if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { ASSERT(ce->ac_chan == NULL); DPRINT(10, "ibd_async_acache: call " "ibd_rc_try_connect(ace=%p)", ce); ibd_rc_try_connect(state, ce, &path_info); if (ce->ac_chan == NULL) { DPRINT(10, "ibd_async_acache: fail to setup RC" " channel"); state->rc_conn_fail++; goto error; } } mutex_enter(&state->id_ac_mutex); IBD_ACACHE_INSERT_ACTIVE(state, ce); state->id_ah_op = ret; mutex_exit(&state->id_ac_mutex); return; error: /* * We might want to drop SendOnly membership here if we * joined above. The lock protects Tx callbacks inserting * into the free list. */ mutex_enter(&state->id_ac_mutex); state->id_ah_op = IBD_OP_ERRORED; IBD_ACACHE_INSERT_FREE(state, ce); mutex_exit(&state->id_ac_mutex); } /* * While restoring port's presence on the subnet on a port up, it is possible * that the port goes down again. */ static void ibd_async_link(ibd_state_t *state, ibd_req_t *req) { ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : LINK_STATE_UP; ibd_mce_t *mce, *pmce; ibd_ace_t *ace, *pace; DPRINT(10, "ibd_async_link(): %d", opcode); /* * On a link up, revalidate the link speed/width. No point doing * this on a link down, since we will be unable to do SA operations, * defaulting to the lowest speed. Also notice that we update our * notion of speed before calling mac_link_update(), which will do * necessary higher level notifications for speed changes. */ if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) state->id_link_speed = ibd_get_portspeed(state); _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) } /* * Do all the work required to establish our presence on * the subnet. */ if (opcode == IBD_LINK_UP_ABSENT) { /* * If in promiscuous mode ... */ if (state->id_prom_op == IBD_OP_COMPLETED) { /* * Drop all nonmembership. */ ibd_async_unsetprom(state); /* * Then, try to regain nonmembership to all mcg's. */ ibd_async_setprom(state); } /* * Drop all sendonly membership (which also gets rid of the * AHs); try to reacquire all full membership. 
*/ mce = list_head(&state->id_mc_full); while ((pmce = mce) != NULL) { mce = list_next(&state->id_mc_full, mce); if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) ibd_leave_group(state, pmce->mc_info.mc_adds_vect.av_dgid, IB_MC_JSTATE_SEND_ONLY_NON); else ibd_reacquire_group(state, pmce); } /* * Recycle all active AHs to free list (and if there are * pending posts, make sure they will go into the free list * once the Tx's complete). Grab the lock to prevent * concurrent Tx's as well as Tx cleanups. */ mutex_enter(&state->id_ac_mutex); ace = list_head(&state->id_ah_active); while ((pace = ace) != NULL) { boolean_t cycled; ace = list_next(&state->id_ah_active, ace); mce = pace->ac_mce; if (pace->ac_chan != NULL) { ASSERT(mce == NULL); ASSERT(state->id_enable_rc == B_TRUE); if (pace->ac_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) { INC_REF(pace, 1); IBD_ACACHE_PULLOUT_ACTIVE(state, pace); pace->ac_chan->chan_state = IBD_RC_STATE_ACT_CLOSING; ibd_rc_signal_act_close(state, pace); } else { state->rc_act_close_simultaneous++; DPRINT(40, "ibd_async_link: other " "thread is closing it, ace=%p, " "ac_chan=%p, chan_state=%d", pace, pace->ac_chan, pace->ac_chan->chan_state); } } else { cycled = ibd_acache_recycle(state, &pace->ac_mac, B_TRUE); } /* * If this is for an mcg, it must be for a fullmember, * since we got rid of send-only members above when * processing the mce list. */ ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == IB_MC_JSTATE_FULL))); /* * Check if the fullmember mce needs to be torn down, * ie whether the DLPI disable has already been done. * If so, do some of the work of tx_cleanup, namely * causing leave (which will fail), detach and * mce-freeing. tx_cleanup will put the AH into free * list. The reason to duplicate some of this * tx_cleanup work is because we want to delete the * AH right now instead of waiting for tx_cleanup, to * force subsequent Tx's to reacquire an AH. */ if ((mce != NULL) && (mce->mc_fullreap)) ibd_async_reap_group(state, mce, mce->mc_info.mc_adds_vect.av_dgid, mce->mc_jstate); } mutex_exit(&state->id_ac_mutex); } /* * mac handle is guaranteed to exist since driver does ibt_close_hca() * (which stops further events from being delivered) before * mac_unregister(). At this point, it is guaranteed that mac_register * has already been done. */ mutex_enter(&state->id_link_mutex); state->id_link_state = lstate; mac_link_update(state->id_mh, lstate); mutex_exit(&state->id_link_mutex); ibd_async_done(state); } /* * Check the pkey table to see if we can find the pkey we're looking for. * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on * failure. */ static int ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, uint16_t *pkix) { uint16_t ndx; ASSERT(pkix != NULL); for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { if (pkey_tbl[ndx] == pkey) { *pkix = ndx; return (0); } } return (-1); } /* * Late HCA Initialization: * If plumb had succeeded without the availability of an active port or the * pkey, and either of their availability is now being indicated via PORT_UP * or PORT_CHANGE respectively, try a start of the interface. * * Normal Operation: * When the link is notified up, we need to do a few things, based * on the port's current p_init_type_reply claiming a reinit has been * done or not. The reinit steps are: * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify * the old Pkey and GID0 are correct. * 2. Register for mcg traps (already done by ibmf). * 3. 
 *    If PreservePresenceReply indicates the SM has restored port's presence
 *    in subnet, nothing more to do. Else go to next steps (on async daemon).
 * 4. Give up all sendonly memberships.
 * 5. Acquire all full memberships.
 * 6. In promiscuous mode, acquire all non memberships.
 * 7. Recycle all AHs to free list.
 */
static void
ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
{
	ibt_hca_portinfo_t *port_infop = NULL;
	ibt_status_t ibt_status;
	uint_t psize, port_infosz;
	ibd_link_op_t opcode;
	ibd_req_t *req;
	link_state_t new_link_state = LINK_STATE_UP;
	uint8_t itreply;
	uint16_t pkix;
	int ret;

	/*
	 * Let's not race with a plumb or an unplumb; if we detect a
	 * pkey relocation event later on here, we may have to restart.
	 */
	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);

	mutex_enter(&state->id_link_mutex);

	/*
	 * If the link state is unknown, a plumb has not yet been attempted
	 * on the interface. Nothing to do.
	 */
	if (state->id_link_state == LINK_STATE_UNKNOWN) {
		mutex_exit(&state->id_link_mutex);
		goto link_mod_return;
	}

	/*
	 * If link state is down because of plumb failure, and we are not in
	 * late HCA init, and we were not successfully plumbed, nothing to do.
	 */
	if ((state->id_link_state == LINK_STATE_DOWN) &&
	    ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
	    ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
		mutex_exit(&state->id_link_mutex);
		goto link_mod_return;
	}

	/*
	 * If this routine was called in response to a port down event,
	 * we just need to see whether this new state needs to be reported.
	 */
	if (code == IBT_ERROR_PORT_DOWN) {
		new_link_state = LINK_STATE_DOWN;
		goto update_link_state;
	}

	/*
	 * If it's not a port down event we've received, try to get the port
	 * attributes first. If we fail here, the port is as good as down.
	 * Otherwise, if the link went down by the time the handler gets
	 * here, give up - we cannot even validate the pkey/gid since those
	 * are not valid and this is as bad as a port down anyway.
	 */
	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &port_infop, &psize, &port_infosz);
	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
		new_link_state = LINK_STATE_DOWN;
		goto update_link_state;
	}

	/*
	 * If in the previous attempt the pkey was not found, either because
	 * the port was down or because the pkey was absent from the pkey
	 * table, look for it now and try to start the interface.
	 */
	if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
		mutex_exit(&state->id_link_mutex);
		if ((ret = ibd_start(state)) != 0) {
			DPRINT(10, "ibd_linkmod: cannot start from late HCA "
			    "init, ret=%d", ret);
		}
		ibt_free_portinfo(port_infop, port_infosz);
		goto link_mod_return;
	}

	/*
	 * Check the SM InitTypeReply flags. If both NoLoadReply and
	 * PreserveContentReply are 0, we don't know anything about the
	 * data loaded into the port attributes, so we need to verify
	 * if gid0 and pkey are still valid.
	 */
	itreply = port_infop->p_init_type_reply;
	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
		/*
		 * Check to see if the subnet part of GID0 has changed. If
		 * not, check the simple case first to see if the pkey
		 * index is the same as before; finally check to see if the
		 * pkey has been relocated to a different index in the table.
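		 * A pkey that has been relocated to a new index currently
		 * forces a complete restart of the interface (see below).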
*/ _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, sizeof (ib_gid_t)) != 0) { new_link_state = LINK_STATE_DOWN; } else if (port_infop->p_pkey_tbl[state->id_pkix] == state->id_pkey) { new_link_state = LINK_STATE_UP; } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { ibt_free_portinfo(port_infop, port_infosz); mutex_exit(&state->id_link_mutex); /* * Currently a restart is required if our pkey has moved * in the pkey table. If we get the ibt_recycle_ud() to * work as documented (expected), we may be able to * avoid a complete restart. Note that we've already * marked both the start and stop 'in-progress' flags, * so it is ok to go ahead and do this restart. */ (void) ibd_undo_start(state, LINK_STATE_DOWN); if ((ret = ibd_start(state)) != 0) { DPRINT(10, "ibd_restart: cannot restart, " "ret=%d", ret); } goto link_mod_return; } else { new_link_state = LINK_STATE_DOWN; } _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) } update_link_state: if (port_infop) { ibt_free_portinfo(port_infop, port_infosz); } /* * If we're reporting a link up, check InitTypeReply to see if * the SM has ensured that the port's presence in mcg, traps, * etc. is intact. */ if (new_link_state == LINK_STATE_DOWN) { opcode = IBD_LINK_DOWN; } else { if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { opcode = IBD_LINK_UP; } else { opcode = IBD_LINK_UP_ABSENT; } } /* * If the old state is the same as the new state, and the SM indicated * no change in the port parameters, nothing to do. */ if ((state->id_link_state == new_link_state) && (opcode != IBD_LINK_UP_ABSENT)) { mutex_exit(&state->id_link_mutex); goto link_mod_return; } /* * Ok, so there was a link state change; see if it's safe to ask * the async thread to do the work */ if (!ibd_async_safe(state)) { state->id_link_state = new_link_state; mutex_exit(&state->id_link_mutex); goto link_mod_return; } mutex_exit(&state->id_link_mutex); /* * Queue up a request for ibd_async_link() to handle this link * state change event */ req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_ptr = (void *)opcode; ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); link_mod_return: ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); } /* * For the port up/down events, IBTL guarantees there will not be concurrent * invocations of the handler. IBTL might coalesce link transition events, * and not invoke the handler for _each_ up/down transition, but it will * invoke the handler with last known state */ static void ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, ibt_async_code_t code, ibt_async_event_t *event) { ibd_state_t *state = (ibd_state_t *)clnt_private; switch (code) { case IBT_ERROR_CATASTROPHIC_CHAN: ibd_print_warn(state, "catastrophic channel error"); break; case IBT_ERROR_CQ: ibd_print_warn(state, "completion queue error"); break; case IBT_PORT_CHANGE_EVENT: /* * Events will be delivered to all instances that have * done ibt_open_hca() but not yet done ibt_close_hca(). * Only need to do work for our port; IBTF will deliver * events for other ports on the hca we have ibt_open_hca'ed * too. Note that id_port is initialized in ibd_attach() * before we do an ibt_open_hca() in ibd_attach(). 
*/ ASSERT(state->id_hca_hdl == hca_hdl); if (state->id_port != event->ev_port) break; if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == IBT_PORT_CHANGE_PKEY) { ibd_link_mod(state, code); } break; case IBT_ERROR_PORT_DOWN: case IBT_CLNT_REREG_EVENT: case IBT_EVENT_PORT_UP: /* * Events will be delivered to all instances that have * done ibt_open_hca() but not yet done ibt_close_hca(). * Only need to do work for our port; IBTF will deliver * events for other ports on the hca we have ibt_open_hca'ed * too. Note that id_port is initialized in ibd_attach() * before we do an ibt_open_hca() in ibd_attach(). */ ASSERT(state->id_hca_hdl == hca_hdl); if (state->id_port != event->ev_port) break; ibd_link_mod(state, code); break; case IBT_HCA_ATTACH_EVENT: case IBT_HCA_DETACH_EVENT: /* * When a new card is plugged to the system, attach_event is * invoked. Additionally, a cfgadm needs to be run to make the * card known to the system, and an ifconfig needs to be run to * plumb up any ibd interfaces on the card. In the case of card * unplug, a cfgadm is run that will trigger any RCM scripts to * unplumb the ibd interfaces on the card; when the card is * actually unplugged, the detach_event is invoked; * additionally, if any ibd instances are still active on the * card (eg there were no associated RCM scripts), driver's * detach routine is invoked. */ break; default: break; } } static int ibd_register_mac(ibd_state_t *state, dev_info_t *dip) { mac_register_t *macp; int ret; if ((macp = mac_alloc(MAC_VERSION)) == NULL) { DPRINT(10, "ibd_register_mac: mac_alloc() failed"); return (DDI_FAILURE); } /* * Note that when we register with mac during attach, we don't * have the id_macaddr yet, so we'll simply be registering a * zero macaddr that we'll overwrite later during plumb (in * ibd_m_start()). Similar is the case with id_mtu - we'll * update the mac layer with the correct mtu during plumb. */ macp->m_type_ident = MAC_PLUGIN_IDENT_IB; macp->m_driver = state; macp->m_dip = dip; macp->m_src_addr = (uint8_t *)&state->id_macaddr; macp->m_callbacks = &ibd_m_callbacks; macp->m_min_sdu = 0; macp->m_multicast_sdu = IBD_DEF_MAX_SDU; if (state->id_type == IBD_PORT_DRIVER) { macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; } else if (state->id_enable_rc) { macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; } else { macp->m_max_sdu = IBD_DEF_MAX_SDU; } macp->m_priv_props = ibd_priv_props; /* * Register ourselves with the GLDv3 interface */ if ((ret = mac_register(macp, &state->id_mh)) != 0) { mac_free(macp); DPRINT(10, "ibd_register_mac: mac_register() failed, ret=%d", ret); return (DDI_FAILURE); } mac_free(macp); return (DDI_SUCCESS); } static int ibd_record_capab(ibd_state_t *state) { ibt_hca_attr_t hca_attrs; ibt_status_t ibt_status; _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) /* * Query the HCA and fetch its attributes */ ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); ASSERT(ibt_status == IBT_SUCCESS); /* * 1. Set the Hardware Checksum capability. Currently we only consider * full checksum offload. */ if (state->id_enable_rc) { state->id_hwcksum_capab = 0; } else { if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; } } /* * 2. 
Set LSO policy, capability and maximum length */ if (state->id_enable_rc) { state->id_lso_capable = B_FALSE; state->id_lso_maxlen = 0; } else { if (hca_attrs.hca_max_lso_size > 0) { state->id_lso_capable = B_TRUE; if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) state->id_lso_maxlen = IBD_LSO_MAXLEN; else state->id_lso_maxlen = hca_attrs.hca_max_lso_size; } else { state->id_lso_capable = B_FALSE; state->id_lso_maxlen = 0; } } /* * 3. Set Reserved L_Key capability */ if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { state->id_hca_res_lkey_capab = 1; state->id_res_lkey = hca_attrs.hca_reserved_lkey; state->rc_enable_iov_map = B_TRUE; } else { /* If no reserved lkey, we will not use ibt_map_mem_iov */ state->rc_enable_iov_map = B_FALSE; } /* * 4. Set maximum sqseg value after checking to see if extended sgl * size information is provided by the hca */ if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; } else { state->id_max_sqseg = hca_attrs.hca_max_sgl; state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; } if (state->id_max_sqseg > IBD_MAX_SQSEG) { state->id_max_sqseg = IBD_MAX_SQSEG; } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { ibd_print_warn(state, "Set #sgl = %d instead of default %d", state->id_max_sqseg, IBD_MAX_SQSEG); } if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { state->rc_tx_max_sqseg = IBD_MAX_SQSEG; } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); } /* * Translating the virtual address regions into physical regions * for using the Reserved LKey feature results in a wr sgl that * is a little longer. Since failing ibt_map_mem_iov() is costly, * we'll fix a high-water mark (65%) for when we should stop. */ state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; /* * 5. Set number of recv and send wqes after checking hca maximum * channel size. Store the max channel size in the state so that it * can be referred to when the swqe/rwqe change is requested via * dladm. */ state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - IBD_RWQE_MIN; if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) return (DDI_SUCCESS); } static int ibd_part_busy(ibd_state_t *state) { if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n"); return (DDI_FAILURE); } if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n"); return (DDI_FAILURE); } /* * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is * connecting to a remote IPoIB port. We can't remove this port. 
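 * A DDI_FAILURE return from this routine tells the caller that the
 * partition is still busy and cannot be removed yet.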
*/ if (state->id_ah_op == IBD_OP_ONGOING) { DPRINT(10, "ibd_part_busy: failed: connecting\n"); return (DDI_FAILURE); } return (DDI_SUCCESS); } static void ibd_part_unattach(ibd_state_t *state) { uint32_t progress = state->id_mac_state; ibt_status_t ret; /* make sure rx resources are freed */ ibd_free_rx_rsrcs(state); if (progress & IBD_DRV_RC_SRQ_ALLOCD) { ASSERT(state->id_enable_rc); ibd_rc_fini_srq_list(state); state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); } if (progress & IBD_DRV_MAC_REGISTERED) { (void) mac_unregister(state->id_mh); state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); } if (progress & IBD_DRV_ASYNC_THR_CREATED) { /* * No new async requests will be posted since the device * link state has been marked as unknown; completion handlers * have been turned off, so Tx handler will not cause any * more IBD_ASYNC_REAP requests. * * Queue a request for the async thread to exit, which will * be serviced after any pending ones. This can take a while, * specially if the SM is unreachable, since IBMF will slowly * timeout each SM request issued by the async thread. Reap * the thread before continuing on, we do not want it to be * lingering in modunloaded code. */ ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); thread_join(state->id_async_thrid); state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); } if (progress & IBD_DRV_REQ_LIST_INITED) { list_destroy(&state->id_req_list); mutex_destroy(&state->id_acache_req_lock); cv_destroy(&state->id_acache_req_cv); state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED; } if (progress & IBD_DRV_PD_ALLOCD) { if ((ret = ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl)) != IBT_SUCCESS) { ibd_print_warn(state, "failed to free " "protection domain, ret=%d", ret); } state->id_pd_hdl = NULL; state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); } if (progress & IBD_DRV_HCA_OPENED) { if ((ret = ibt_close_hca(state->id_hca_hdl)) != IBT_SUCCESS) { ibd_print_warn(state, "failed to close " "HCA device, ret=%d", ret); } state->id_hca_hdl = NULL; state->id_mac_state &= (~IBD_DRV_HCA_OPENED); } mutex_enter(&ibd_gstate.ig_mutex); if (progress & IBD_DRV_IBTL_ATTACH_DONE) { if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { ibd_print_warn(state, "ibt_detach() failed, ret=%d", ret); } state->id_ibt_hdl = NULL; state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); ibd_gstate.ig_ibt_hdl_ref_cnt--; } if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && (ibd_gstate.ig_ibt_hdl != NULL)) { if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { ibd_print_warn(state, "ibt_detach(): global " "failed, ret=%d", ret); } ibd_gstate.ig_ibt_hdl = NULL; } mutex_exit(&ibd_gstate.ig_mutex); if (progress & IBD_DRV_TXINTR_ADDED) { ddi_remove_softintr(state->id_tx); state->id_tx = NULL; state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); } if (progress & IBD_DRV_RXINTR_ADDED) { ddi_remove_softintr(state->id_rx); state->id_rx = NULL; state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); } #ifdef DEBUG if (progress & IBD_DRV_RC_PRIVATE_STATE) { kstat_delete(state->rc_ksp); state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); } #endif if (progress & IBD_DRV_STATE_INITIALIZED) { ibd_state_fini(state); state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); } } int ibd_part_attach(ibd_state_t *state, dev_info_t *dip) { ibt_status_t ret; int rv; kthread_t *kht; /* * Initialize mutexes and condition variables */ if (ibd_state_init(state, dip) != DDI_SUCCESS) { DPRINT(10, "ibd_part_attach: failed in ibd_state_init()"); return (DDI_FAILURE); } state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; /* * 
Allocate rx,tx softintr */ if (ibd_rx_softintr == 1) { if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { DPRINT(10, "ibd_part_attach: failed in " "ddi_add_softintr(id_rx), ret=%d", rv); return (DDI_FAILURE); } state->id_mac_state |= IBD_DRV_RXINTR_ADDED; } if (ibd_tx_softintr == 1) { if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, NULL, NULL, ibd_tx_recycle, (caddr_t)state)) != DDI_SUCCESS) { DPRINT(10, "ibd_part_attach: failed in " "ddi_add_softintr(id_tx), ret=%d", rv); return (DDI_FAILURE); } state->id_mac_state |= IBD_DRV_TXINTR_ADDED; } /* * Attach to IBTL */ mutex_enter(&ibd_gstate.ig_mutex); if (ibd_gstate.ig_ibt_hdl == NULL) { if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_part_attach: global: failed in " "ibt_attach(), ret=%d", ret); mutex_exit(&ibd_gstate.ig_mutex); return (DDI_FAILURE); } } if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, &state->id_ibt_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d", ret); mutex_exit(&ibd_gstate.ig_mutex); return (DDI_FAILURE); } ibd_gstate.ig_ibt_hdl_ref_cnt++; mutex_exit(&ibd_gstate.ig_mutex); state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; /* * Open the HCA */ if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, &state->id_hca_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d", ret); return (DDI_FAILURE); } state->id_mac_state |= IBD_DRV_HCA_OPENED; #ifdef DEBUG /* Initialize Driver Counters for Reliable Connected Mode */ if (state->id_enable_rc) { if (ibd_rc_init_stats(state) != DDI_SUCCESS) { DPRINT(10, "ibd_part_attach: failed in " "ibd_rc_init_stats"); return (DDI_FAILURE); } state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; } #endif /* * Record capabilities */ (void) ibd_record_capab(state); /* * Allocate a protection domain on the HCA */ if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, &state->id_pd_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d", ret); return (DDI_FAILURE); } state->id_mac_state |= IBD_DRV_PD_ALLOCD; /* * We need to initialise the req_list that is required for the * operation of the async_thread. */ mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); list_create(&state->id_req_list, sizeof (ibd_req_t), offsetof(ibd_req_t, rq_list)); state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; /* * Create the async thread; thread_create never fails. */ kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, TS_RUN, minclsyspri); state->id_async_thrid = kht->t_did; state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; return (DDI_SUCCESS); } /* * Attach device to the IO framework. */ static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int ret; switch (cmd) { case DDI_ATTACH: ret = ibd_port_attach(dip); break; default: ret = DDI_FAILURE; break; } return (ret); } /* * Detach device from the IO framework. */ static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { ibd_state_t *state; int instance; /* * IBD doesn't support suspend/resume */ if (cmd != DDI_DETACH) return (DDI_FAILURE); /* * Get the instance softstate */ instance = ddi_get_instance(dip); state = ddi_get_soft_state(ibd_list, instance); /* * Release all resources we're holding still. 
Note that if we'd * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly * so far, we should find all the flags we need in id_mac_state. */ return (ibd_port_unattach(state, dip)); } /* * Pre ibt_attach() driver initialization */ static int ibd_state_init(ibd_state_t *state, dev_info_t *dip) { char buf[64]; mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); state->id_link_state = LINK_STATE_UNKNOWN; mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); state->id_trap_stop = B_TRUE; state->id_trap_inprog = 0; mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); state->id_dip = dip; mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); state->id_tx_busy = 0; mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); state->id_rx_list.dl_bufs_outstanding = 0; state->id_rx_list.dl_cnt = 0; mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip), state->id_pkey, state->id_plinkid); state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* For Reliable Connected Mode */ mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL); /* * Make the default link mode as RC. If this fails during connection * setup, the link mode is automatically transitioned to UD. * Also set the RC MTU. 
 */
	state->id_enable_rc = IBD_DEF_LINK_MODE;
	state->rc_mtu = IBD_DEF_RC_MAX_MTU;
	state->id_mtu = IBD_DEF_MAX_MTU;

	/* Initialize all tunables to default */
	state->id_lso_policy = IBD_DEF_LSO_POLICY;
	state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
	state->id_num_ah = IBD_DEF_NUM_AH;
	state->id_hash_size = IBD_DEF_HASH_SIZE;
	state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
	state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
	state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
	state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
	state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
	state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
	state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
	state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
	state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
	state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
	state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
	state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
	state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
	state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
	state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
	state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
	state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
	state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
	state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
	state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;

	return (DDI_SUCCESS);
}

/*
 * Post ibt_detach() driver deconstruction
 */
static void
ibd_state_fini(ibd_state_t *state)
{
	kmem_cache_destroy(state->id_req_kmc);

	mutex_destroy(&state->id_rx_list.dl_mutex);
	mutex_destroy(&state->id_rx_free_list.dl_mutex);

	mutex_destroy(&state->id_txpost_lock);
	mutex_destroy(&state->id_tx_list.dl_mutex);
	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
	mutex_destroy(&state->id_lso_lock);

	mutex_destroy(&state->id_sched_lock);
	mutex_destroy(&state->id_scq_poll_lock);
	mutex_destroy(&state->id_rcq_poll_lock);

	cv_destroy(&state->id_trap_cv);
	mutex_destroy(&state->id_trap_lock);
	mutex_destroy(&state->id_link_mutex);

	/* For Reliable Connected Mode */
	mutex_destroy(&state->rc_timeout_lock);
	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
	mutex_destroy(&state->rc_tx_large_bufs_lock);
	mutex_destroy(&state->rc_rx_lock);
}

/*
 * Fetch link speed from SA for snmp ifspeed reporting.
 */
static uint64_t
ibd_get_portspeed(ibd_state_t *state)
{
	int ret;
	ibt_path_info_t path;
	ibt_path_attr_t path_attr;
	uint8_t num_paths;
	uint64_t ifspeed;

	/*
	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
	 * 2000000000. Start with that as default.
	 */
	ifspeed = 2000000000;

	bzero(&path_attr, sizeof (path_attr));

	/*
	 * Get the port speed from Loopback path information.
	 */
	path_attr.pa_dgids = &state->id_sgid;
	path_attr.pa_num_dgids = 1;
	path_attr.pa_sgid = state->id_sgid;

	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
		goto earlydone;

	if (num_paths < 1)
		goto earlydone;

	/*
	 * In case SA does not return an expected value, report the default
	 * speed as 1X.
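	 * The switch below simply scales that 1X base rate by the
	 * width/speed multiple derived from the SA-reported static rate.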
*/ ret = 1; switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ ret = 1; break; case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ ret = 4; break; case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ ret = 12; break; case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ ret = 2; break; case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ ret = 8; break; case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ ret = 16; break; case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ ret = 24; break; case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ ret = 32; break; case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ ret = 48; break; } ifspeed *= ret; earlydone: return (ifspeed); } /* * Search input mcg list (id_mc_full or id_mc_non) for an entry * representing the input mcg mgid. */ static ibd_mce_t * ibd_mcache_find(ib_gid_t mgid, struct list *mlist) { ibd_mce_t *ptr = list_head(mlist); /* * Do plain linear search. */ while (ptr != NULL) { if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, sizeof (ib_gid_t)) == 0) return (ptr); ptr = list_next(mlist, ptr); } return (NULL); } /* * Execute IBA JOIN. */ static ibt_status_t ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) { ibt_mcg_attr_t mcg_attr; bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; mcg_attr.mc_mgid = mgid; mcg_attr.mc_join_state = mce->mc_jstate; mcg_attr.mc_scope = state->id_scope; mcg_attr.mc_pkey = state->id_pkey; mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, NULL, NULL)); } /* * This code JOINs the port in the proper way (depending on the join * state) so that IBA fabric will forward mcg packets to/from the port. * It also attaches the QPN to the mcg so it can receive those mcg * packets. This code makes sure not to attach the mcg to the QP if * that has been previously done due to the mcg being joined with a * different join state, even though this is not required by SWG_0216, * refid 3610. */ static ibd_mce_t * ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) { ibt_status_t ibt_status; ibd_mce_t *mce, *tmce, *omce = NULL; boolean_t do_attach = B_TRUE; DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", jstate, mgid.gid_prefix, mgid.gid_guid); /* * For enable_multicast Full member joins, we need to do some * extra work. If there is already an mce on the list that * indicates full membership, that means the membership has * not yet been dropped (since the disable_multicast was issued) * because there are pending Tx's to the mcg; in that case, just * mark the mce not to be reaped when the Tx completion queues * an async reap operation. * * If there is already an mce on the list indicating sendonly * membership, try to promote to full membership. Be careful * not to deallocate the old mce, since there might be an AH * pointing to it; instead, update the old mce with new data * that tracks the full membership. */ if ((jstate == IB_MC_JSTATE_FULL) && ((omce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { if (omce->mc_jstate == IB_MC_JSTATE_FULL) { ASSERT(omce->mc_fullreap); omce->mc_fullreap = B_FALSE; return (omce); } else { ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); } } /* * Allocate the ibd_mce_t to track this JOIN. 
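 * The mce is freed again if either the IBA join or the subsequent
 * QP attach below fails.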
*/ mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); mce->mc_fullreap = B_FALSE; mce->mc_jstate = jstate; if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", ibt_status); kmem_free(mce, sizeof (ibd_mce_t)); return (NULL); } /* * Is an IBA attach required? Not if the interface is already joined * to the mcg in a different appropriate join state. */ if (jstate == IB_MC_JSTATE_NON) { tmce = IBD_MCACHE_FIND_FULL(state, mgid); if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) do_attach = B_FALSE; } else if (jstate == IB_MC_JSTATE_FULL) { if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) do_attach = B_FALSE; } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ do_attach = B_FALSE; } if (do_attach) { /* * Do the IBA attach. */ DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, &mce->mc_info)) != IBT_SUCCESS) { DPRINT(10, "ibd_join_group : failed qp attachment " "%d\n", ibt_status); /* * NOTE that we should probably preserve the join info * in the list and later try to leave again at detach * time. */ (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); kmem_free(mce, sizeof (ibd_mce_t)); return (NULL); } } /* * Insert the ibd_mce_t in the proper list. */ if (jstate == IB_MC_JSTATE_NON) { IBD_MCACHE_INSERT_NON(state, mce); } else { /* * Set up the mc_req fields used for reaping the * mcg in case of delayed tx completion (see * ibd_tx_cleanup()). Also done for sendonly join in * case we are promoted to fullmembership later and * keep using the same mce. */ mce->mc_req.rq_gid = mgid; mce->mc_req.rq_ptr = mce; /* * Check whether this is the case of trying to join * full member, and we were already joined send only. * We try to drop our SendOnly membership, but it is * possible that the mcg does not exist anymore (and * the subnet trap never reached us), so the leave * operation might fail. */ if (omce != NULL) { (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); omce->mc_jstate = IB_MC_JSTATE_FULL; bcopy(&mce->mc_info, &omce->mc_info, sizeof (ibt_mcg_info_t)); kmem_free(mce, sizeof (ibd_mce_t)); return (omce); } mutex_enter(&state->id_mc_mutex); IBD_MCACHE_INSERT_FULL(state, mce); mutex_exit(&state->id_mc_mutex); } return (mce); } /* * Called during port up event handling to attempt to reacquire full * membership to an mcg. Stripped down version of ibd_join_group(). * Note that it is possible that the mcg might have gone away, and * gets recreated at this point. */ static void ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) { ib_gid_t mgid; /* * If the mc_fullreap flag is set, or this join fails, a subsequent * reap/leave is going to try to leave the group. We could prevent * that by adding a boolean flag into ibd_mce_t, if required. 
 */
	if (mce->mc_fullreap)
		return;

	mgid = mce->mc_info.mc_adds_vect.av_dgid;
	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
	    mgid.gid_guid);

	/* While reacquiring, leave and then join the MCG */
	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
	    mce->mc_jstate);
	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
		ibd_print_warn(state, "Failure on port up to rejoin "
		    "multicast gid %016llx:%016llx",
		    (u_longlong_t)mgid.gid_prefix,
		    (u_longlong_t)mgid.gid_guid);
}

/*
 * This code handles delayed Tx completion cleanups for mcg's to which
 * disable_multicast has been issued, regular mcg related cleanups during
 * disable_multicast, disable_promiscuous and mcg traps, as well as
 * cleanups during driver detach time. Depending on the join state,
 * it deletes the mce from the appropriate list and issues the IBA
 * leave/detach; except in the disable_multicast case when the mce
 * is left on the active list for a subsequent Tx completion cleanup.
 */
static void
ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
    uint8_t jstate)
{
	ibd_mce_t *tmce;
	boolean_t do_detach = B_TRUE;

	/*
	 * Before detaching, we must check whether the other list
	 * contains the mcg; if we detach blindly, the consumer
	 * who set up the other list will also stop receiving
	 * traffic.
	 */
	if (jstate == IB_MC_JSTATE_FULL) {
		/*
		 * The following check is only relevant while coming
		 * from the Tx completion path in the reap case.
		 */
		if (!mce->mc_fullreap)
			return;
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_PULLOUT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
			do_detach = B_FALSE;
	} else if (jstate == IB_MC_JSTATE_NON) {
		IBD_MCACHE_PULLOUT_NON(state, mce);
		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
			do_detach = B_FALSE;
	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_PULLOUT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
		do_detach = B_FALSE;
	}

	/*
	 * If we are reacting to a mcg trap and leaving our sendonly or
	 * non membership, the mcg is possibly already gone, so attempting
	 * to leave might fail. On the other hand, we must try to leave
	 * anyway, since this might be a trap from long ago, and we could
	 * have potentially sendonly joined to a recent incarnation of
	 * the mcg and are about to lose track of this information.
	 */
	if (do_detach) {
		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
	}

	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
	kmem_free(mce, sizeof (ibd_mce_t));
}

/*
 * Async code executed due to multicast and promiscuous disable requests
 * and mcg trap handling; also executed during driver detach. Mostly, a
 * leave and detach is done; except for the fullmember case when Tx
 * requests are pending, whence arrangements are made for subsequent
 * cleanup on Tx completion.
 */
static void
ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
	ipoib_mac_t mcmac;
	boolean_t recycled;
	ibd_mce_t *mce;

	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
	    jstate, mgid.gid_prefix, mgid.gid_guid);

	if (jstate == IB_MC_JSTATE_NON) {
		recycled = B_TRUE;
		mce = IBD_MCACHE_FIND_NON(state, mgid);
		/*
		 * In case we are handling a mcg trap, we might not find
		 * the mcg in the non list.
*/ if (mce == NULL) { return; } } else { mce = IBD_MCACHE_FIND_FULL(state, mgid); /* * In case we are handling a mcg trap, make sure the trap * is not arriving late; if we have an mce that indicates * that we are already a fullmember, that would be a clear * indication that the trap arrived late (ie, is for a * previous incarnation of the mcg). */ if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { if ((mce == NULL) || (mce->mc_jstate == IB_MC_JSTATE_FULL)) { return; } } else { ASSERT(jstate == IB_MC_JSTATE_FULL); /* * If join group failed, mce will be NULL here. * This is because in GLDv3 driver, set multicast * will always return success. */ if (mce == NULL) { return; } mce->mc_fullreap = B_TRUE; } /* * If no pending Tx's remain that reference the AH * for the mcg, recycle it from active to free list. * Else in the IB_MC_JSTATE_FULL case, just mark the AH, * so the last completing Tx will cause an async reap * operation to be invoked, at which time we will drop our * membership to the mcg so that the pending Tx's complete * successfully. Refer to comments on "AH and MCE active * list manipulation" at top of this file. The lock protects * against Tx fast path and Tx cleanup code. */ mutex_enter(&state->id_ac_mutex); ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); recycled = ibd_acache_recycle(state, &mcmac, (jstate == IB_MC_JSTATE_SEND_ONLY_NON)); mutex_exit(&state->id_ac_mutex); } if (recycled) { DPRINT(2, "ibd_leave_group : leave_group reaping : " "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); ibd_async_reap_group(state, mce, mgid, jstate); } } /* * Find the broadcast address as defined by IPoIB; implicitly * determines the IBA scope, mtu, tclass etc of the link the * interface is going to be a member of. */ static ibt_status_t ibd_find_bgroup(ibd_state_t *state) { ibt_mcg_attr_t mcg_attr; uint_t numg; uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, IB_MC_SCOPE_GLOBAL }; int i, mcgmtu; boolean_t found = B_FALSE; int ret; ibt_mcg_info_t mcg_info; state->id_bgroup_created = B_FALSE; state->id_bgroup_present = B_FALSE; query_bcast_grp: bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_pkey = state->id_pkey; _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { state->id_scope = mcg_attr.mc_scope = scopes[i]; /* * Look for the IPoIB broadcast group. */ _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) state->id_mgid.gid_prefix = (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | ((uint64_t)state->id_scope << 48) | ((uint32_t)(state->id_pkey << 16))); mcg_attr.mc_mgid = state->id_mgid; _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, &state->id_mcinfo, &numg) == IBT_SUCCESS) { found = B_TRUE; break; } } if (!found) { if (state->id_create_broadcast_group) { /* * If we created the broadcast group, but failed to * find it, we can't do anything except leave the * one we created and return failure. */ if (state->id_bgroup_created) { ibd_print_warn(state, "IPoIB broadcast group " "absent. 
Unable to query after create."); goto find_bgroup_fail; } /* * Create the ipoib broadcast group if it didn't exist */ bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; mcg_attr.mc_pkey = state->id_pkey; mcg_attr.mc_flow = 0; mcg_attr.mc_sl = 0; mcg_attr.mc_tclass = 0; _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) state->id_mgid.gid_prefix = (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | ((uint32_t)(state->id_pkey << 16))); mcg_attr.mc_mgid = state->id_mgid; _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, &mcg_info, NULL, NULL)) != IBT_SUCCESS) { ibd_print_warn(state, "IPoIB broadcast group " "absent, create failed: ret = %d\n", ret); state->id_bgroup_created = B_FALSE; return (IBT_FAILURE); } state->id_bgroup_created = B_TRUE; goto query_bcast_grp; } else { ibd_print_warn(state, "IPoIB broadcast group absent"); return (IBT_FAILURE); } } /* * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. */ mcgmtu = (128 << state->id_mcinfo->mc_mtu); if (state->id_mtu < mcgmtu) { ibd_print_warn(state, "IPoIB broadcast group MTU %d " "greater than port's maximum MTU %d", mcgmtu, state->id_mtu); ibt_free_mcg_info(state->id_mcinfo, 1); goto find_bgroup_fail; } state->id_mtu = mcgmtu; state->id_bgroup_present = B_TRUE; return (IBT_SUCCESS); find_bgroup_fail: if (state->id_bgroup_created) { (void) ibt_leave_mcg(state->id_sgid, mcg_info.mc_adds_vect.av_dgid, state->id_sgid, IB_MC_JSTATE_FULL); } return (IBT_FAILURE); } static int ibd_alloc_tx_copybufs(ibd_state_t *state) { ibt_mr_attr_t mem_attr; /* * Allocate one big chunk for all regular tx copy bufs */ state->id_tx_buf_sz = state->id_mtu; if (state->id_lso_policy && state->id_lso_capable && (state->id_ud_tx_copy_thresh > state->id_mtu)) { state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; } state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * state->id_tx_buf_sz, KM_SLEEP); state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * sizeof (ibd_swqe_t), KM_SLEEP); /* * Do one memory registration on the entire txbuf area */ mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * sizeof (ibd_swqe_t)); kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * state->id_tx_buf_sz); state->id_tx_bufs = NULL; return (DDI_FAILURE); } return (DDI_SUCCESS); } static int ibd_alloc_tx_lsobufs(ibd_state_t *state) { ibt_mr_attr_t mem_attr; ibd_lsobuf_t *buflist; ibd_lsobuf_t *lbufp; ibd_lsobuf_t *tail; ibd_lsobkt_t *bktp; uint8_t *membase; uint8_t *memp; uint_t memsz; int i; /* * Allocate the lso bucket */ bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); /* * Allocate the entire lso memory and register it */ memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; membase = kmem_zalloc(memsz, KM_SLEEP); mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; mem_attr.mr_len = memsz; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_tx_lsobufs: 
ibt_register_mr failed"); kmem_free(membase, memsz); kmem_free(bktp, sizeof (ibd_lsobkt_t)); return (DDI_FAILURE); } mutex_enter(&state->id_lso_lock); /* * Now allocate the buflist. Note that the elements in the buflist and * the buffers in the lso memory have a permanent 1-1 relation, so we * can always derive the address of a buflist entry from the address of * an lso buffer. */ buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), KM_SLEEP); /* * Set up the lso buf chain */ memp = membase; lbufp = buflist; for (i = 0; i < state->id_num_lso_bufs; i++) { lbufp->lb_isfree = 1; lbufp->lb_buf = memp; lbufp->lb_next = lbufp + 1; tail = lbufp; memp += IBD_LSO_BUFSZ; lbufp++; } tail->lb_next = NULL; /* * Set up the LSO buffer information in ibd state */ bktp->bkt_bufl = buflist; bktp->bkt_free_head = buflist; bktp->bkt_mem = membase; bktp->bkt_nelem = state->id_num_lso_bufs; bktp->bkt_nfree = bktp->bkt_nelem; state->id_lso = bktp; mutex_exit(&state->id_lso_lock); return (DDI_SUCCESS); } /* * Statically allocate Tx buffer list(s). */ static int ibd_init_txlist(ibd_state_t *state) { ibd_swqe_t *swqe; ibt_lkey_t lkey; int i; uint_t len; uint8_t *bufaddr; if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) return (DDI_FAILURE); if (state->id_lso_policy && state->id_lso_capable) { if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) state->id_lso_capable = B_FALSE; } mutex_enter(&state->id_tx_list.dl_mutex); state->id_tx_list.dl_head = NULL; state->id_tx_list.dl_pending_sends = B_FALSE; state->id_tx_list.dl_cnt = 0; mutex_exit(&state->id_tx_list.dl_mutex); mutex_enter(&state->id_tx_rel_list.dl_mutex); state->id_tx_rel_list.dl_head = NULL; state->id_tx_rel_list.dl_pending_sends = B_FALSE; state->id_tx_rel_list.dl_cnt = 0; mutex_exit(&state->id_tx_rel_list.dl_mutex); /* * Allocate and setup the swqe list */ lkey = state->id_tx_mr_desc.md_lkey; bufaddr = state->id_tx_bufs; len = state->id_tx_buf_sz; swqe = state->id_tx_wqes; mutex_enter(&state->id_tx_list.dl_mutex); for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { swqe->swqe_next = NULL; swqe->swqe_im_mblk = NULL; swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) bufaddr; swqe->swqe_copybuf.ic_sgl.ds_key = lkey; swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; swqe->w_swr.wr_trans = IBT_UD_SRV; /* These are set in send */ swqe->w_swr.wr_nds = 0; swqe->w_swr.wr_sgl = NULL; swqe->w_swr.wr_opcode = IBT_WRC_SEND; /* add to list */ state->id_tx_list.dl_cnt++; swqe->swqe_next = state->id_tx_list.dl_head; state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); } mutex_exit(&state->id_tx_list.dl_mutex); return (DDI_SUCCESS); } static int ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, uint32_t *nds_p) { ibd_lsobkt_t *bktp; ibd_lsobuf_t *lbufp; ibd_lsobuf_t *nextp; ibt_lkey_t lso_lkey; uint_t frag_sz; uint_t num_needed; int i; ASSERT(sgl_p != NULL); ASSERT(nds_p != NULL); ASSERT(req_sz != 0); /* * Determine how many bufs we'd need for the size requested */ num_needed = req_sz / IBD_LSO_BUFSZ; if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) num_needed++; mutex_enter(&state->id_lso_lock); /* * If we don't have enough lso bufs, return failure */ ASSERT(state->id_lso != NULL); bktp = state->id_lso; if (bktp->bkt_nfree < num_needed) { mutex_exit(&state->id_lso_lock); return (-1); } /* * Pick the first 'num_needed' bufs from the free list */ lso_lkey = bktp->bkt_mr_desc.md_lkey; lbufp = bktp->bkt_free_head; for (i = 0; i 
< num_needed; i++) { ASSERT(lbufp->lb_isfree != 0); ASSERT(lbufp->lb_buf != NULL); nextp = lbufp->lb_next; sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; sgl_p[i].ds_key = lso_lkey; sgl_p[i].ds_len = IBD_LSO_BUFSZ; lbufp->lb_isfree = 0; lbufp->lb_next = NULL; lbufp = nextp; } bktp->bkt_free_head = lbufp; /* * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need * to adjust the last sgl entry's length. Since we know we need atleast * one, the i-1 use below is ok. */ if (frag_sz) { sgl_p[i-1].ds_len = frag_sz; } /* * Update nfree count and return */ bktp->bkt_nfree -= num_needed; mutex_exit(&state->id_lso_lock); *nds_p = num_needed; return (0); } static void ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) { ibd_lsobkt_t *bktp; ibd_lsobuf_t *lbufp; uint8_t *lso_mem_end; uint_t ndx; int i; mutex_enter(&state->id_lso_lock); bktp = state->id_lso; ASSERT(bktp != NULL); lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; for (i = 0; i < nds; i++) { uint8_t *va; va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); /* * Figure out the buflist element this sgl buffer corresponds * to and put it back at the head */ ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; lbufp = bktp->bkt_bufl + ndx; ASSERT(lbufp->lb_isfree == 0); ASSERT(lbufp->lb_buf == va); lbufp->lb_isfree = 1; lbufp->lb_next = bktp->bkt_free_head; bktp->bkt_free_head = lbufp; } bktp->bkt_nfree += nds; mutex_exit(&state->id_lso_lock); } static void ibd_free_tx_copybufs(ibd_state_t *state) { /* * Unregister txbuf mr */ if (ibt_deregister_mr(state->id_hca_hdl, state->id_tx_mr_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); } state->id_tx_mr_hdl = NULL; /* * Free txbuf memory */ kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * sizeof (ibd_swqe_t)); kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * state->id_tx_buf_sz); state->id_tx_wqes = NULL; state->id_tx_bufs = NULL; } static void ibd_free_tx_lsobufs(ibd_state_t *state) { ibd_lsobkt_t *bktp; mutex_enter(&state->id_lso_lock); if ((bktp = state->id_lso) == NULL) { mutex_exit(&state->id_lso_lock); return; } /* * First, free the buflist */ ASSERT(bktp->bkt_bufl != NULL); kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); /* * Unregister the LSO memory and free it */ ASSERT(bktp->bkt_mr_hdl != NULL); if (ibt_deregister_mr(state->id_hca_hdl, bktp->bkt_mr_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_free_lsobufs: ibt_deregister_mr failed"); } ASSERT(bktp->bkt_mem); kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); /* * Finally free the bucket */ kmem_free(bktp, sizeof (ibd_lsobkt_t)); state->id_lso = NULL; mutex_exit(&state->id_lso_lock); } /* * Free the statically allocated Tx buffer list. */ static void ibd_fini_txlist(ibd_state_t *state) { /* * Free the allocated swqes */ mutex_enter(&state->id_tx_list.dl_mutex); mutex_enter(&state->id_tx_rel_list.dl_mutex); state->id_tx_list.dl_head = NULL; state->id_tx_list.dl_pending_sends = B_FALSE; state->id_tx_list.dl_cnt = 0; state->id_tx_rel_list.dl_head = NULL; state->id_tx_rel_list.dl_pending_sends = B_FALSE; state->id_tx_rel_list.dl_cnt = 0; mutex_exit(&state->id_tx_rel_list.dl_mutex); mutex_exit(&state->id_tx_list.dl_mutex); ibd_free_tx_lsobufs(state); ibd_free_tx_copybufs(state); } /* * post a list of rwqes, NULL terminated. 
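 * Work requests are handed to the hardware in batches of up to
 * IBD_RX_POST_CNT entries.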
 */
static void
ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	uint_t		i;
	uint_t		num_posted;
	ibt_status_t	ibt_status;
	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];

	while (rwqe) {
		/* Post up to IBD_RX_POST_CNT receive work requests */
		for (i = 0; i < IBD_RX_POST_CNT; i++) {
			wrs[i] = rwqe->w_rwr;
			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
			if (rwqe == NULL) {
				i++;
				break;
			}
		}

		/*
		 * If posting fails for some reason, we'll never receive
		 * completion intimation, so we'll need to clean up. But
		 * we need to make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * therefore the first 'num_posted' entries don't need
		 * cleanup here.
		 */
		atomic_add_32(&state->id_rx_list.dl_cnt, i);

		num_posted = 0;
		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
		    &num_posted);
		if (ibt_status != IBT_SUCCESS) {
			/* This cannot happen unless the device has an error. */
			ibd_print_warn(state, "ibd_post_recv: FATAL: "
			    "posting multiple wrs failed: "
			    "requested=%d, done=%d, ret=%d",
			    IBD_RX_POST_CNT, num_posted, ibt_status);
			atomic_add_32(&state->id_rx_list.dl_cnt,
			    num_posted - i);
		}
	}
}

/*
 * Grab a list of rwqes from the array of lists, and post the list.
 */
static void
ibd_post_recv_intr(ibd_state_t *state)
{
	ibd_rx_queue_t	*rxp;
	ibd_rwqe_t	*list;

	/* rotate through the rx_queue array, expecting an adequate number */
	state->id_rx_post_queue_index =
	    (state->id_rx_post_queue_index + 1) &
	    (state->id_rx_nqueues - 1);

	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
	mutex_enter(&rxp->rx_post_lock);
	list = WQE_TO_RWQE(rxp->rx_head);
	rxp->rx_head = NULL;
	rxp->rx_cnt = 0;
	mutex_exit(&rxp->rx_post_lock);
	ibd_post_recv_list(state, list);
}

/* macro explained below */
#define	RX_QUEUE_HASH(rwqe) \
	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))

/*
 * Add a rwqe to one of the Rx lists. If the list is large enough
 * (roughly IBD_RX_POST_CNT), post the list to the hardware.
 *
 * Note: one of 2^N lists is chosen via a hash. This is done
 * because using one list is contentious. If the first list is busy
 * (mutex_tryenter fails), use a second list (just call mutex_enter).
 *
 * The number 8 in RX_QUEUE_HASH is a random choice that provides
 * even distribution of mapping rwqes to the 2^N queues.
 */
static void
ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	ibd_rx_queue_t	*rxp;

	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);

	if (!mutex_tryenter(&rxp->rx_post_lock)) {
		/* Failed. Try a different queue ("ptr + 16" ensures that).
*/ rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); mutex_enter(&rxp->rx_post_lock); } rwqe->rwqe_next = rxp->rx_head; if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); /* only call ibt_post_recv() every Nth time through here */ if ((active & (state->id_rx_nqueues - 1)) == 0) { rxp->rx_head = NULL; rxp->rx_cnt = 0; mutex_exit(&rxp->rx_post_lock); ibd_post_recv_list(state, rwqe); return; } } rxp->rx_head = RWQE_TO_WQE(rwqe); mutex_exit(&rxp->rx_post_lock); } static int ibd_alloc_rx_copybufs(ibd_state_t *state) { ibt_mr_attr_t mem_attr; int i; /* * Allocate one big chunk for all regular rx copy bufs */ state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * state->id_rx_buf_sz, KM_SLEEP); state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * sizeof (ibd_rwqe_t), KM_SLEEP); state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * sizeof (ibd_rx_queue_t), KM_SLEEP); for (i = 0; i < state->id_rx_nqueues; i++) { ibd_rx_queue_t *rxp = state->id_rx_queues + i; mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); } /* * Do one memory registration on the entire rxbuf area */ mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * state->id_rx_buf_sz); state->id_rx_bufs = NULL; state->id_rx_wqes = NULL; return (DDI_FAILURE); } return (DDI_SUCCESS); } /* * Allocate the statically allocated Rx buffer list. */ static int ibd_init_rxlist(ibd_state_t *state) { ibd_rwqe_t *rwqe, *next; ibd_wqe_t *list; ibt_lkey_t lkey; int i; uint_t len; uint8_t *bufaddr; mutex_enter(&state->id_rx_free_list.dl_mutex); if (state->id_rx_free_list.dl_head != NULL) { /* rx rsrcs were never freed. Just repost them */ len = state->id_rx_buf_sz; list = state->id_rx_free_list.dl_head; state->id_rx_free_list.dl_head = NULL; state->id_rx_free_list.dl_cnt = 0; mutex_exit(&state->id_rx_free_list.dl_mutex); for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { if ((rwqe->rwqe_im_mblk = desballoc( rwqe->rwqe_copybuf.ic_bufaddr, len, 0, &rwqe->w_freemsg_cb)) == NULL) { /* allow freemsg_cb to free the rwqes */ if (atomic_dec_32_nv(&state->id_running) != 0) { cmn_err(CE_WARN, "ibd_init_rxlist: " "id_running was not 1\n"); } DPRINT(10, "ibd_init_rxlist : " "failed in desballoc()"); for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; rwqe = next) { next = WQE_TO_RWQE(rwqe->rwqe_next); if (rwqe->rwqe_im_mblk) { atomic_inc_32(&state-> id_rx_list. 
dl_bufs_outstanding); freemsg(rwqe->rwqe_im_mblk); } else ibd_free_rwqe(state, rwqe); } atomic_inc_32(&state->id_running); return (DDI_FAILURE); } } ibd_post_recv_list(state, WQE_TO_RWQE(list)); return (DDI_SUCCESS); } mutex_exit(&state->id_rx_free_list.dl_mutex); if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) return (DDI_FAILURE); /* * Allocate and setup the rwqe list */ len = state->id_rx_buf_sz; lkey = state->id_rx_mr_desc.md_lkey; rwqe = state->id_rx_wqes; bufaddr = state->id_rx_bufs; list = NULL; for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { rwqe->w_state = state; rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; rwqe->w_freemsg_cb.free_arg = (char *)rwqe; rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, &rwqe->w_freemsg_cb)) == NULL) { DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); /* allow freemsg_cb to free the rwqes */ if (atomic_dec_32_nv(&state->id_running) != 0) { cmn_err(CE_WARN, "ibd_init_rxlist: " "id_running was not 1\n"); } DPRINT(10, "ibd_init_rxlist : " "failed in desballoc()"); for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; rwqe = next) { next = WQE_TO_RWQE(rwqe->rwqe_next); freemsg(rwqe->rwqe_im_mblk); } atomic_inc_32(&state->id_running); /* remove reference to free'd rwqes */ mutex_enter(&state->id_rx_free_list.dl_mutex); state->id_rx_free_list.dl_head = NULL; state->id_rx_free_list.dl_cnt = 0; mutex_exit(&state->id_rx_free_list.dl_mutex); ibd_fini_rxlist(state); return (DDI_FAILURE); } rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; rwqe->rwqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)bufaddr; rwqe->rwqe_copybuf.ic_sgl.ds_len = len; rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; rwqe->w_rwr.wr_nds = 1; rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; rwqe->rwqe_next = list; list = RWQE_TO_WQE(rwqe); } ibd_post_recv_list(state, WQE_TO_RWQE(list)); return (DDI_SUCCESS); } static void ibd_free_rx_copybufs(ibd_state_t *state) { int i; /* * Unregister rxbuf mr */ if (ibt_deregister_mr(state->id_hca_hdl, state->id_rx_mr_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); } state->id_rx_mr_hdl = NULL; /* * Free rxbuf memory */ for (i = 0; i < state->id_rx_nqueues; i++) { ibd_rx_queue_t *rxp = state->id_rx_queues + i; mutex_destroy(&rxp->rx_post_lock); } kmem_free(state->id_rx_queues, state->id_rx_nqueues * sizeof (ibd_rx_queue_t)); kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * state->id_rx_buf_sz); state->id_rx_queues = NULL; state->id_rx_wqes = NULL; state->id_rx_bufs = NULL; } static void ibd_free_rx_rsrcs(ibd_state_t *state) { mutex_enter(&state->id_rx_free_list.dl_mutex); if (state->id_rx_free_list.dl_head == NULL) { /* already freed */ mutex_exit(&state->id_rx_free_list.dl_mutex); return; } ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); ibd_free_rx_copybufs(state); state->id_rx_free_list.dl_cnt = 0; state->id_rx_free_list.dl_head = NULL; mutex_exit(&state->id_rx_free_list.dl_mutex); } /* * Free the statically allocated Rx buffer list. 
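 * Any mblks still sitting on the per-queue post lists are freed here;
 * the rx buffers themselves are released only once all loaned-up
 * buffers have been returned (ie, dl_bufs_outstanding is zero).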
*/ static void ibd_fini_rxlist(ibd_state_t *state) { ibd_rwqe_t *rwqe; int i; /* run through the rx_queue's, calling freemsg() */ for (i = 0; i < state->id_rx_nqueues; i++) { ibd_rx_queue_t *rxp = state->id_rx_queues + i; mutex_enter(&rxp->rx_post_lock); for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { freemsg(rwqe->rwqe_im_mblk); rxp->rx_cnt--; } rxp->rx_head = NULL; mutex_exit(&rxp->rx_post_lock); } /* cannot free rx resources unless gld returned everything */ if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) ibd_free_rx_rsrcs(state); } /* * Free an allocated recv wqe. */ /* ARGSUSED */ static void ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) { /* * desballoc() failed (no memory). * * This rwqe is placed on a free list so that it * can be reinstated when memory is available. * * NOTE: no code currently exists to reinstate * these "lost" rwqes. */ mutex_enter(&state->id_rx_free_list.dl_mutex); state->id_rx_free_list.dl_cnt++; rwqe->rwqe_next = state->id_rx_free_list.dl_head; state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); mutex_exit(&state->id_rx_free_list.dl_mutex); } /* * IBA Rx completion queue handler. Guaranteed to be single * threaded and nonreentrant for this CQ. */ /* ARGSUSED */ static void ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibd_state_t *state = (ibd_state_t *)arg; atomic_inc_64(&state->id_num_intrs); if (ibd_rx_softintr == 1) { mutex_enter(&state->id_rcq_poll_lock); if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; mutex_exit(&state->id_rcq_poll_lock); return; } else { mutex_exit(&state->id_rcq_poll_lock); ddi_trigger_softintr(state->id_rx); } } else (void) ibd_intr((caddr_t)state); } /* * CQ handler for Tx completions, when the Tx CQ is in * interrupt driven mode. */ /* ARGSUSED */ static void ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibd_state_t *state = (ibd_state_t *)arg; atomic_inc_64(&state->id_num_intrs); if (ibd_tx_softintr == 1) { mutex_enter(&state->id_scq_poll_lock); if (state->id_scq_poll_busy & IBD_CQ_POLLING) { state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; mutex_exit(&state->id_scq_poll_lock); return; } else { mutex_exit(&state->id_scq_poll_lock); ddi_trigger_softintr(state->id_tx); } } else (void) ibd_tx_recycle((caddr_t)state); } /* * Multicast group create/delete trap handler. These will be delivered * on a kernel thread (handling can thus block) and can be invoked * concurrently. The handler can be invoked anytime after it is * registered and before ibt_detach(). */ /* ARGSUSED */ static void ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, ibt_subnet_event_t *event) { ibd_state_t *state = (ibd_state_t *)arg; ibd_req_t *req; /* * The trap handler will get invoked once for every event for * every port. The input "gid" is the GID0 of the port the * trap came in on; we just need to act on traps that came * to our port, meaning the port on which the ipoib interface * resides. Since ipoib uses GID0 of the port, we just match * the gids to check whether we need to handle the trap. */ _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) return; _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) DPRINT(10, "ibd_notices_handler : %d\n", code); switch (code) { case IBT_SM_EVENT_UNAVAILABLE: /* * If we are in promiscuous mode or have * sendnonmembers, we need to print a warning * message right now. 
Else, just store the * information, print when we enter promiscuous * mode or attempt nonmember send. We might * also want to stop caching sendnonmember. */ ibd_print_warn(state, "IBA multicast support " "degraded due to unavailability of multicast " "traps"); break; case IBT_SM_EVENT_AVAILABLE: /* * If we printed a warning message above or * while trying to nonmember send or get into * promiscuous mode, print an okay message. */ ibd_print_warn(state, "IBA multicast support " "restored due to availability of multicast " "traps"); break; case IBT_SM_EVENT_MCG_CREATED: case IBT_SM_EVENT_MCG_DELETED: /* * If it is a "deleted" event and we are in late hca * init, nothing to do. */ if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == IBD_DRV_IN_LATE_HCA_INIT) && (code == IBT_SM_EVENT_MCG_DELETED)) { break; } /* * Common processing of creation/deletion traps. * First check if the instance is being * [de]initialized; back off then, without doing * anything more, since we are not sure if the * async thread is around, or whether we might * be racing with the detach code in ibd_m_stop() * that scans the mcg list. */ if (!ibd_async_safe(state)) return; req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_gid = event->sm_notice_gid; req->rq_ptr = (void *)code; ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); break; } } static void ibd_async_trap(ibd_state_t *state, ibd_req_t *req) { ib_gid_t mgid = req->rq_gid; ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; int ret; ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; DPRINT(10, "ibd_async_trap : %d\n", code); /* * Check if we have already joined the IPoIB broadcast group for our * PKEY. If joined, perform the rest of the operation. * Else, the interface is not initialised. Do the initialisation here * by calling ibd_start() and return. */ if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && (code == IBT_SM_EVENT_MCG_CREATED)) { /* * If we are in late HCA init and a notification for the * creation of a MCG came in, check if it is the IPoIB MCG for * this pkey. If not, return. */ if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != state->id_pkey)) { ibd_async_done(state); return; } ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); /* * Check if there is still a necessity to start the interface. * It is possible that the user attempted unplumb at just about * the same time, and if unplumb succeeded, we have nothing to * do. */ if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == IBD_DRV_IN_LATE_HCA_INIT) && ((ret = ibd_start(state)) != 0)) { DPRINT(10, "ibd_async_trap: cannot start from late HCA " "init, ret=%d", ret); } ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); ibd_async_done(state); return; } /* * Atomically search the nonmember and sendonlymember lists and * delete. */ ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); if (state->id_prom_op == IBD_OP_COMPLETED) { ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); /* * If in promiscuous mode, try to join/attach to the new * mcg. Given the unreliable out-of-order mode of trap * delivery, we can never be sure whether it is a problem * if the join fails. Thus, we warn the admin of a failure * if this was a creation trap. Note that the trap might * actually be reporting a long past event, and the mcg * might already have been deleted, thus we might be warning * in vain. 
*/ if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) ibd_print_warn(state, "IBA promiscuous mode missed " "new multicast gid %016llx:%016llx", (u_longlong_t)mgid.gid_prefix, (u_longlong_t)mgid.gid_guid); } /* * Free the request slot allocated by the subnet event thread. */ ibd_async_done(state); } /* * GLDv3 entry point to get capabilities. */ static boolean_t ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { ibd_state_t *state = arg; if (state->id_type == IBD_PORT_DRIVER) return (B_FALSE); switch (cap) { case MAC_CAPAB_HCKSUM: { uint32_t *txflags = cap_data; /* * We either do full checksum or not do it at all */ if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; else return (B_FALSE); break; } case MAC_CAPAB_LSO: { mac_capab_lso_t *cap_lso = cap_data; /* * In addition to the capability and policy, since LSO * relies on hw checksum, we'll not enable LSO if we * don't have hw checksum. Of course, if the HCA doesn't * provide the reserved lkey capability, enabling LSO will * actually affect performance adversely, so we'll disable * LSO even for that case. */ if (!state->id_lso_policy || !state->id_lso_capable) return (B_FALSE); if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) return (B_FALSE); if (state->id_hca_res_lkey_capab == 0) { ibd_print_warn(state, "no reserved-lkey capability, " "disabling LSO"); return (B_FALSE); } cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; break; } default: return (B_FALSE); } return (B_TRUE); } /* * callback function for set/get of properties */ static int ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, const void *pr_val) { ibd_state_t *state = arg; int err = 0; uint32_t link_mode; /* Cannot set properties on a port driver */ if (state->id_type == IBD_PORT_DRIVER) { return (ENOTSUP); } switch (pr_num) { case MAC_PROP_IB_LINKMODE: if (state->id_mac_state & IBD_DRV_STARTED) { err = EBUSY; break; } if (pr_val == NULL) { err = EINVAL; break; } bcopy(pr_val, &link_mode, sizeof (link_mode)); if (link_mode != IBD_LINK_MODE_UD && link_mode != IBD_LINK_MODE_RC) { err = EINVAL; } else { if (link_mode == IBD_LINK_MODE_RC) { if (state->id_enable_rc) { return (0); } state->id_enable_rc = 1; /* inform MAC framework of new MTU */ err = mac_maxsdu_update2(state->id_mh, state->rc_mtu - IPOIB_HDRSIZE, state->id_mtu - IPOIB_HDRSIZE); } else { if (!state->id_enable_rc) { return (0); } state->id_enable_rc = 0; err = mac_maxsdu_update2(state->id_mh, state->id_mtu - IPOIB_HDRSIZE, state->id_mtu - IPOIB_HDRSIZE); } (void) ibd_record_capab(state); mac_capab_update(state->id_mh); } break; case MAC_PROP_PRIVATE: err = ibd_set_priv_prop(state, pr_name, pr_valsize, pr_val); break; default: err = ENOTSUP; break; } return (err); } static int ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, void *pr_val) { ibd_state_t *state = arg; int err = 0; switch (pr_num) { case MAC_PROP_MTU: break; default: if (state->id_type == IBD_PORT_DRIVER) { return (ENOTSUP); } break; } switch (pr_num) { case MAC_PROP_IB_LINKMODE: *(uint_t *)pr_val = state->id_enable_rc; break; case MAC_PROP_PRIVATE: err = ibd_get_priv_prop(state, pr_name, pr_valsize, pr_val); break; default: err = ENOTSUP; break; } return (err); } static void ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { ibd_state_t *state = arg; switch 
(pr_num) { case MAC_PROP_IB_LINKMODE: { mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); break; } case MAC_PROP_MTU: { uint32_t min, max; if (state->id_type == IBD_PORT_DRIVER) { min = 1500; max = IBD_DEF_RC_MAX_SDU; } else if (state->id_enable_rc) { min = max = IBD_DEF_RC_MAX_SDU; } else { min = max = state->id_mtu - IPOIB_HDRSIZE; } mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_range_uint32(prh, min, max); break; } case MAC_PROP_PRIVATE: { char valstr[64]; int value; if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); return; } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { value = IBD_DEF_COALESCE_COMPLETIONS; } else if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { value = IBD_DEF_CREATE_BCAST_GROUP; } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { value = IBD_DEF_HASH_SIZE; } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { value = IBD_DEF_LSO_POLICY; } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { value = IBD_DEF_NUM_AH; } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { value = IBD_DEF_NUM_LSO_BUFS; } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { value = IBD_DEF_RC_ENABLE_SRQ; } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { value = IBD_DEF_RC_NUM_RWQE; } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { value = IBD_DEF_RC_NUM_SRQ; } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { value = IBD_DEF_RC_NUM_SWQE; } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { value = IBD_DEF_RC_RX_COMP_COUNT; } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { value = IBD_DEF_RC_RX_COMP_USEC; } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { value = IBD_DEF_RC_RX_COPY_THRESH; } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { value = IBD_DEF_RC_RX_RWQE_THRESH; } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { value = IBD_DEF_RC_TX_COMP_COUNT; } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { value = IBD_DEF_RC_TX_COMP_USEC; } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { value = IBD_DEF_RC_TX_COPY_THRESH; } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { value = IBD_DEF_UD_NUM_RWQE; } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { value = IBD_DEF_UD_NUM_SWQE; } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { value = IBD_DEF_UD_RX_COMP_COUNT; } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { value = IBD_DEF_UD_RX_COMP_USEC; } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { value = IBD_DEF_UD_TX_COMP_COUNT; } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { value = IBD_DEF_UD_TX_COMP_USEC; } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { value = IBD_DEF_UD_TX_COPY_THRESH; } else { return; } (void) snprintf(valstr, sizeof (valstr), "%d", value); mac_prop_info_set_default_str(prh, valstr); break; } } /* switch (pr_num) */ } /* ARGSUSED2 */ static int ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, const void *pr_val) { int err = 0; long result; if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 0 || result > 1) { err = EINVAL; } else { state->id_allow_coalesce_comp_tuning = (result == 1) ? 
B_TRUE: B_FALSE; } return (err); } if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 0 || result > 1) { err = EINVAL; } else { state->id_create_broadcast_group = (result == 1) ? B_TRUE: B_FALSE; } return (err); } if (strcmp(pr_name, "_ibd_hash_size") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { err = EINVAL; } else { state->id_hash_size = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_lso_enable") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 0 || result > 1) { err = EINVAL; } else { state->id_lso_policy = (result == 1) ? B_TRUE: B_FALSE; } mac_capab_update(state->id_mh); return (err); } if (strcmp(pr_name, "_ibd_num_ah") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { err = EINVAL; } else { state->id_num_ah = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (!state->id_lso_policy || !state->id_lso_capable) { return (EINVAL); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_NUM_LSO_BUFS || result > IBD_MAX_NUM_LSO_BUFS) { err = EINVAL; } else { state->id_num_lso_bufs = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 0 || result > 1) { err = EINVAL; } else { state->rc_enable_srq = (result == 1) ? B_TRUE: B_FALSE; } if (!state->rc_enable_srq) { state->id_rc_num_srq = 0; } return (err); } if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_RC_NUM_RWQE || result > IBD_MAX_RC_NUM_RWQE) { err = EINVAL; } else { state->id_rc_num_rwqe = (uint32_t)result; if (state->id_allow_coalesce_comp_tuning && state->id_rc_rx_comp_count > state->id_rc_num_rwqe) state->id_rc_rx_comp_count = state->id_rc_num_rwqe; if (state->id_rc_num_srq > state->id_rc_num_rwqe) state->id_rc_num_srq = state->id_rc_num_rwqe - 1; /* * If rx_rwqe_threshold is greater than the number of * rwqes, pull it back to 25% of number of rwqes. 
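 * (Illustration only: a developer could change this queue depth with a
 * command along the lines of
 *	dladm set-linkprop -p _ibd_rc_num_rwqe=<n> <partition-link>
 * on the unplumbed partition object; the surrounding adjustments then
 * pull the dependent completion count, SRQ size and receive-wqe
 * threshold back into range so they stay consistent with the new value.)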
*/ if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) state->id_rc_rx_rwqe_thresh = (state->id_rc_num_rwqe >> 2); } return (err); } if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } if (!state->rc_enable_srq) return (EINVAL); (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_RC_NUM_SRQ || result >= state->id_rc_num_rwqe) { err = EINVAL; } else state->id_rc_num_srq = (uint32_t)result; return (err); } if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_RC_NUM_SWQE || result > IBD_MAX_RC_NUM_SWQE) { err = EINVAL; } else { state->id_rc_num_swqe = (uint32_t)result; if (state->id_allow_coalesce_comp_tuning && state->id_rc_tx_comp_count > state->id_rc_num_swqe) state->id_rc_tx_comp_count = state->id_rc_num_swqe; } return (err); } if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1 || result > state->id_rc_num_rwqe) { err = EINVAL; } else { state->id_rc_rx_comp_count = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1) { err = EINVAL; } else { state->id_rc_rx_comp_usec = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_RC_RX_COPY_THRESH || result > state->rc_mtu) { err = EINVAL; } else { state->id_rc_rx_copy_thresh = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_RC_RX_RWQE_THRESH || result >= state->id_rc_num_rwqe) { err = EINVAL; } else { state->id_rc_rx_rwqe_thresh = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1 || result > state->id_rc_num_swqe) { err = EINVAL; } else { state->id_rc_tx_comp_count = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1) err = EINVAL; else { state->id_rc_tx_comp_usec = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_RC_TX_COPY_THRESH || result > state->rc_mtu) { err = EINVAL; } else { state->id_rc_tx_copy_thresh = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { if 
(state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_UD_NUM_RWQE || result > IBD_MAX_UD_NUM_RWQE) { err = EINVAL; } else { if (result > state->id_hca_max_chan_sz) { state->id_ud_num_rwqe = state->id_hca_max_chan_sz; } else { state->id_ud_num_rwqe = (uint32_t)result; } if (state->id_allow_coalesce_comp_tuning && state->id_ud_rx_comp_count > state->id_ud_num_rwqe) state->id_ud_rx_comp_count = state->id_ud_num_rwqe; } return (err); } if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_UD_NUM_SWQE || result > IBD_MAX_UD_NUM_SWQE) { err = EINVAL; } else { if (result > state->id_hca_max_chan_sz) { state->id_ud_num_swqe = state->id_hca_max_chan_sz; } else { state->id_ud_num_swqe = (uint32_t)result; } if (state->id_allow_coalesce_comp_tuning && state->id_ud_tx_comp_count > state->id_ud_num_swqe) state->id_ud_tx_comp_count = state->id_ud_num_swqe; } return (err); } if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1 || result > state->id_ud_num_rwqe) { err = EINVAL; } else { state->id_ud_rx_comp_count = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1) { err = EINVAL; } else { state->id_ud_rx_comp_usec = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1 || result > state->id_ud_num_swqe) { err = EINVAL; } else { state->id_ud_tx_comp_count = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { if (!state->id_allow_coalesce_comp_tuning) { return (ENOTSUP); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < 1) { err = EINVAL; } else { state->id_ud_tx_comp_usec = (uint32_t)result; } return (err); } if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { if (state->id_mac_state & IBD_DRV_STARTED) { return (EBUSY); } if (pr_val == NULL) { return (EINVAL); } (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); if (result < IBD_MIN_UD_TX_COPY_THRESH || result > IBD_MAX_UD_TX_COPY_THRESH) { err = EINVAL; } else { state->id_ud_tx_copy_thresh = (uint32_t)result; } return (err); } return (ENOTSUP); } static int ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, void *pr_val) { int err = ENOTSUP; int value; if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { value = state->id_bgroup_present; err = 0; goto done; } if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { value = state->id_allow_coalesce_comp_tuning; err = 0; goto done; } if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { value = state->id_create_broadcast_group; err = 0; goto done; } if (strcmp(pr_name, "_ibd_hash_size") == 0) { value = state->id_hash_size; err = 0; goto done; } if (strcmp(pr_name, "_ibd_lso_enable") == 0) { value = 
state->id_lso_policy; err = 0; goto done; } if (strcmp(pr_name, "_ibd_num_ah") == 0) { value = state->id_num_ah; err = 0; goto done; } if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { value = state->id_num_lso_bufs; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { value = state->rc_enable_srq; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { value = state->id_rc_num_rwqe; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { value = state->id_rc_num_srq; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { value = state->id_rc_num_swqe; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { value = state->id_rc_rx_comp_count; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { value = state->id_rc_rx_comp_usec; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { value = state->id_rc_rx_copy_thresh; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { value = state->id_rc_rx_rwqe_thresh; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { value = state->id_rc_tx_comp_count; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { value = state->id_rc_tx_comp_usec; err = 0; goto done; } if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { value = state->id_rc_tx_copy_thresh; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { value = state->id_ud_num_rwqe; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { value = state->id_ud_num_swqe; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { value = state->id_ud_rx_comp_count; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { value = state->id_ud_rx_comp_usec; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { value = state->id_ud_tx_comp_count; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { value = state->id_ud_tx_comp_usec; err = 0; goto done; } if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { value = state->id_ud_tx_copy_thresh; err = 0; goto done; } done: if (err == 0) { (void) snprintf(pr_val, pr_valsize, "%d", value); } return (err); } static int ibd_get_port_details(ibd_state_t *state) { ibt_hca_portinfo_t *port_infop; ibt_status_t ret; uint_t psize, port_infosz; mutex_enter(&state->id_link_mutex); /* * Query for port information */ ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, &port_infop, &psize, &port_infosz); if ((ret != IBT_SUCCESS) || (psize != 1)) { mutex_exit(&state->id_link_mutex); DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " "failed, ret=%d", ret); return (ENETDOWN); } /* * If the link is active, verify the pkey */ if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { state->id_link_state = LINK_STATE_DOWN; } else { state->id_link_state = LINK_STATE_UP; } state->id_mtu = (128 << port_infop->p_mtu); _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) state->id_sgid = *port_infop->p_sgid_tbl; _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) /* * Now that the port is active, record the port speed */ state->id_link_speed = ibd_get_portspeed(state); } else { /* Make sure that these are handled in PORT_UP/CHANGE */ state->id_mtu = 0; state->id_link_state = LINK_STATE_DOWN; state->id_link_speed = 0; } mutex_exit(&state->id_link_mutex); 
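	/*
	 * At this point everything needed from the snapshot (pkey index,
	 * SGID, link state, port speed and MTU) has been recorded in the
	 * soft state under id_link_mutex, so the portinfo buffer can be
	 * released.  Note that p_mtu is the IB MTU enum rather than a
	 * byte count; (128 << p_mtu) converts it, e.g. 128 << 5 = 4096.
	 */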
ibt_free_portinfo(port_infop, port_infosz); return (0); } static int ibd_alloc_cqs(ibd_state_t *state) { ibt_hca_attr_t hca_attrs; ibt_cq_attr_t cq_attr; ibt_status_t ret; uint32_t real_size; uint_t num_rwqe_change = 0; uint_t num_swqe_change = 0; ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); ASSERT(ret == IBT_SUCCESS); /* * Allocate Rx/combined CQ: * Theoretically, there is no point in having more than #rwqe * plus #swqe cqe's, except that the CQ will be signaled for * overflow when the last wqe completes, if none of the previous * cqe's have been polled. Thus, we allocate just a few less wqe's * to make sure such overflow does not occur. */ cq_attr.cq_sched = NULL; cq_attr.cq_flags = IBT_CQ_NO_FLAGS; /* * Allocate Receive CQ. */ if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { cq_attr.cq_size = state->id_ud_num_rwqe + 1; } else { cq_attr.cq_size = hca_attrs.hca_max_cq_sz; num_rwqe_change = state->id_ud_num_rwqe; state->id_ud_num_rwqe = cq_attr.cq_size - 1; } if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " "failed, ret=%d\n", ret); return (DDI_FAILURE); } if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " "moderation failed, ret=%d\n", ret); } /* make the #rx wc's the same as max rx chain size */ state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * state->id_rxwcs_size, KM_SLEEP); /* * Allocate Send CQ. */ if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { cq_attr.cq_size = state->id_ud_num_swqe + 1; } else { cq_attr.cq_size = hca_attrs.hca_max_cq_sz; num_swqe_change = state->id_ud_num_swqe; state->id_ud_num_swqe = cq_attr.cq_size - 1; } if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " "failed, ret=%d\n", ret); kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); (void) ibt_free_cq(state->id_rcq_hdl); return (DDI_FAILURE); } if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " "moderation failed, ret=%d\n", ret); } state->id_txwcs_size = IBD_TX_POLL_THRESH; state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * state->id_txwcs_size, KM_SLEEP); /* * Print message in case we could not allocate as many wqe's * as was requested. 
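 * (The adjustment mirrors the sizing above: when hca_max_cq_sz cannot
 * cover the requested count plus one, the wqe count is pulled back to
 * hca_max_cq_sz - 1 so the CQ can never be overrun, i.e. roughly
 *	id_ud_num_rwqe = min(id_ud_num_rwqe, hca_max_cq_sz - 1);
 * and similarly for the send side.)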
*/ if (num_rwqe_change) { ibd_print_warn(state, "Setting #rwqe = %d instead of default " "%d", state->id_ud_num_rwqe, num_rwqe_change); } if (num_swqe_change) { ibd_print_warn(state, "Setting #swqe = %d instead of default " "%d", state->id_ud_num_swqe, num_swqe_change); } return (DDI_SUCCESS); } static int ibd_setup_ud_channel(ibd_state_t *state) { ibt_ud_chan_alloc_args_t ud_alloc_attr; ibt_ud_chan_query_attr_t ud_chan_attr; ibt_status_t ret; ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; if (state->id_hca_res_lkey_capab) ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; if (state->id_lso_policy && state->id_lso_capable) ud_alloc_attr.ud_flags |= IBT_USES_LSO; ud_alloc_attr.ud_hca_port_num = state->id_port; ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; ud_alloc_attr.ud_scq = state->id_scq_hdl; ud_alloc_attr.ud_rcq = state->id_rcq_hdl; ud_alloc_attr.ud_pd = state->id_pd_hdl; ud_alloc_attr.ud_pkey_ix = state->id_pkix; ud_alloc_attr.ud_clone_chan = NULL; if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " "failed, ret=%d\n", ret); return (DDI_FAILURE); } if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr)) != IBT_SUCCESS) { DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " "failed, ret=%d\n", ret); (void) ibt_free_channel(state->id_chnl_hdl); return (DDI_FAILURE); } state->id_qpnum = ud_chan_attr.ud_qpn; return (DDI_SUCCESS); } static int ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) { uint32_t progress = state->id_mac_state; uint_t attempts; ibt_status_t ret; ib_gid_t mgid; ibd_mce_t *mce; uint8_t jstate; timeout_id_t tid; if (atomic_dec_32_nv(&state->id_running) != 0) cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); /* * Before we try to stop/undo whatever we did in ibd_start(), * we need to mark the link state appropriately to prevent the * ip layer from using this instance for any new transfers. Note * that if the original state of the link was "up" when we're * here, we'll set the final link state to "unknown", to behave * in the same fashion as other ethernet drivers. 
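 * The new state is reported through mac_link_update() immediately
 * below, ahead of any actual teardown.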
*/ mutex_enter(&state->id_link_mutex); if (cur_link_state == LINK_STATE_DOWN) { state->id_link_state = cur_link_state; } else { state->id_link_state = LINK_STATE_UNKNOWN; } mutex_exit(&state->id_link_mutex); bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); mac_link_update(state->id_mh, state->id_link_state); state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); if (progress & IBD_DRV_STARTED) { state->id_mac_state &= (~IBD_DRV_STARTED); } if (progress & IBD_DRV_IN_LATE_HCA_INIT) { state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); } /* Stop listen under Reliable Connected Mode */ if (progress & IBD_DRV_RC_LISTEN) { ASSERT(state->id_enable_rc); if (state->rc_listen_hdl != NULL) { ibd_rc_stop_listen(state); } state->id_mac_state &= (~IBD_DRV_RC_LISTEN); } /* Stop timeout routine */ if (progress & IBD_DRV_RC_TIMEOUT) { ASSERT(state->id_enable_rc); mutex_enter(&state->rc_timeout_lock); state->rc_timeout_start = B_FALSE; tid = state->rc_timeout; state->rc_timeout = 0; mutex_exit(&state->rc_timeout_lock); if (tid != 0) (void) untimeout(tid); state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT); } if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { attempts = 100; while (state->id_ah_op == IBD_OP_ONGOING) { /* * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB * port is connecting to a remote IPoIB port. Wait for * the end of this connecting operation. */ delay(drv_usectohz(100000)); if (--attempts == 0) { state->rc_stop_connect++; DPRINT(40, "ibd_undo_start: connecting"); break; } } mutex_enter(&state->id_sched_lock); state->id_sched_needed = 0; mutex_exit(&state->id_sched_lock); (void) ibd_rc_close_all_chan(state); } /* * First, stop receive interrupts; this stops the driver from * handing up buffers to higher layers. Wait for receive buffers * to be returned and give up after 1 second. */ if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { attempts = 10; while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) > 0) { delay(drv_usectohz(100000)); if (--attempts == 0) { /* * There are pending bufs with the network * layer and we have no choice but to wait * for them to be done with. Reap all the * Tx/Rx completions that were posted since * we turned off the notification and * return failure. */ cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); DPRINT(2, "ibd_undo_start: " "reclaiming failed"); break; } } state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); } if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { ibd_rc_fini_tx_largebuf_list(state); state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); } if (progress & IBD_DRV_RC_SRQ_ALLOCD) { ASSERT(state->id_enable_rc); if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { if (state->id_ah_op == IBD_OP_ONGOING) { delay(drv_usectohz(10000)); if (state->id_ah_op == IBD_OP_ONGOING) { /* * "state->id_ah_op == IBD_OP_ONGOING" * means this IPoIB port is connecting * to a remote IPoIB port. We can't * delete SRQ here. 
*/ state->rc_stop_connect++; DPRINT(40, "ibd_undo_start: " "connecting"); } else { ibd_rc_fini_srq_list(state); state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); } } else { ibd_rc_fini_srq_list(state); state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); } } else { DPRINT(40, "ibd_undo_start: srq bufs outstanding\n"); } } if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); mutex_enter(&state->id_trap_lock); state->id_trap_stop = B_TRUE; while (state->id_trap_inprog > 0) cv_wait(&state->id_trap_cv, &state->id_trap_lock); mutex_exit(&state->id_trap_lock); state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); } if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { /* * Flushing the channel ensures that all pending WQE's * are marked with flush_error and handed to the CQ. It * does not guarantee the invocation of the CQ handler. * This call is guaranteed to return successfully for * UD QPNs. */ if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_undo_start: flush_channel " "failed, ret=%d", ret); } /* * Give some time for the TX CQ handler to process the * completions. */ attempts = 10; mutex_enter(&state->id_tx_list.dl_mutex); mutex_enter(&state->id_tx_rel_list.dl_mutex); while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != state->id_ud_num_swqe) { if (--attempts == 0) break; mutex_exit(&state->id_tx_rel_list.dl_mutex); mutex_exit(&state->id_tx_list.dl_mutex); delay(drv_usectohz(100000)); mutex_enter(&state->id_tx_list.dl_mutex); mutex_enter(&state->id_tx_rel_list.dl_mutex); } ibt_set_cq_handler(state->id_scq_hdl, 0, 0); if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != state->id_ud_num_swqe) { cmn_err(CE_WARN, "tx resources not freed\n"); } mutex_exit(&state->id_tx_rel_list.dl_mutex); mutex_exit(&state->id_tx_list.dl_mutex); attempts = 10; while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { if (--attempts == 0) break; delay(drv_usectohz(100000)); } ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { cmn_err(CE_WARN, "rx resources not freed\n"); } state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); } if (progress & IBD_DRV_BCAST_GROUP_JOINED) { /* * Drop all residual full/non membership. This includes full * membership to the broadcast group, and any nonmembership * acquired during transmits. We do this after the Tx completion * handlers are done, since those might result in some late * leaves; this also eliminates a potential race with that * path wrt the mc full list insert/delete. Trap handling * has also been suppressed at this point. Thus, no locks * are required while traversing the mc full list. 
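 * The walk below copies the mgid and jstate out of each entry and
 * steps to the next element before ibd_leave_group() is called, since
 * the leave may free the mce being processed.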
*/ DPRINT(2, "ibd_undo_start: clear full cache entries"); mce = list_head(&state->id_mc_full); while (mce != NULL) { mgid = mce->mc_info.mc_adds_vect.av_dgid; jstate = mce->mc_jstate; mce = list_next(&state->id_mc_full, mce); ibd_leave_group(state, mgid, jstate); } state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); } if (progress & IBD_DRV_RXLIST_ALLOCD) { ibd_fini_rxlist(state); state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); } if (progress & IBD_DRV_TXLIST_ALLOCD) { ibd_fini_txlist(state); state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); } if (progress & IBD_DRV_UD_CHANNEL_SETUP) { if ((ret = ibt_free_channel(state->id_chnl_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_undo_start: free_channel " "failed, ret=%d", ret); } state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); } if (progress & IBD_DRV_CQS_ALLOCD) { kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * state->id_txwcs_size); if ((ret = ibt_free_cq(state->id_scq_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_undo_start: free_cq(scq) " "failed, ret=%d", ret); } kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " "ret=%d", ret); } state->id_txwcs = NULL; state->id_rxwcs = NULL; state->id_scq_hdl = NULL; state->id_rcq_hdl = NULL; state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); } if (progress & IBD_DRV_ACACHE_INITIALIZED) { mutex_enter(&state->id_ac_mutex); mod_hash_destroy_hash(state->id_ah_active_hash); mutex_exit(&state->id_ac_mutex); ibd_acache_fini(state); state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); } if (progress & IBD_DRV_BCAST_GROUP_FOUND) { /* * If we'd created the ipoib broadcast group and had * successfully joined it, leave it now */ if (state->id_bgroup_created) { mgid = state->id_mcinfo->mc_adds_vect.av_dgid; jstate = IB_MC_JSTATE_FULL; (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); } ibt_free_mcg_info(state->id_mcinfo, 1); state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); } return (DDI_SUCCESS); } /* * These pair of routines are used to set/clear the condition that * the caller is likely to do something to change the id_mac_state. * If there's already someone doing either a start or a stop (possibly * due to the async handler detecting a pkey relocation event, a plumb * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until * that's done. */ static void ibd_set_mac_progress(ibd_state_t *state, uint_t flag) { mutex_enter(&state->id_macst_lock); while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) cv_wait(&state->id_macst_cv, &state->id_macst_lock); state->id_mac_state |= flag; mutex_exit(&state->id_macst_lock); } static void ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) { mutex_enter(&state->id_macst_lock); state->id_mac_state &= (~flag); cv_signal(&state->id_macst_cv); mutex_exit(&state->id_macst_lock); } /* * GLDv3 entry point to start hardware. 
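 * The bring-up is bracketed by ibd_set_mac_progress() and
 * ibd_clr_mac_progress() so it does not overlap a restart driven from
 * the async path, and it bails out early with EIO if the partition
 * object is marked for deletion (IBD_DRV_IN_DELETION).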
*/ /*ARGSUSED*/ static int ibd_m_start(void *arg) { ibd_state_t *state = arg; int ret; if (state->id_type == IBD_PORT_DRIVER) return (EINVAL); ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); if (state->id_mac_state & IBD_DRV_IN_DELETION) { ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); return (EIO); } ret = ibd_start(state); ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); return (ret); } static int ibd_start(ibd_state_t *state) { int err; ibt_status_t ret; int late_hca_init = 0; if (state->id_mac_state & IBD_DRV_STARTED) return (DDI_SUCCESS); /* * We do not increment the running flag when calling ibd_start() as * a result of some event which moves the state away from late HCA * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. */ if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && (atomic_inc_32_nv(&state->id_running) != 1)) { DPRINT(10, "ibd_start: id_running is non-zero"); cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); atomic_dec_32(&state->id_running); return (EINVAL); } /* * Get port details; if we fail here, something bad happened. * Fail plumb. */ if ((err = ibd_get_port_details(state)) != 0) { DPRINT(10, "ibd_start: ibd_get_port_details() failed"); goto start_fail; } /* * If state->id_link_state is DOWN, it indicates that either the port * is down, or the pkey is not available. In both cases, resort to late * initialization. Register for subnet notices, and return success. */ state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; if (state->id_link_state == LINK_STATE_DOWN) { late_hca_init = 1; goto late_hca_init_return; } /* * Find the IPoIB broadcast group */ if (ibd_find_bgroup(state) != IBT_SUCCESS) { /* Resort to late initialization */ late_hca_init = 1; goto reg_snet_notices; } state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; /* * Initialize per-interface caches and lists; if we fail here, * it is most likely due to a lack of resources */ if (ibd_acache_init(state) != DDI_SUCCESS) { DPRINT(10, "ibd_start: ibd_acache_init() failed"); err = ENOMEM; goto start_fail; } state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; /* * Allocate send and receive completion queues */ if (ibd_alloc_cqs(state) != DDI_SUCCESS) { DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); err = ENOMEM; goto start_fail; } state->id_mac_state |= IBD_DRV_CQS_ALLOCD; /* * Setup a UD channel */ if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { err = ENOMEM; DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); goto start_fail; } state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; /* * Allocate and initialize the tx buffer list */ if (ibd_init_txlist(state) != DDI_SUCCESS) { DPRINT(10, "ibd_start: ibd_init_txlist() failed"); err = ENOMEM; goto start_fail; } state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; /* * Create the send cq handler here */ ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " "failed, ret=%d", ret); err = EINVAL; goto start_fail; } state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; /* * Allocate and initialize the rx buffer list */ if (ibd_init_rxlist(state) != DDI_SUCCESS) { DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); err = ENOMEM; goto start_fail; } state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; /* * Join IPoIB broadcast group */ if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { DPRINT(10, "ibd_start: ibd_join_group() failed"); err = ENOTACTIVE; goto start_fail; } 
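	/*
	 * Each successful step above records an IBD_DRV_* progress flag in
	 * id_mac_state; on any failure, the start_fail path hands the
	 * accumulated flags to ibd_undo_start(), which unwinds only the
	 * work that actually completed.
	 */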
state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; /* * When we did mac_register() in ibd_attach(), we didn't register * the real macaddr and we didn't have the true port mtu. Now that * we're almost ready, set the local mac address and broadcast * addresses and update gldv3 about the real values of these * parameters. */ if (state->id_enable_rc) { ibd_h2n_mac(&state->id_macaddr, IBD_MAC_ADDR_RC + state->id_qpnum, state->id_sgid.gid_prefix, state->id_sgid.gid_guid); ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, state->id_sgid.gid_prefix, state->id_sgid.gid_guid); } else { ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, state->id_sgid.gid_prefix, state->id_sgid.gid_guid); } ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, state->id_mgid.gid_guid); if (!state->id_enable_rc) { (void) mac_maxsdu_update2(state->id_mh, state->id_mtu - IPOIB_HDRSIZE, state->id_mtu - IPOIB_HDRSIZE); } mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); /* * Setup the receive cq handler */ ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " "failed, ret=%d", ret); err = EINVAL; goto start_fail; } state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; reg_snet_notices: /* * In case of normal initialization sequence, * Setup the subnet notices handler after we've initialized the acache/ * mcache and started the async thread, both of which are required for * the trap handler to function properly. * * Now that the async thread has been started (and we've already done * a mac_register() during attach so mac_tx_update() can be called * if necessary without any problem), we can enable the trap handler * to queue requests to the async thread. * * In case of late hca initialization, the subnet notices handler will * only handle MCG created/deleted event. The action performed as part * of handling these events is to start the interface. So, the * acache/mcache initialization is not a necessity in such cases for * registering the subnet notices handler. Also, if we are in * ibd_start() as a result of, say, some event handling after entering * late hca initialization phase no need to register again. */ if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { ibt_register_subnet_notices(state->id_ibt_hdl, ibd_snet_notices_handler, state); mutex_enter(&state->id_trap_lock); state->id_trap_stop = B_FALSE; mutex_exit(&state->id_trap_lock); state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; } late_hca_init_return: if (late_hca_init == 1) { state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; /* * In case of late initialization, mark the link state as down, * immaterial of the actual link state as reported in the * port_info. 
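 * The instance therefore stays link-down after plumbing; a later call
 * into ibd_start() (for instance from ibd_async_trap() once the awaited
 * IPoIB MCG is created) completes the bring-up and reports the real
 * link state.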
*/ state->id_link_state = LINK_STATE_DOWN; mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); mac_link_update(state->id_mh, state->id_link_state); return (DDI_SUCCESS); } if (state->id_enable_rc) { if (state->rc_enable_srq) { if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { if (ibd_rc_repost_srq_free_list(state) != IBT_SUCCESS) { err = ENOMEM; goto start_fail; } } else { /* Allocate SRQ resource */ if (ibd_rc_init_srq_list(state) != IBT_SUCCESS) { err = ENOMEM; goto start_fail; } state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; } } if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " "failed"); err = ENOMEM; goto start_fail; } state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; /* RC: begin to listen only after everything is available */ if (ibd_rc_listen(state) != IBT_SUCCESS) { DPRINT(10, "ibd_start: ibd_rc_listen() failed"); err = EINVAL; goto start_fail; } state->id_mac_state |= IBD_DRV_RC_LISTEN; } /* * Indicate link status to GLDv3 and higher layers. By default, * we assume we are in up state (which must have been true at * least at the time the broadcast mcg's were probed); if there * were any up/down transitions till the time we come here, the * async handler will have updated last known state, which we * use to tell GLDv3. The async handler will not send any * notifications to GLDv3 till we reach here in the initialization * sequence. */ mac_link_update(state->id_mh, state->id_link_state); state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; state->id_mac_state |= IBD_DRV_STARTED; /* Start timer after everything is ready */ if (state->id_enable_rc) { mutex_enter(&state->rc_timeout_lock); state->rc_timeout_start = B_TRUE; state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state, SEC_TO_TICK(ibd_rc_conn_timeout)); mutex_exit(&state->rc_timeout_lock); state->id_mac_state |= IBD_DRV_RC_TIMEOUT; } return (DDI_SUCCESS); start_fail: /* * If we ran into a problem during ibd_start() and ran into * some other problem during undoing our partial work, we can't * do anything about it. Ignore any errors we might get from * ibd_undo_start() and just return the original error we got. */ (void) ibd_undo_start(state, LINK_STATE_DOWN); return (err); } /* * GLDv3 entry point to stop hardware from receiving packets. */ /*ARGSUSED*/ static void ibd_m_stop(void *arg) { ibd_state_t *state = (ibd_state_t *)arg; if (state->id_type == IBD_PORT_DRIVER) return; ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); (void) ibd_undo_start(state, state->id_link_state); ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); } /* * GLDv3 entry point to modify device's mac address. We do not * allow address modifications. */ static int ibd_m_unicst(void *arg, const uint8_t *macaddr) { ibd_state_t *state = arg; if (state->id_type == IBD_PORT_DRIVER) return (EINVAL); /* * Don't bother even comparing the macaddr if we haven't * completed ibd_m_start(). */ if ((state->id_mac_state & IBD_DRV_STARTED) == 0) return (0); if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) return (0); else return (EINVAL); } /* * The blocking part of the IBA join/leave operations are done out * of here on the async thread. 
*/ static void ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) { DPRINT(3, "ibd_async_multicast : async_setmc op %d :" "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); if (op == IBD_ASYNC_JOIN) { if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { ibd_print_warn(state, "Join multicast group failed :" "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); } } else { /* * Here, we must search for the proper mcg_info and * use that to leave the group. */ ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); } } /* * GLDv3 entry point for multicast enable/disable requests. * This function queues the operation to the async thread and * return success for a valid multicast address. */ static int ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) { ibd_state_t *state = (ibd_state_t *)arg; ipoib_mac_t maddr, *mcast; ib_gid_t mgid; ibd_req_t *req; if (state->id_type == IBD_PORT_DRIVER) return (EINVAL); /* * If we haven't completed ibd_m_start(), async thread wouldn't * have been started and id_bcaddr wouldn't be set, so there's * no point in continuing. */ if ((state->id_mac_state & IBD_DRV_STARTED) == 0) return (0); /* * The incoming multicast address might not be aligned properly * on a 4 byte boundary to be considered an ipoib_mac_t. We force * it to look like one though, to get the offsets of the mc gid, * since we know we are not going to dereference any values with * the ipoib_mac_t pointer. */ bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); mcast = &maddr; /* * Check validity of MCG address. We could additionally check * that a enable/disable is not being issued on the "broadcast" * mcg, but since this operation is only invokable by privileged * programs anyway, we allow the flexibility to those dlpi apps. * Note that we do not validate the "scope" of the IBA mcg. */ if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) return (EINVAL); /* * fill in multicast pkey and scope */ IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); /* * If someone is trying to JOIN/LEAVE the broadcast group, we do * nothing (i.e. we stay JOINed to the broadcast group done in * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically * requires to be joined to broadcast groups at all times. * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also * depends on this. */ if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) return (0); ibd_n2h_gid(mcast, &mgid); req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) return (ENOMEM); req->rq_gid = mgid; if (add) { DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); } else { DPRINT(1, "ibd_m_multicst : unset_multicast : " "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); } return (0); } /* * The blocking part of the IBA promiscuous operations are done * out of here on the async thread. The dlpireq parameter indicates * whether this invocation is due to a dlpi request or due to * a port up/down event. */ static void ibd_async_unsetprom(ibd_state_t *state) { ibd_mce_t *mce = list_head(&state->id_mc_non); ib_gid_t mgid; DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); while (mce != NULL) { mgid = mce->mc_info.mc_adds_vect.av_dgid; mce = list_next(&state->id_mc_non, mce); ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); } state->id_prom_op = IBD_OP_NOTSTARTED; } /* * The blocking part of the IBA promiscuous operations are done * out of here on the async thread. 
The dlpireq parameter indicates * whether this invocation is due to a dlpi request or due to * a port up/down event. */ static void ibd_async_setprom(ibd_state_t *state) { ibt_mcg_attr_t mcg_attr; ibt_mcg_info_t *mcg_info; ib_gid_t mgid; uint_t numg; int i; char ret = IBD_OP_COMPLETED; DPRINT(2, "ibd_async_setprom : async_set_promisc"); /* * Obtain all active MC groups on the IB fabric with * specified criteria (scope + Pkey + Qkey + mtu). */ bzero(&mcg_attr, sizeof (mcg_attr)); mcg_attr.mc_pkey = state->id_pkey; mcg_attr.mc_scope = state->id_scope; mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; mcg_attr.mc_mtu_req.r_selector = IBT_EQU; if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != IBT_SUCCESS) { ibd_print_warn(state, "Could not get list of IBA multicast " "groups"); ret = IBD_OP_ERRORED; goto done; } /* * Iterate over the returned mcg's and join as NonMember * to the IP mcg's. */ for (i = 0; i < numg; i++) { /* * Do a NonMember JOIN on the MC group. */ mgid = mcg_info[i].mc_adds_vect.av_dgid; if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) ibd_print_warn(state, "IBA promiscuous mode missed " "multicast gid %016llx:%016llx", (u_longlong_t)mgid.gid_prefix, (u_longlong_t)mgid.gid_guid); } ibt_free_mcg_info(mcg_info, numg); DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); done: state->id_prom_op = ret; } /* * GLDv3 entry point for multicast promiscuous enable/disable requests. * GLDv3 assumes phys state receives more packets than multi state, * which is not true for IPoIB. Thus, treat the multi and phys * promiscuous states the same way to work with GLDv3's assumption. */ static int ibd_m_promisc(void *arg, boolean_t on) { ibd_state_t *state = (ibd_state_t *)arg; ibd_req_t *req; if (state->id_type == IBD_PORT_DRIVER) return (EINVAL); /* * Async thread wouldn't have been started if we haven't * passed ibd_m_start() */ if ((state->id_mac_state & IBD_DRV_STARTED) == 0) return (0); req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) return (ENOMEM); if (on) { DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); } else { DPRINT(1, "ibd_m_promisc : unset_promisc"); ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); } return (0); } /* * GLDv3 entry point for gathering statistics. 
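 * The RC-mode counters are folded into the generic byte/packet stats
 * below (e.g. MAC_STAT_RBYTES adds rc_rcv_trans_byte and
 * rc_rcv_copy_byte to id_rcv_bytes), so a single set of statistics
 * covers both the UD and RC data paths.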
*/ static int ibd_m_stat(void *arg, uint_t stat, uint64_t *val) { ibd_state_t *state = (ibd_state_t *)arg; switch (stat) { case MAC_STAT_IFSPEED: *val = state->id_link_speed; break; case MAC_STAT_MULTIRCV: *val = state->id_multi_rcv; break; case MAC_STAT_BRDCSTRCV: *val = state->id_brd_rcv; break; case MAC_STAT_MULTIXMT: *val = state->id_multi_xmt; break; case MAC_STAT_BRDCSTXMT: *val = state->id_brd_xmt; break; case MAC_STAT_RBYTES: *val = state->id_rcv_bytes + state->rc_rcv_trans_byte + state->rc_rcv_copy_byte; break; case MAC_STAT_IPACKETS: *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt + state->rc_rcv_copy_pkt; break; case MAC_STAT_OBYTES: *val = state->id_xmt_bytes + state->rc_xmt_bytes; break; case MAC_STAT_OPACKETS: *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + state->rc_xmt_fragmented_pkt + state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; break; case MAC_STAT_OERRORS: *val = state->id_ah_error; /* failed AH translation */ break; case MAC_STAT_IERRORS: *val = 0; break; case MAC_STAT_NOXMTBUF: *val = state->id_tx_short + state->rc_swqe_short + state->rc_xmt_buf_short; break; case MAC_STAT_NORCVBUF: default: return (ENOTSUP); } return (0); } static void ibd_async_txsched(ibd_state_t *state) { ibd_resume_transmission(state); } static void ibd_resume_transmission(ibd_state_t *state) { int flag; int met_thresh = 0; int thresh = 0; int ret = -1; mutex_enter(&state->id_sched_lock); if (state->id_sched_needed & IBD_RSRC_SWQE) { mutex_enter(&state->id_tx_list.dl_mutex); mutex_enter(&state->id_tx_rel_list.dl_mutex); met_thresh = state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt; mutex_exit(&state->id_tx_rel_list.dl_mutex); mutex_exit(&state->id_tx_list.dl_mutex); thresh = IBD_FREE_SWQES_THRESH; flag = IBD_RSRC_SWQE; } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { ASSERT(state->id_lso != NULL); mutex_enter(&state->id_lso_lock); met_thresh = state->id_lso->bkt_nfree; thresh = IBD_FREE_LSOS_THRESH; mutex_exit(&state->id_lso_lock); flag = IBD_RSRC_LSOBUF; if (met_thresh > thresh) state->id_sched_lso_cnt++; } if (met_thresh > thresh) { state->id_sched_needed &= ~flag; state->id_sched_cnt++; ret = 0; } mutex_exit(&state->id_sched_lock); if (ret == 0) mac_tx_update(state->id_mh); } /* * Release the send wqe back into free list. */ static void ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) { /* * Add back on Tx list for reuse. */ ASSERT(tail->swqe_next == NULL); mutex_enter(&state->id_tx_rel_list.dl_mutex); state->id_tx_rel_list.dl_pending_sends = B_FALSE; tail->swqe_next = state->id_tx_rel_list.dl_head; state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); state->id_tx_rel_list.dl_cnt += n; mutex_exit(&state->id_tx_rel_list.dl_mutex); } /* * Acquire a send wqe from free list. * Returns error number and send wqe pointer. 
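 * (In practice the only failure indication is a NULL return, which
 * means both the active free list and the release list were found
 * empty; in that case id_tx_short is bumped and dl_pending_sends is
 * set.)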
*/ static ibd_swqe_t * ibd_acquire_swqe(ibd_state_t *state) { ibd_swqe_t *wqe; mutex_enter(&state->id_tx_rel_list.dl_mutex); if (state->id_tx_rel_list.dl_head != NULL) { /* transfer id_tx_rel_list to id_tx_list */ state->id_tx_list.dl_head = state->id_tx_rel_list.dl_head; state->id_tx_list.dl_cnt = state->id_tx_rel_list.dl_cnt; state->id_tx_list.dl_pending_sends = B_FALSE; /* clear id_tx_rel_list */ state->id_tx_rel_list.dl_head = NULL; state->id_tx_rel_list.dl_cnt = 0; mutex_exit(&state->id_tx_rel_list.dl_mutex); wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); state->id_tx_list.dl_cnt -= 1; state->id_tx_list.dl_head = wqe->swqe_next; } else { /* no free swqe */ mutex_exit(&state->id_tx_rel_list.dl_mutex); state->id_tx_list.dl_pending_sends = B_TRUE; DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); state->id_tx_short++; wqe = NULL; } return (wqe); } static int ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, ibt_ud_dest_hdl_t ud_dest) { mblk_t *nmp; int iph_len, tcph_len; ibt_wr_lso_t *lso; uintptr_t ip_start, tcp_start; uint8_t *dst; uint_t pending, mblen; /* * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; * we need to adjust it here for lso. */ lso = &(node->w_swr.wr.ud_lso); lso->lso_ud_dest = ud_dest; lso->lso_mss = mss; /* * Calculate the LSO header size and set it in the UD LSO structure. * Note that the only assumption we make is that each of the IPoIB, * IP and TCP headers will be contained in a single mblk fragment; * together, the headers may span multiple mblk fragments. */ nmp = mp; ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; if (ip_start >= (uintptr_t)(nmp->b_wptr)) { ip_start = (uintptr_t)nmp->b_cont->b_rptr + (ip_start - (uintptr_t)(nmp->b_wptr)); nmp = nmp->b_cont; } iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); tcp_start = ip_start + iph_len; if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { tcp_start = (uintptr_t)nmp->b_cont->b_rptr + (tcp_start - (uintptr_t)(nmp->b_wptr)); nmp = nmp->b_cont; } tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; /* * If the lso header fits entirely within a single mblk fragment, * we'll avoid an additional copy of the lso header here and just * pass the b_rptr of the mblk directly. * * If this isn't true, we'd have to allocate for it explicitly. 
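 * As a concrete (typical) example: for an IPv4/TCP segment carrying no
 * IP or TCP options, lso_hdr_sz works out to IPOIB_HDRSIZE + 20 + 20,
 * i.e. 44 bytes with the usual 4-byte IPoIB header.  That comfortably
 * fits within the first mblk fragment, so the common case takes the
 * zero-copy branch below and just points lso_hdr at mp->b_rptr.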
*/ if (lso->lso_hdr_sz <= MBLKL(mp)) { lso->lso_hdr = mp->b_rptr; } else { /* On work completion, remember to free this allocated hdr */ lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); if (lso->lso_hdr == NULL) { DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " "sz = %d", lso->lso_hdr_sz); lso->lso_hdr_sz = 0; lso->lso_mss = 0; return (-1); } } /* * Copy in the lso header only if we need to */ if (lso->lso_hdr != mp->b_rptr) { dst = lso->lso_hdr; pending = lso->lso_hdr_sz; for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { mblen = MBLKL(nmp); if (pending > mblen) { bcopy(nmp->b_rptr, dst, mblen); dst += mblen; pending -= mblen; } else { bcopy(nmp->b_rptr, dst, pending); break; } } } return (0); } static void ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) { ibt_wr_lso_t *lso; if ((!node) || (!mp)) return; /* * Free any header space that we might've allocated if we * did an LSO */ if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { lso = &(node->w_swr.wr.ud_lso); if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { kmem_free(lso->lso_hdr, lso->lso_hdr_sz); lso->lso_hdr = NULL; lso->lso_hdr_sz = 0; } } } static void ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) { uint_t i; uint_t num_posted; uint_t n_wrs; ibt_status_t ibt_status; ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; ibd_swqe_t *tx_head, *elem; ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; /* post the one request, then check for more */ ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); if (ibt_status != IBT_SUCCESS) { ibd_print_warn(state, "ibd_post_send: " "posting one wr failed: ret=%d", ibt_status); ibd_tx_cleanup(state, node); } tx_head = NULL; for (;;) { if (tx_head == NULL) { mutex_enter(&state->id_txpost_lock); tx_head = state->id_tx_head; if (tx_head == NULL) { state->id_tx_busy = 0; mutex_exit(&state->id_txpost_lock); return; } state->id_tx_head = NULL; mutex_exit(&state->id_txpost_lock); } /* * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs * at a time if possible, and keep posting them. */ for (n_wrs = 0, elem = tx_head; (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { nodes[n_wrs] = elem; wrs[n_wrs] = elem->w_swr; } tx_head = elem; ASSERT(n_wrs != 0); /* * If posting fails for some reason, we'll never receive * completion intimation, so we'll need to cleanup. But * we need to make sure we don't clean up nodes whose * wrs have been successfully posted. We assume that the * hca driver returns on the first failure to post and * therefore the first 'num_posted' entries don't need * cleanup here. 
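 * A note on the structure of the loop below: exactly one thread at a
 * time acts as the poster (the one that set id_tx_busy in ibd_send());
 * other senders merely chain their swqes onto id_tx_head under
 * id_txpost_lock and return.  The poster keeps harvesting that chain
 * and handing it to the HCA in batches of up to
 * IBD_MAX_TX_POST_MULTIPLE work requests per ibt_post_send() call,
 * amortizing the cost of the verbs call across many packets.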
*/ num_posted = 0; ibt_status = ibt_post_send(state->id_chnl_hdl, wrs, n_wrs, &num_posted); if (ibt_status != IBT_SUCCESS) { ibd_print_warn(state, "ibd_post_send: " "posting multiple wrs failed: " "requested=%d, done=%d, ret=%d", n_wrs, num_posted, ibt_status); for (i = num_posted; i < n_wrs; i++) ibd_tx_cleanup(state, nodes[i]); } } } static int ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, uint_t lsohdr_sz) { ibt_wr_ds_t *sgl; ibt_status_t ibt_status; mblk_t *nmp; mblk_t *data_mp; uchar_t *bufp; size_t blksize; size_t skip; size_t avail; uint_t pktsize; uint_t frag_len; uint_t pending_hdr; int nmblks; int i; /* * Let's skip ahead to the data if this is LSO */ data_mp = mp; pending_hdr = 0; if (lsohdr_sz) { pending_hdr = lsohdr_sz; for (nmp = mp; nmp; nmp = nmp->b_cont) { frag_len = nmp->b_wptr - nmp->b_rptr; if (frag_len > pending_hdr) break; pending_hdr -= frag_len; } data_mp = nmp; /* start of data past lso header */ ASSERT(data_mp != NULL); } /* * Calculate the size of message data and number of msg blocks */ pktsize = 0; for (nmblks = 0, nmp = data_mp; nmp != NULL; nmp = nmp->b_cont, nmblks++) { pktsize += MBLKL(nmp); } pktsize -= pending_hdr; /* * We only do ibt_map_mem_iov() if the pktsize is above the * "copy-threshold", and if the number of mp fragments is less than * the maximum acceptable. */ if ((state->id_hca_res_lkey_capab) && (pktsize > state->id_ud_tx_copy_thresh) && (nmblks < state->id_max_sqseg_hiwm)) { ibt_iov_t iov_arr[IBD_MAX_SQSEG]; ibt_iov_attr_t iov_attr; iov_attr.iov_as = NULL; iov_attr.iov = iov_arr; iov_attr.iov_buf = NULL; iov_attr.iov_list_len = nmblks; iov_attr.iov_wr_nds = state->id_max_sqseg; iov_attr.iov_lso_hdr_sz = lsohdr_sz; iov_attr.iov_flags = IBT_IOV_SLEEP; for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; iov_arr[i].iov_len = MBLKL(nmp); if (i == 0) { iov_arr[i].iov_addr += pending_hdr; iov_arr[i].iov_len -= pending_hdr; } } node->w_buftype = IBD_WQE_MAPPED; node->w_swr.wr_sgl = node->w_sgl; ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); if (ibt_status != IBT_SUCCESS) { ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); goto ibd_copy_path; } return (0); } ibd_copy_path: if (pktsize <= state->id_tx_buf_sz) { node->swqe_copybuf.ic_sgl.ds_len = pktsize; node->w_swr.wr_nds = 1; node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; node->w_buftype = IBD_WQE_TXBUF; /* * Even though this is the copy path for transfers less than * id_tx_buf_sz, it could still be an LSO packet. If so, it * is possible the first data mblk fragment (data_mp) still * contains part of the LSO header that we need to skip. */ bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { blksize = MBLKL(nmp) - pending_hdr; bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); bufp += blksize; pending_hdr = 0; } return (0); } /* * Copy path for transfers greater than id_tx_buf_sz */ node->w_swr.wr_sgl = node->w_sgl; if (ibd_acquire_lsobufs(state, pktsize, node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); return (-1); } node->w_buftype = IBD_WQE_LSOBUF; /* * Copy the larger-than-id_tx_buf_sz packet into a set of * fixed-sized, pre-mapped LSO buffers. Note that we might * need to skip part of the LSO header in the first fragment * as before. 
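 * To give a rough sense of scale (illustrative numbers only; the real
 * figure is whatever IBD_LSO_BUFSZ is defined to be): with an 8 KB LSO
 * buffer, a 60 KB LSO payload would be spread across eight buffers and
 * hence eight SGL entries in w_swr.wr_sgl, with only the first copy
 * offset by the residual header bytes tracked in pending_hdr.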
*/ nmp = data_mp; skip = pending_hdr; for (i = 0; i < node->w_swr.wr_nds; i++) { sgl = node->w_swr.wr_sgl + i; bufp = (uchar_t *)(uintptr_t)sgl->ds_va; avail = IBD_LSO_BUFSZ; while (nmp && avail) { blksize = MBLKL(nmp) - skip; if (blksize > avail) { bcopy(nmp->b_rptr + skip, bufp, avail); skip += avail; avail = 0; } else { bcopy(nmp->b_rptr + skip, bufp, blksize); skip = 0; avail -= blksize; bufp += blksize; nmp = nmp->b_cont; } } } return (0); } /* * Schedule a completion queue polling to reap the resource we're * short on. If we implement the change to reap tx completions * in a separate thread, we'll need to wake up that thread here. */ static int ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) { ibd_req_t *req; mutex_enter(&state->id_sched_lock); state->id_sched_needed |= resource_type; mutex_exit(&state->id_sched_lock); /* * If we are asked to queue a work entry, we need to do it */ if (q_flag) { req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) return (-1); ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); } return (0); } /* * The passed in packet has this format: * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data */ static boolean_t ibd_send(ibd_state_t *state, mblk_t *mp) { ibd_ace_t *ace; ibd_swqe_t *node; ipoib_mac_t *dest; ib_header_info_t *ipibp; ip6_t *ip6h; uint_t pktsize; uint32_t mss; uint32_t hckflags; uint32_t lsoflags = 0; uint_t lsohdr_sz = 0; int ret, len; boolean_t dofree = B_FALSE; boolean_t rc; /* if (rc_chan == NULL) send by UD; else send by RC; */ ibd_rc_chan_t *rc_chan; int nmblks; mblk_t *nmp; /* * If we aren't done with the device initialization and start, * we shouldn't be here. */ if ((state->id_mac_state & IBD_DRV_STARTED) == 0) return (B_FALSE); /* * Obtain an address handle for the destination. 
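 * In broad strokes, the path selection below works as follows: the
 * destination MAC is looked up in the address cache via
 * ibd_acache_lookup(); if RC mode is enabled, the destination is
 * unicast and its RC channel is already in the ACT_ESTAB state, a send
 * wqe is taken from that channel's private tx_wqe_list and the packet
 * goes out over the reliable connection.  Otherwise (no ACE yet, no
 * established channel, or a multicast destination) the packet falls
 * back to the shared UD swqe pool via ibd_acquire_swqe().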
*/ ipibp = (ib_header_info_t *)mp->b_rptr; dest = (ipoib_mac_t *)&ipibp->ib_dst; if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); rc_chan = NULL; ace = ibd_acache_lookup(state, dest, &ret, 1); if (state->id_enable_rc && (ace != NULL) && (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { if (ace->ac_chan == NULL) { state->rc_null_conn++; } else { if (ace->ac_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) { rc_chan = ace->ac_chan; rc_chan->is_used = B_TRUE; mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); node = WQE_TO_SWQE( rc_chan->tx_wqe_list.dl_head); if (node != NULL) { rc_chan->tx_wqe_list.dl_cnt -= 1; rc_chan->tx_wqe_list.dl_head = node->swqe_next; } else { node = ibd_rc_acquire_swqes(rc_chan); } mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); if (node == NULL) { state->rc_swqe_short++; mutex_enter(&state->id_sched_lock); state->id_sched_needed |= IBD_RSRC_RC_SWQE; mutex_exit(&state->id_sched_lock); ibd_dec_ref_ace(state, ace); return (B_FALSE); } } else { state->rc_no_estab_conn++; } } } if (rc_chan == NULL) { mutex_enter(&state->id_tx_list.dl_mutex); node = WQE_TO_SWQE(state->id_tx_list.dl_head); if (node != NULL) { state->id_tx_list.dl_cnt -= 1; state->id_tx_list.dl_head = node->swqe_next; } else { node = ibd_acquire_swqe(state); } mutex_exit(&state->id_tx_list.dl_mutex); if (node == NULL) { /* * If we don't have an swqe available, schedule a * transmit completion queue cleanup and hold off on * sending more packets until we have some free swqes */ if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { if (ace != NULL) { ibd_dec_ref_ace(state, ace); } return (B_FALSE); } /* * If a poll cannot be scheduled, we have no choice but * to drop this packet */ ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); if (ace != NULL) { ibd_dec_ref_ace(state, ace); } return (B_TRUE); } } /* * Initialize the commonly used fields in swqe to NULL to protect * against ibd_tx_cleanup accidentally misinterpreting these on a * failure. */ node->swqe_im_mblk = NULL; node->w_swr.wr_nds = 0; node->w_swr.wr_sgl = NULL; node->w_swr.wr_opcode = IBT_WRC_SEND; /* * Calculate the size of message data and number of msg blocks */ pktsize = 0; for (nmblks = 0, nmp = mp; nmp != NULL; nmp = nmp->b_cont, nmblks++) { pktsize += MBLKL(nmp); } if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) atomic_inc_64(&state->id_brd_xmt); else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) atomic_inc_64(&state->id_multi_xmt); if (ace != NULL) { node->w_ahandle = ace; node->w_swr.wr.ud.udwr_dest = ace->ac_dest; } else { DPRINT(5, "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", ((ret == EFAULT) ? "failed" : "queued"), htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), htonl(dest->ipoib_gidpref[1]), htonl(dest->ipoib_gidsuff[0]), htonl(dest->ipoib_gidsuff[1])); state->rc_ace_not_found++; node->w_ahandle = NULL; /* * Here if ibd_acache_lookup() returns EFAULT, it means ibd * can not find a path for the specific dest address. We * should get rid of this kind of packet. We also should get * rid of the packet if we cannot schedule a poll via the * async thread. For the normal case, ibd will return the * packet to upper layer and wait for AH creating. * * Note that we always queue a work slot entry for the async * thread when we fail AH lookup (even in intr mode); this is * due to the convoluted way the code currently looks for AH. 
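 * The (dofree, rc) pair set below encodes the contract honored at the
 * ibd_send_fail label: rc == B_TRUE means the packet is considered
 * consumed (freed on the spot when dofree is B_TRUE, otherwise already
 * handed off to the async thread), while rc == B_FALSE tells
 * ibd_m_tx() to hold on to the mblk and retry after the driver signals
 * resource availability through mac_tx_update().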
*/ if (ret == EFAULT) { dofree = B_TRUE; rc = B_TRUE; } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { dofree = B_TRUE; rc = B_TRUE; } else { dofree = B_FALSE; rc = B_FALSE; } goto ibd_send_fail; } /* * For ND6 packets, padding is at the front of the source lladdr. * Insert the padding at front. */ if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { if (!pullupmsg(mp, IPV6_HDR_LEN + sizeof (ib_header_info_t))) { DPRINT(10, "ibd_send: pullupmsg failure "); dofree = B_TRUE; rc = B_TRUE; goto ibd_send_fail; } ipibp = (ib_header_info_t *)mp->b_rptr; } ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ib_header_info_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { mblk_t *pad; pad = allocb(4, 0); pad->b_wptr = (uchar_t *)pad->b_rptr + 4; linkb(mp, pad); if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN + len + 4) { if (!pullupmsg(mp, sizeof (ib_header_info_t) + IPV6_HDR_LEN + len + 4)) { DPRINT(10, "ibd_send: pullupmsg " "failure "); dofree = B_TRUE; rc = B_TRUE; goto ibd_send_fail; } ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + sizeof (ib_header_info_t)); } /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_SEND); } } ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); mp->b_rptr += sizeof (ib_addrs_t); pktsize -= sizeof (ib_addrs_t); if (rc_chan) { /* send in RC mode */ ibt_iov_t iov_arr[IBD_MAX_SQSEG]; ibt_iov_attr_t iov_attr; uint_t i; size_t blksize; uchar_t *bufp; ibd_rc_tx_largebuf_t *lbufp; atomic_add_64(&state->rc_xmt_bytes, pktsize); /* * Upper layer does Tx checksum, we don't need do any * checksum here. */ ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); /* * We only do ibt_map_mem_iov() if the pktsize is above * the "copy-threshold", and if the number of mp * fragments is less than the maximum acceptable. */ if (pktsize <= state->id_rc_tx_copy_thresh) { atomic_inc_64(&state->rc_xmt_small_pkt); /* * Only process unicast packet in Reliable Connected * mode. 
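 * (Packets at or below id_rc_tx_copy_thresh are simply bcopy'd into
 * the swqe's pre-registered copybuf and the mblk freed immediately;
 * larger packets are either iov-mapped in place or staged through a
 * pre-registered Tx large buffer further below.)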
*/ node->swqe_copybuf.ic_sgl.ds_len = pktsize; node->w_swr.wr_nds = 1; node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; node->w_buftype = IBD_WQE_TXBUF; bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { blksize = MBLKL(nmp); bcopy(nmp->b_rptr, bufp, blksize); bufp += blksize; } freemsg(mp); ASSERT(node->swqe_im_mblk == NULL); } else { if ((state->rc_enable_iov_map) && (nmblks < state->rc_max_sqseg_hiwm)) { /* do ibt_map_mem_iov() */ iov_attr.iov_as = NULL; iov_attr.iov = iov_arr; iov_attr.iov_buf = NULL; iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; iov_attr.iov_lso_hdr_sz = 0; iov_attr.iov_flags = IBT_IOV_SLEEP; i = 0; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { iov_arr[i].iov_len = MBLKL(nmp); if (iov_arr[i].iov_len != 0) { iov_arr[i].iov_addr = (caddr_t) (void *)nmp->b_rptr; i++; } } iov_attr.iov_list_len = i; node->w_swr.wr_sgl = node->w_sgl; ret = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); if (ret != IBT_SUCCESS) { atomic_inc_64( &state->rc_xmt_map_fail_pkt); DPRINT(30, "ibd_send: ibt_map_mem_iov(" ") failed, nmblks=%d, real_nmblks" "=%d, ret=0x%x", nmblks, i, ret); goto ibd_rc_large_copy; } atomic_inc_64(&state->rc_xmt_map_succ_pkt); node->w_buftype = IBD_WQE_MAPPED; node->swqe_im_mblk = mp; } else { atomic_inc_64(&state->rc_xmt_fragmented_pkt); ibd_rc_large_copy: mutex_enter(&state->rc_tx_large_bufs_lock); if (state->rc_tx_largebuf_nfree == 0) { state->rc_xmt_buf_short++; mutex_exit (&state->rc_tx_large_bufs_lock); mutex_enter(&state->id_sched_lock); state->id_sched_needed |= IBD_RSRC_RC_TX_LARGEBUF; mutex_exit(&state->id_sched_lock); dofree = B_FALSE; rc = B_FALSE; /* * If we don't have Tx large bufs, * return failure. node->w_buftype * should not be IBD_WQE_RC_COPYBUF, * otherwise it will cause problem * in ibd_rc_tx_cleanup() */ node->w_buftype = IBD_WQE_TXBUF; goto ibd_send_fail; } lbufp = state->rc_tx_largebuf_free_head; ASSERT(lbufp->lb_buf != NULL); state->rc_tx_largebuf_free_head = lbufp->lb_next; lbufp->lb_next = NULL; /* Update nfree count */ state->rc_tx_largebuf_nfree --; mutex_exit(&state->rc_tx_large_bufs_lock); bufp = lbufp->lb_buf; node->w_sgl[0].ds_va = (ib_vaddr_t)(uintptr_t)bufp; node->w_sgl[0].ds_key = state->rc_tx_mr_desc.md_lkey; node->w_sgl[0].ds_len = pktsize; node->w_swr.wr_sgl = node->w_sgl; node->w_swr.wr_nds = 1; node->w_buftype = IBD_WQE_RC_COPYBUF; node->w_rc_tx_largebuf = lbufp; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { blksize = MBLKL(nmp); if (blksize != 0) { bcopy(nmp->b_rptr, bufp, blksize); bufp += blksize; } } freemsg(mp); ASSERT(node->swqe_im_mblk == NULL); } } node->swqe_next = NULL; mutex_enter(&rc_chan->tx_post_lock); if (rc_chan->tx_busy) { if (rc_chan->tx_head) { rc_chan->tx_tail->swqe_next = SWQE_TO_WQE(node); } else { rc_chan->tx_head = node; } rc_chan->tx_tail = node; mutex_exit(&rc_chan->tx_post_lock); } else { rc_chan->tx_busy = 1; mutex_exit(&rc_chan->tx_post_lock); ibd_rc_post_send(rc_chan, node); } return (B_TRUE); } /* send by RC */ if ((state->id_enable_rc) && (pktsize > state->id_mtu)) { /* * Too long pktsize. 
The packet size from GLD should <= * state->id_mtu + sizeof (ib_addrs_t) */ if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) { ibd_req_t *req; mutex_enter(&ace->tx_too_big_mutex); if (ace->tx_too_big_ongoing) { mutex_exit(&ace->tx_too_big_mutex); state->rc_xmt_reenter_too_long_pkt++; dofree = B_TRUE; } else { ace->tx_too_big_ongoing = B_TRUE; mutex_exit(&ace->tx_too_big_mutex); state->rc_xmt_icmp_too_long_pkt++; req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) { ibd_print_warn(state, "ibd_send: alloc " "ibd_req_t fail"); /* Drop it. */ dofree = B_TRUE; } else { req->rq_ptr = mp; req->rq_ptr2 = ace; ibd_queue_work_slot(state, req, IBD_ASYNC_RC_TOO_BIG); dofree = B_FALSE; } } } else { ibd_print_warn(state, "Reliable Connected mode is on. " "Multicast packet length %d > %d is too long to " "send packet (%d > %d), drop it", pktsize, state->id_mtu); state->rc_xmt_drop_too_long_pkt++; /* Drop it. */ dofree = B_TRUE; } rc = B_TRUE; goto ibd_send_fail; } atomic_add_64(&state->id_xmt_bytes, pktsize); atomic_inc_64(&state->id_xmt_pkt); /* * Do LSO and checksum related work here. For LSO send, adjust the * ud destination, the opcode and the LSO header information to the * work request. */ mac_lso_get(mp, &mss, &lsoflags); if ((lsoflags & HW_LSO) != HW_LSO) { node->w_swr.wr_opcode = IBT_WRC_SEND; lsohdr_sz = 0; } else { if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { /* * The routine can only fail if there's no memory; we * can only drop the packet if this happens */ ibd_print_warn(state, "ibd_send: no memory, lso posting failed"); dofree = B_TRUE; rc = B_TRUE; goto ibd_send_fail; } node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; } mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; else node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; /* * Prepare the sgl for posting; the routine can only fail if there's * no lso buf available for posting. If this is the case, we should * probably resched for lso bufs to become available and then try again. */ if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { dofree = B_TRUE; rc = B_TRUE; } else { dofree = B_FALSE; rc = B_FALSE; } goto ibd_send_fail; } node->swqe_im_mblk = mp; /* * Queue the wqe to hardware; since we can now simply queue a * post instead of doing it serially, we cannot assume anything * about the 'node' after ibd_post_send() returns. */ node->swqe_next = NULL; mutex_enter(&state->id_txpost_lock); if (state->id_tx_busy) { if (state->id_tx_head) { state->id_tx_tail->swqe_next = SWQE_TO_WQE(node); } else { state->id_tx_head = node; } state->id_tx_tail = node; mutex_exit(&state->id_txpost_lock); } else { state->id_tx_busy = 1; mutex_exit(&state->id_txpost_lock); ibd_post_send(state, node); } return (B_TRUE); ibd_send_fail: if (node && mp) ibd_free_lsohdr(node, mp); if (dofree) freemsg(mp); if (node != NULL) { if (rc_chan) { ibd_rc_tx_cleanup(node); } else { ibd_tx_cleanup(state, node); } } return (rc); } /* * GLDv3 entry point for transmitting datagram. 
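 * The return convention follows the usual GLDv3 mc_tx contract: a NULL
 * return means the entire chain was consumed, while a non-NULL return
 * hands the untransmitted remainder back to the MAC layer, which will
 * re-present it once the driver signals readiness via mac_tx_update().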
*/ static mblk_t * ibd_m_tx(void *arg, mblk_t *mp) { ibd_state_t *state = (ibd_state_t *)arg; mblk_t *next; if (state->id_type == IBD_PORT_DRIVER) { freemsgchain(mp); return (NULL); } if ((state->id_link_state != LINK_STATE_UP) || !(state->id_mac_state & IBD_DRV_STARTED)) { freemsgchain(mp); mp = NULL; } while (mp != NULL) { next = mp->b_next; mp->b_next = NULL; if (ibd_send(state, mp) == B_FALSE) { /* Send fail */ mp->b_next = next; break; } mp = next; } return (mp); } /* * this handles Tx and Rx completions. With separate CQs, this handles * only Rx completions. */ static uint_t ibd_intr(caddr_t arg) { ibd_state_t *state = (ibd_state_t *)arg; ibd_poll_rcq(state, state->id_rcq_hdl); return (DDI_INTR_CLAIMED); } /* * Poll and fully drain the send cq */ static void ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { ibt_wc_t *wcs = state->id_txwcs; uint_t numwcs = state->id_txwcs_size; ibd_wqe_t *wqe; ibd_swqe_t *head, *tail; ibt_wc_t *wc; uint_t num_polled; int i; while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { head = tail = NULL; for (i = 0, wc = wcs; i < num_polled; i++, wc++) { wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; if (wc->wc_status != IBT_WC_SUCCESS) { /* * Channel being torn down. */ if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { DPRINT(5, "ibd_drain_scq: flush error"); DPRINT(10, "ibd_drain_scq: Bad " "status %d", wc->wc_status); } else { DPRINT(10, "ibd_drain_scq: " "unexpected wc_status %d", wc->wc_status); } /* * Fallthrough to invoke the Tx handler to * release held resources, e.g., AH refcount. */ } /* * Add this swqe to the list to be cleaned up. */ if (head) tail->swqe_next = wqe; else head = WQE_TO_SWQE(wqe); tail = WQE_TO_SWQE(wqe); } tail->swqe_next = NULL; ibd_tx_cleanup_list(state, head, tail); /* * Resume any blocked transmissions if possible */ ibd_resume_transmission(state); } } /* * Poll and fully drain the receive cq */ static void ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { ibt_wc_t *wcs = state->id_rxwcs; uint_t numwcs = state->id_rxwcs_size; ibd_rwqe_t *rwqe; ibt_wc_t *wc; uint_t num_polled; int i; mblk_t *head, *tail, *mp; while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { head = tail = NULL; for (i = 0, wc = wcs; i < num_polled; i++, wc++) { rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; if (wc->wc_status != IBT_WC_SUCCESS) { /* * Channel being torn down. */ if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { DPRINT(5, "ibd_drain_rcq: " "expected flushed rwqe"); } else { DPRINT(5, "ibd_drain_rcq: " "unexpected wc_status %d", wc->wc_status); } atomic_inc_32( &state->id_rx_list.dl_bufs_outstanding); freemsg(rwqe->rwqe_im_mblk); continue; } mp = ibd_process_rx(state, rwqe, wc); if (mp == NULL) continue; /* * Add this mp to the list to send to the nw layer. */ if (head) tail->b_next = mp; else head = mp; tail = mp; } if (head) mac_rx(state->id_mh, state->id_rh, head); /* * Account for #rwqes polled. * Post more here, if less than one fourth full. */ if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < (state->id_ud_num_rwqe / 4)) ibd_post_recv_intr(state); } } /* * Common code for interrupt handling as well as for polling * for all completed wqe's while detaching. 
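 * The notify/redrain dance used by ibd_poll_scq() and ibd_poll_rcq()
 * boils down to the following (a rough sketch, not literal code):
 *
 *	drain(cq);
 *	do {
 *		ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
 *		drain(cq);	-- catch entries that raced the re-arm
 *	} while (another thread asked for a redo);
 *
 * The REDO flag lets a second caller that found the CQ already being
 * polled ask the current poller to go around once more rather than
 * blocking on it.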
*/ static void ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { int flag, redo_flag; int redo = 1; flag = IBD_CQ_POLLING; redo_flag = IBD_REDO_CQ_POLLING; mutex_enter(&state->id_scq_poll_lock); if (state->id_scq_poll_busy & flag) { ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); state->id_scq_poll_busy |= redo_flag; mutex_exit(&state->id_scq_poll_lock); return; } state->id_scq_poll_busy |= flag; mutex_exit(&state->id_scq_poll_lock); /* * In some cases (eg detaching), this code can be invoked on * any cpu after disabling cq notification (thus no concurrency * exists). Apart from that, the following applies normally: * Transmit completion handling could be from any cpu if * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ * is interrupt driven. */ /* * Poll and drain the CQ */ ibd_drain_scq(state, cq_hdl); /* * Enable CQ notifications and redrain the cq to catch any * completions we might have missed after the ibd_drain_scq() * above and before the ibt_enable_cq_notify() that follows. * Finally, service any new requests to poll the cq that * could've come in after the ibt_enable_cq_notify(). */ do { if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); } ibd_drain_scq(state, cq_hdl); mutex_enter(&state->id_scq_poll_lock); if (state->id_scq_poll_busy & redo_flag) state->id_scq_poll_busy &= ~redo_flag; else { state->id_scq_poll_busy &= ~flag; redo = 0; } mutex_exit(&state->id_scq_poll_lock); } while (redo); } /* * Common code for interrupt handling as well as for polling * for all completed wqe's while detaching. */ static void ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) { int flag, redo_flag; int redo = 1; flag = IBD_CQ_POLLING; redo_flag = IBD_REDO_CQ_POLLING; mutex_enter(&state->id_rcq_poll_lock); if (state->id_rcq_poll_busy & flag) { ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); state->id_rcq_poll_busy |= redo_flag; mutex_exit(&state->id_rcq_poll_lock); return; } state->id_rcq_poll_busy |= flag; mutex_exit(&state->id_rcq_poll_lock); /* * Poll and drain the CQ */ ibd_drain_rcq(state, rcq); /* * Enable CQ notifications and redrain the cq to catch any * completions we might have missed after the ibd_drain_cq() * above and before the ibt_enable_cq_notify() that follows. * Finally, service any new requests to poll the cq that * could've come in after the ibt_enable_cq_notify(). */ do { if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); } ibd_drain_rcq(state, rcq); mutex_enter(&state->id_rcq_poll_lock); if (state->id_rcq_poll_busy & redo_flag) state->id_rcq_poll_busy &= ~redo_flag; else { state->id_rcq_poll_busy &= ~flag; redo = 0; } mutex_exit(&state->id_rcq_poll_lock); } while (redo); } /* * Unmap the memory area associated with a given swqe. */ void ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) { ibt_status_t stat; DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); if (swqe->w_mi_hdl) { if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, swqe->w_mi_hdl)) != IBT_SUCCESS) { DPRINT(10, "failed in ibt_unmap_mem_iov, ret=%d\n", stat); } swqe->w_mi_hdl = NULL; } swqe->w_swr.wr_nds = 0; } void ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) { /* * The recycling logic can be eliminated from here * and put into the async thread if we create another * list to hold ACE's for unjoined mcg's. 
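 * To summarize the lifecycle handled below: an ACE moves from the
 * active list back to the free list only when its reference count
 * reaches zero with the recycle (CYCLE) bit set, and that transition
 * happens atomically under id_ac_mutex.  If the ACE is still tied to
 * an mce (a joined multicast group), a work request is queued so the
 * async thread can leave the group and destroy the mce before the AH
 * is reused.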
*/ if (DEC_REF_DO_CYCLE(ace)) { ibd_mce_t *mce; /* * Check with the lock taken: we decremented * reference count without the lock, and some * transmitter might already have bumped the * reference count (possible in case of multicast * disable when we leave the AH on the active * list). If not still 0, get out, leaving the * recycle bit intact. * * Atomically transition the AH from active * to free list, and queue a work request to * leave the group and destroy the mce. No * transmitter can be looking at the AH or * the MCE in between, since we have the * ac_mutex lock. In the SendOnly reap case, * it is not necessary to hold the ac_mutex * and recheck the ref count (since the AH was * taken off the active list), we just do it * to have uniform processing with the Full * reap case. */ mutex_enter(&state->id_ac_mutex); mce = ace->ac_mce; if (GET_REF_CYCLE(ace) == 0) { CLEAR_REFCYCLE(ace); /* * Identify the case of fullmember reap as * opposed to mcg trap reap. Also, port up * might set ac_mce to NULL to indicate Tx * cleanup should do no more than put the * AH in the free list (see ibd_async_link). */ if (mce != NULL) { ace->ac_mce = NULL; IBD_ACACHE_PULLOUT_ACTIVE(state, ace); /* * mc_req was initialized at mce * creation time. */ ibd_queue_work_slot(state, &mce->mc_req, IBD_ASYNC_REAP); } IBD_ACACHE_INSERT_FREE(state, ace); } mutex_exit(&state->id_ac_mutex); } } /* * Common code that deals with clean ups after a successful or * erroneous transmission attempt. */ static void ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) { ibd_ace_t *ace = swqe->w_ahandle; DPRINT(20, "ibd_tx_cleanup %p\n", swqe); /* * If this was a dynamic mapping in ibd_send(), we need to * unmap here. If this was an lso buffer we'd used for sending, * we need to release the lso buf to the pool, since the resource * is scarce. However, if this was simply a normal send using * the copybuf (present in each swqe), we don't need to release it. */ if (swqe->swqe_im_mblk != NULL) { if (swqe->w_buftype == IBD_WQE_MAPPED) { ibd_unmap_mem(state, swqe); } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { ibd_release_lsobufs(state, swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); } ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); freemsg(swqe->swqe_im_mblk); swqe->swqe_im_mblk = NULL; } /* * Drop the reference count on the AH; it can be reused * now for a different destination if there are no more * posted sends that will use it. This can be eliminated * if we can always associate each Tx buffer with an AH. * The ace can be null if we are cleaning up from the * ibd_send() error path. */ if (ace != NULL) { ibd_dec_ref_ace(state, ace); } /* * Release the send wqe for reuse. */ swqe->swqe_next = NULL; ibd_release_swqe(state, swqe, swqe, 1); } static void ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) { ibd_ace_t *ace; ibd_swqe_t *swqe; int n = 0; DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { /* * If this was a dynamic mapping in ibd_send(), we need to * unmap here. If this was an lso buffer we'd used for sending, * we need to release the lso buf to the pool, since the * resource is scarce. However, if this was simply a normal * send using the copybuf (present in each swqe), we don't need * to release it. 
*/ if (swqe->swqe_im_mblk != NULL) { if (swqe->w_buftype == IBD_WQE_MAPPED) { ibd_unmap_mem(state, swqe); } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { ibd_release_lsobufs(state, swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); } ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); freemsg(swqe->swqe_im_mblk); swqe->swqe_im_mblk = NULL; } /* * Drop the reference count on the AH; it can be reused * now for a different destination if there are no more * posted sends that will use it. This can be eliminated * if we can always associate each Tx buffer with an AH. * The ace can be null if we are cleaning up from the * ibd_send() error path. */ ace = swqe->w_ahandle; if (ace != NULL) { ibd_dec_ref_ace(state, ace); } n++; } /* * Release the send wqes for reuse. */ ibd_release_swqe(state, head, tail, n); } /* * Processing to be done after receipt of a packet; hand off to GLD * in the format expected by GLD. The received packet has this * format: 2b sap :: 00 :: data. */ static mblk_t * ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) { ib_header_info_t *phdr; mblk_t *mp; ipoib_hdr_t *ipibp; ipha_t *iphap; ip6_t *ip6h; int len; ib_msglen_t pkt_len = wc->wc_bytes_xfer; uint32_t bufs; /* * Track number handed to upper layer that need to be returned. */ bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); /* Never run out of rwqes, use allocb when running low */ if (bufs >= state->id_rx_bufs_outstanding_limit) { atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); atomic_inc_32(&state->id_rx_allocb); mp = allocb(pkt_len, BPRI_HI); if (mp) { bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); ibd_post_recv(state, rwqe); } else { /* no memory */ atomic_inc_32(&state->id_rx_allocb_failed); ibd_post_recv(state, rwqe); return (NULL); } } else { mp = rwqe->rwqe_im_mblk; } /* * Adjust write pointer depending on how much data came in. */ mp->b_wptr = mp->b_rptr + pkt_len; /* * Make sure this is NULL or we're in trouble. */ if (mp->b_next != NULL) { ibd_print_warn(state, "ibd_process_rx: got duplicate mp from rcq?"); mp->b_next = NULL; } /* * the IB link will deliver one of the IB link layer * headers called, the Global Routing Header (GRH). * ibd driver uses the information in GRH to build the * Header_info structure and pass it with the datagram up * to GLDv3. * If the GRH is not valid, indicate to GLDv3 by setting * the VerTcFlow field to 0. */ phdr = (ib_header_info_t *)mp->b_rptr; if (wc->wc_flags & IBT_WC_GRH_PRESENT) { phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); /* if it is loop back packet, just drop it. */ if (state->id_enable_rc) { if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->rc_macaddr_loopback, IPOIB_ADDRL) == 0) { freemsg(mp); return (NULL); } } else { if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, IPOIB_ADDRL) == 0) { freemsg(mp); return (NULL); } } ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, sizeof (ipoib_mac_t)); if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); } else { phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; } } else { /* * It can not be a IBA multicast packet. Must have been * unicast for us. Just copy the interface address to dst. */ phdr->ib_grh.ipoib_vertcflow = 0; ovbcopy(&state->id_macaddr, &phdr->ib_dst, sizeof (ipoib_mac_t)); } /* * For ND6 packets, padding is at the front of the source/target * lladdr. However the inet6 layer is not aware of it, hence remove * the padding from such packets. 
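 * The same IBD_PAD_NSNA() macro serves both directions: ibd_send()
 * invokes it with IBD_SEND to insert the pad before an NS/NA packet
 * goes out on the fabric, and the receive path below invokes it with
 * IBD_RECV to strip that pad again before the packet is passed up.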
*/ ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_RECV); } } /* * Update statistics */ atomic_add_64(&state->id_rcv_bytes, pkt_len); atomic_inc_64(&state->id_rcv_pkt); if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) atomic_inc_64(&state->id_brd_rcv); else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) atomic_inc_64(&state->id_multi_rcv); iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); /* * Set receive checksum status in mp * Hardware checksumming can be considered valid only if: * 1. CQE.IP_OK bit is set * 2. CQE.CKSUM = 0xffff * 3. IPv6 routing header is not present in the packet * 4. If there are no IP_OPTIONS in the IP HEADER */ if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && (wc->wc_cksum == 0xFFFF) && (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); } return (mp); } /* * Callback code invoked from STREAMs when the receive data buffer is * free for recycling. */ static void ibd_freemsg_cb(char *arg) { ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; ibd_state_t *state = rwqe->w_state; atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); /* * If the driver is stopped, just free the rwqe. */ if (atomic_add_32_nv(&state->id_running, 0) == 0) { DPRINT(6, "ibd_freemsg: wqe being freed"); rwqe->rwqe_im_mblk = NULL; ibd_free_rwqe(state, rwqe); return; } rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); if (rwqe->rwqe_im_mblk == NULL) { ibd_free_rwqe(state, rwqe); DPRINT(6, "ibd_freemsg: desballoc failed"); return; } ibd_post_recv(state, rwqe); } static uint_t ibd_tx_recycle(caddr_t arg) { ibd_state_t *state = (ibd_state_t *)arg; /* * Poll for completed entries */ ibd_poll_scq(state, state->id_scq_hdl); return (DDI_INTR_CLAIMED); } #ifdef IBD_LOGGING static void ibd_log_init(void) { ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); ibd_lbuf_ndx = 0; mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); } static void ibd_log_fini(void) { if (ibd_lbuf) kmem_free(ibd_lbuf, IBD_LOG_SZ); ibd_lbuf_ndx = 0; ibd_lbuf = NULL; mutex_destroy(&ibd_lbuf_lock); } static void ibd_log(const char *fmt, ...) 
{ va_list ap; uint32_t off; uint32_t msglen; char tmpbuf[IBD_DMAX_LINE]; if (ibd_lbuf == NULL) return; va_start(ap, fmt); msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); va_end(ap); if (msglen >= IBD_DMAX_LINE) msglen = IBD_DMAX_LINE - 1; mutex_enter(&ibd_lbuf_lock); off = ibd_lbuf_ndx; /* current msg should go here */ if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; ibd_lbuf_ndx += msglen; /* place where next msg should start */ ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) ibd_lbuf_ndx = 0; mutex_exit(&ibd_lbuf_lock); bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ } #endif /* ARGSUSED */ static int ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp, int *rvalp) { ibd_create_ioctl_t *cmd = karg; ibd_state_t *state, *port_state, *p; int i, err, rval = 0; mac_register_t *macp; ibt_hca_portinfo_t *pinfop = NULL; ibt_status_t ibt_status; uint_t psize, pinfosz; boolean_t force_create = B_FALSE; cmd->ibdioc.ioc_status = 0; if (cmd->ibdioc.ioc_port_inst < 0) { cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; return (EINVAL); } port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst); if (port_state == NULL) { DPRINT(10, "ibd_create_partition: failed to get state %d", cmd->ibdioc.ioc_port_inst); cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; return (EINVAL); } /* Limited PKeys not supported */ if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) { rval = EINVAL; goto part_create_return; } if (cmd->ioc_force_create == 0) { /* * Check if the port pkey table contains the pkey for which * this partition is being created. */ ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, port_state->id_port, &pinfop, &psize, &pinfosz); if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { rval = EINVAL; goto part_create_return; } if (pinfop->p_linkstate != IBT_PORT_ACTIVE) { rval = ENETDOWN; cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN; goto part_create_return; } for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) { if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) { break; } } if (i == pinfop->p_pkey_tbl_sz) { rval = EINVAL; cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT; goto part_create_return; } } else { force_create = B_TRUE; } mutex_enter(&ibd_objlist_lock); for (p = ibd_objlist_head; p; p = p->id_next) { if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) && (p->id_pkey == cmd->ioc_pkey) && (p->id_plinkid == cmd->ioc_partid)) { mutex_exit(&ibd_objlist_lock); rval = EEXIST; cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS; goto part_create_return; } } mutex_exit(&ibd_objlist_lock); state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP); state->id_type = IBD_PARTITION_OBJ; state->id_plinkid = cmd->ioc_partid; state->id_dlinkid = cmd->ibdioc.ioc_linkid; state->id_port_inst = cmd->ibdioc.ioc_port_inst; state->id_dip = port_state->id_dip; state->id_port = port_state->id_port; state->id_pkey = cmd->ioc_pkey; state->id_hca_guid = port_state->id_hca_guid; state->id_port_guid = port_state->id_port_guid; state->id_force_create = force_create; mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) { rval = EIO; cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE; goto fail; } if ((macp = mac_alloc(MAC_VERSION)) == NULL) { rval = EAGAIN; goto fail; } macp->m_type_ident = MAC_PLUGIN_IDENT_IB; macp->m_dip = port_state->id_dip; macp->m_instance = (uint_t)-1; macp->m_driver = 
state; macp->m_src_addr = (uint8_t *)&state->id_macaddr; macp->m_callbacks = &ibd_m_callbacks; macp->m_min_sdu = 0; macp->m_multicast_sdu = IBD_DEF_MAX_SDU; if (state->id_enable_rc) { macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; } else { macp->m_max_sdu = IBD_DEF_MAX_SDU; } macp->m_priv_props = ibd_priv_props; err = mac_register(macp, &state->id_mh); mac_free(macp); if (err != 0) { DPRINT(10, "ibd_create_partition: mac_register() failed %d", err); rval = err; goto fail; } err = dls_devnet_create(state->id_mh, cmd->ioc_partid, crgetzoneid(credp)); if (err != 0) { DPRINT(10, "ibd_create_partition: dls_devnet_create() failed " "%d", err); rval = err; (void) mac_unregister(state->id_mh); goto fail; } /* * Add the new partition state structure to the list */ mutex_enter(&ibd_objlist_lock); if (ibd_objlist_head) state->id_next = ibd_objlist_head; ibd_objlist_head = state; mutex_exit(&ibd_objlist_lock); part_create_return: if (pinfop) { ibt_free_portinfo(pinfop, pinfosz); } return (rval); fail: if (pinfop) { ibt_free_portinfo(pinfop, pinfosz); } ibd_part_unattach(state); kmem_free(state, sizeof (ibd_state_t)); return (rval); } /* ARGSUSED */ static int ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp, int *rvalp) { int err; datalink_id_t tmpid; ibd_state_t *node, *prev; ibd_delete_ioctl_t *cmd = karg; prev = NULL; mutex_enter(&ibd_objlist_lock); node = ibd_objlist_head; /* Find the ibd state structure corresponding to the partition */ while (node != NULL) { if (node->id_plinkid == cmd->ioc_partid) break; prev = node; node = node->id_next; } if (node == NULL) { mutex_exit(&ibd_objlist_lock); return (ENOENT); } if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) { DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed " "%d", err); mutex_exit(&ibd_objlist_lock); return (err); } /* * Call ibd_part_unattach() only after making sure that the instance has * not been started yet and is also not in late hca init mode. */ ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); err = 0; if ((node->id_mac_state & IBD_DRV_STARTED) || (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) || (ibd_part_busy(node) != DDI_SUCCESS) || ((err = mac_disable(node->id_mh)) != 0)) { (void) dls_devnet_create(node->id_mh, cmd->ioc_partid, crgetzoneid(credp)); ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); mutex_exit(&ibd_objlist_lock); return (err != 0 ? 
err : EBUSY); } node->id_mac_state |= IBD_DRV_IN_DELETION; ibd_part_unattach(node); ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); /* Remove the partition state structure from the linked list */ if (prev == NULL) ibd_objlist_head = node->id_next; else prev->id_next = node->id_next; mutex_exit(&ibd_objlist_lock); if ((err = mac_unregister(node->id_mh)) != 0) { DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d", err); } cv_destroy(&node->id_macst_cv); mutex_destroy(&node->id_macst_lock); kmem_free(node, sizeof (ibd_state_t)); return (0); } /* ARGSUSED */ static int ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { ibd_ioctl_t cmd; ibpart_ioctl_t partioc; ibport_ioctl_t portioc; #ifdef _MULTI_DATAMODEL ibport_ioctl32_t portioc32; #endif ibd_state_t *state, *port_state; int size; ibt_hca_portinfo_t *pinfop = NULL; ibt_status_t ibt_status; uint_t psize, pinfosz; int rval = 0; size = sizeof (ibd_ioctl_t); if (ddi_copyin((void *)arg, &cmd, size, mode)) { return (EFAULT); } cmd.ioc_status = 0; switch (cmd.ioc_info_cmd) { case IBD_INFO_CMD_IBPART: size = sizeof (ibpart_ioctl_t); if (ddi_copyin((void *)arg, &partioc, size, mode)) { return (EFAULT); } mutex_enter(&ibd_objlist_lock); /* Find the ibd state structure corresponding the partition */ for (state = ibd_objlist_head; state; state = state->id_next) { if (state->id_plinkid == cmd.ioc_linkid) { break; } } if (state == NULL) { mutex_exit(&ibd_objlist_lock); return (ENOENT); } partioc.ibdioc.ioc_linkid = state->id_dlinkid; partioc.ibdioc.ioc_port_inst = state->id_port_inst; partioc.ibdioc.ioc_portnum = state->id_port; partioc.ibdioc.ioc_hcaguid = state->id_hca_guid; partioc.ibdioc.ioc_portguid = state->id_port_guid; partioc.ibdioc.ioc_status = 0; partioc.ioc_partid = state->id_plinkid; partioc.ioc_pkey = state->id_pkey; partioc.ioc_force_create = state->id_force_create; if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) { mutex_exit(&ibd_objlist_lock); return (EFAULT); } mutex_exit(&ibd_objlist_lock); break; case IBD_INFO_CMD_IBPORT: if ((cmd.ioc_port_inst < 0) || ((port_state = ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { DPRINT(10, "ibd_create_partition: failed to get" " state %d", cmd.ioc_port_inst); size = sizeof (ibd_ioctl_t); cmd.ioc_status = IBD_INVALID_PORT_INST; if (ddi_copyout((void *)&cmd, (void *)arg, size, mode)) { return (EFAULT); } return (EINVAL); } ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, port_state->id_port, &pinfop, &psize, &pinfosz); if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { return (EINVAL); } #ifdef _MULTI_DATAMODEL switch (ddi_model_convert_from(mode & FMODELS)) { case DDI_MODEL_ILP32: { size = sizeof (ibport_ioctl32_t); if (ddi_copyin((void *)arg, &portioc32, size, mode)) { rval = EFAULT; goto fail; } portioc32.ibdioc.ioc_status = 0; portioc32.ibdioc.ioc_portnum = port_state->id_port; portioc32.ibdioc.ioc_hcaguid = port_state->id_hca_guid; portioc32.ibdioc.ioc_portguid = port_state->id_port_guid; if (portioc32.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { rval = EINVAL; size = sizeof (ibd_ioctl_t); portioc32.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; if (ddi_copyout((void *)&portioc32.ibdioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } goto fail; } size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); if (ddi_copyout((void *)pinfop->p_pkey_tbl, (void *)(uintptr_t)portioc32.ioc_pkeys, size, mode)) { rval = EFAULT; goto fail; } size = sizeof (ibport_ioctl32_t); if (ddi_copyout((void *)&portioc32, (void *)arg, 
size, mode)) { rval = EFAULT; goto fail; } break; } case DDI_MODEL_NONE: size = sizeof (ibport_ioctl_t); if (ddi_copyin((void *)arg, &portioc, size, mode)) { rval = EFAULT; goto fail; } portioc.ibdioc.ioc_status = 0; portioc.ibdioc.ioc_portnum = port_state->id_port; portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; portioc.ibdioc.ioc_portguid = port_state->id_port_guid; if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { rval = EINVAL; size = sizeof (ibd_ioctl_t); portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } goto fail; } size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); if (ddi_copyout((void *)pinfop->p_pkey_tbl, (void *)(portioc.ioc_pkeys), size, mode)) { rval = EFAULT; goto fail; } size = sizeof (ibport_ioctl_t); if (ddi_copyout((void *)&portioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } break; } #else /* ! _MULTI_DATAMODEL */ size = sizeof (ibport_ioctl_t); if (ddi_copyin((void *)arg, &portioc, size, mode)) { rval = EFAULT; goto fail; } portioc.ibdioc.ioc_status = 0; portioc.ibdioc.ioc_portnum = port_state->id_port; portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; portioc.ibdioc.ioc_portguid = port_state->id_port_guid; if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { rval = EINVAL; size = sizeof (ibd_ioctl_t); portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } goto fail; } size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); if (ddi_copyout((void *)pinfop->p_pkey_tbl, (void *)(portioc.ioc_pkeys), size, mode)) { rval = EFAULT; goto fail; } size = sizeof (ibport_ioctl_t); if (ddi_copyout((void *)&portioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } #endif /* _MULTI_DATAMODEL */ break; case IBD_INFO_CMD_PKEYTBLSZ: if ((cmd.ioc_port_inst < 0) || ((port_state = ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { DPRINT(10, "ibd_create_partition: failed to get" " state %d", cmd.ioc_port_inst); size = sizeof (ibd_ioctl_t); cmd.ioc_status = IBD_INVALID_PORT_INST; if (ddi_copyout((void *)&cmd, (void *)arg, size, mode)) { return (EFAULT); } return (EINVAL); } ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, port_state->id_port, &pinfop, &psize, &pinfosz); if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { return (EINVAL); } #ifdef _MULTI_DATAMODEL switch (ddi_model_convert_from(mode & FMODELS)) { case DDI_MODEL_ILP32: { size = sizeof (ibport_ioctl32_t); if (ddi_copyin((void *)arg, &portioc32, size, mode)) { rval = EFAULT; goto fail; } portioc32.ibdioc.ioc_status = 0; portioc32.ibdioc.ioc_portnum = port_state->id_port; portioc32.ibdioc.ioc_hcaguid = port_state->id_hca_guid; portioc32.ibdioc.ioc_portguid = port_state->id_port_guid; portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; if (ddi_copyout((void *)&portioc32, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } break; } case DDI_MODEL_NONE: size = sizeof (ibport_ioctl_t); if (ddi_copyin((void *)arg, &portioc, size, mode)) { rval = EFAULT; goto fail; } portioc.ibdioc.ioc_status = 0; portioc.ibdioc.ioc_portnum = port_state->id_port; portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; portioc.ibdioc.ioc_portguid = port_state->id_port_guid; portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; if (ddi_copyout((void *)&portioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } break; } #else /* ! 
_MULTI_DATAMODEL */ size = sizeof (ibport_ioctl_t); if (ddi_copyin((void *)arg, &portioc, size, mode)) { rval = EFAULT; goto fail; } portioc.ibdioc.ioc_status = 0; portioc.ibdioc.ioc_portnum = port_state->id_port; portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; portioc.ibdioc.ioc_portguid = port_state->id_port_guid; portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; if (ddi_copyout((void *)&portioc, (void *)arg, size, mode)) { rval = EFAULT; goto fail; } #endif /* _MULTI_DATAMODEL */ break; default: return (EINVAL); } /* switch (cmd.ioc_info_cmd) */ fail: if (pinfop) { ibt_free_portinfo(pinfop, pinfosz); } return (rval); } /* ARGSUSED */ static void ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl, ibt_async_code_t code, ibt_async_event_t *event) { ibd_state_t *state = (ibd_state_t *)arg; link_state_t lstate; switch (code) { case IBT_EVENT_PORT_UP: case IBT_ERROR_PORT_DOWN: if (ibd_get_port_state(state, &lstate) != 0) break; if (state->id_link_state != lstate) { state->id_link_state = lstate; mac_link_update(state->id_mh, lstate); } break; default: break; } } static int ibd_get_port_state(ibd_state_t *state, link_state_t *lstate) { ibt_hca_portinfo_t *port_infop; uint_t psize, port_infosz; ibt_status_t ret; ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, &port_infop, &psize, &port_infosz); if ((ret != IBT_SUCCESS) || (psize != 1)) return (-1); state->id_sgid = *port_infop->p_sgid_tbl; state->id_link_speed = ibd_get_portspeed(state); if (port_infop->p_linkstate == IBT_PORT_ACTIVE) *lstate = LINK_STATE_UP; else *lstate = LINK_STATE_DOWN; ibt_free_portinfo(port_infop, port_infosz); return (0); } static int ibd_port_attach(dev_info_t *dip) { ibd_state_t *state; link_state_t lstate; int instance; ibt_status_t ret; /* * Allocate softstate structure */ instance = ddi_get_instance(dip); if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) { DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed"); return (DDI_FAILURE); } state = ddi_get_soft_state(ibd_list, instance); state->id_dip = dip; state->id_type = IBD_PORT_DRIVER; if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, "port-number", 0)) == 0) { DPRINT(10, "ibd_port_attach: invalid port number (%d)", state->id_port); return (DDI_FAILURE); } if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, "hca-guid", 0)) == 0) { DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)", state->id_hca_guid); return (DDI_FAILURE); } if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, "port-guid", 0)) == 0) { DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)", state->id_port_guid); return (DDI_FAILURE); } /* * Attach to IBTL */ if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state, &state->id_ibt_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d", ret); goto done; } state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, &state->id_hca_hdl)) != IBT_SUCCESS) { DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d", ret); goto done; } state->id_mac_state |= IBD_DRV_HCA_OPENED; /* Update link status */ if (ibd_get_port_state(state, &lstate) != 0) { DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d", ret); goto done; } state->id_link_state = lstate; /* * Register ibd interfaces with the Nemo framework */ if (ibd_register_mac(state, dip) != IBT_SUCCESS) { DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()"); goto done; } state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 
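	/*
	 * Each step completed above leaves a progress flag in
	 * id_mac_state (IBTL attach, HCA open, mac registration).
	 * ibd_port_unattach() below consults those same flags so that a
	 * partially completed attach is unwound in reverse order,
	 * releasing only what was actually acquired.
	 */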
mac_link_update(state->id_mh, lstate); return (DDI_SUCCESS); done: (void) ibd_port_unattach(state, dip); return (DDI_FAILURE); } static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip) { int instance; uint32_t progress = state->id_mac_state; ibt_status_t ret; if (progress & IBD_DRV_MAC_REGISTERED) { (void) mac_unregister(state->id_mh); state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); } if (progress & IBD_DRV_HCA_OPENED) { if ((ret = ibt_close_hca(state->id_hca_hdl)) != IBT_SUCCESS) { ibd_print_warn(state, "failed to close " "HCA device, ret=%d", ret); } state->id_hca_hdl = NULL; state->id_mac_state &= (~IBD_DRV_HCA_OPENED); } if (progress & IBD_DRV_IBTL_ATTACH_DONE) { if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { ibd_print_warn(state, "ibt_detach() failed, ret=%d", ret); } state->id_ibt_hdl = NULL; state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); } instance = ddi_get_instance(dip); ddi_soft_state_free(ibd_list, instance); return (DDI_SUCCESS); } ibt_status_t ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr) { ibd_state_t *state; mutex_enter(&ibd_objlist_lock); /* Find the ibd state structure corresponding the partition */ for (state = ibd_objlist_head; state; state = state->id_next) { if (state->id_plinkid == linkid) { break; } } if (state == NULL) { mutex_exit(&ibd_objlist_lock); return (IBT_NO_SUCH_OBJECT); } attr->pa_dlinkid = state->id_dlinkid; attr->pa_plinkid = state->id_plinkid; attr->pa_port = state->id_port; attr->pa_hca_guid = state->id_hca_guid; attr->pa_port_guid = state->id_port_guid; attr->pa_pkey = state->id_pkey; mutex_exit(&ibd_objlist_lock); return (IBT_SUCCESS); } ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts) { ibd_state_t *state; int n = 0; ibt_part_attr_t *attr; mutex_enter(&ibd_objlist_lock); for (state = ibd_objlist_head; state; state = state->id_next) n++; *nparts = n; if (n == 0) { *attr_list = NULL; mutex_exit(&ibd_objlist_lock); return (IBT_SUCCESS); } *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP); attr = *attr_list; for (state = ibd_objlist_head; state; state = state->id_next) { #ifdef DEBUG ASSERT(n > 0); n--; #endif attr->pa_dlinkid = state->id_dlinkid; attr->pa_plinkid = state->id_plinkid; attr->pa_port = state->id_port; attr->pa_hca_guid = state->id_hca_guid; attr->pa_port_guid = state->id_port_guid; attr->pa_pkey = state->id_pkey; attr++; } mutex_exit(&ibd_objlist_lock); return (IBT_SUCCESS); }