/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Port add/deletion/etc routines */ static void vsw_port_delete(vsw_port_t *port); static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id); static void vsw_ldc_detach(vsw_ldc_t *ldcp); static int vsw_ldc_init(vsw_ldc_t *ldcp); static void vsw_ldc_uninit(vsw_ldc_t *ldcp); static void vsw_ldc_drain(vsw_ldc_t *ldcp); static void vsw_drain_port_taskq(vsw_port_t *port); static void vsw_marker_task(void *); static int vsw_plist_del_node(vsw_t *, vsw_port_t *port); void vsw_detach_ports(vsw_t *vswp); int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr); int vsw_port_detach(vsw_t *vswp, int p_instance); int vsw_portsend(vsw_port_t *port, mblk_t *mp); int vsw_port_attach(vsw_port_t *portp); vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); void vsw_vlan_unaware_port_reset(vsw_port_t *portp); void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate); void vsw_reset_ports(vsw_t *vswp); void vsw_port_reset(vsw_port_t *portp); void vsw_physlink_update_ports(vsw_t *vswp); static void vsw_port_physlink_update(vsw_port_t *portp); /* Interrupt routines */ static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg); /* Handshake routines */ static void vsw_ldc_reinit(vsw_ldc_t *); static void vsw_conn_task(void *); static int vsw_check_flag(vsw_ldc_t *, int, uint64_t); static void vsw_next_milestone(vsw_ldc_t *); static int vsw_supported_version(vio_ver_msg_t *); static void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp); static void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp); void vsw_process_conn_evt(vsw_ldc_t *, uint16_t); /* Data processing routines */ void vsw_process_pkt(void *); static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int); static void vsw_process_ctrl_pkt(void *); static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *); static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *); static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *); static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *); static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *); static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *); static void vsw_process_physlink_msg(vsw_ldc_t *, void *); static void 
vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *, uint32_t); static void vsw_process_pkt_data_nop(void *, void *, uint32_t); static void vsw_process_pkt_data(void *, void *, uint32_t); static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *); static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *); static void vsw_process_evt_read(vsw_ldc_t *ldcp); static void vsw_ldc_rcv(vsw_ldc_t *ldcp); /* Switching/data transmit routines */ static int vsw_descrsend(vsw_ldc_t *, mblk_t *); static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp); static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries); static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count); static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count); /* Packet creation routines */ static void vsw_send_ver(void *); static void vsw_send_attr(vsw_ldc_t *); static void vsw_send_dring_info(vsw_ldc_t *); static void vsw_send_rdx(vsw_ldc_t *); static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state); /* Dring routines */ static void vsw_create_privring(vsw_ldc_t *); static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt); static void vsw_unmap_dring(vsw_ldc_t *ldcp); static void vsw_destroy_dring(vsw_ldc_t *ldcp); static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt); static void vsw_set_lane_attr(vsw_t *, lane_t *); dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt); static int vsw_mapin_avail(vsw_ldc_t *ldcp); /* tx/msg/rcv thread routines */ static void vsw_stop_tx_thread(vsw_ldc_t *ldcp); static void vsw_ldc_tx_worker(void *arg); /* Misc support routines */ static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr); static int vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **rhead, mblk_t **rtail, mblk_t **mpp); static mblk_t *vsw_dupmsgchain(mblk_t *mp); /* Debugging routines */ static void dump_flags(uint64_t); static void display_state(void); static void display_lane(lane_t *); static void display_ring(dring_info_t *); /* * Functions imported from other files. 
*/ extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int); extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port); extern void vsw_del_mcst_port(vsw_port_t *port); extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg); extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg); extern void vsw_fdbe_add(vsw_t *vswp, void *port); extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr); extern void vsw_create_vlans(void *arg, int type); extern void vsw_destroy_vlans(void *arg, int type); extern void vsw_vlan_add_ids(void *arg, int type); extern void vsw_vlan_remove_ids(void *arg, int type); extern boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp, uint16_t *vidp); extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp); extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt); extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid); extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp); extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp); extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg); extern void vsw_hio_stop_port(vsw_port_t *portp); extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp); extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); extern void vsw_destroy_rxpools(void *arg); extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp); extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t); extern int vsw_dringsend(vsw_ldc_t *, mblk_t *); extern int vsw_reclaim_dring(dring_info_t *dp, int start); extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, int *); extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *); extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp); extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp); extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt); extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp); extern void vsw_ldc_msg_worker(void *arg); extern void vsw_process_dringdata(void *, void *); extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *); extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp); extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt); extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp); extern void vsw_ldc_rcv_worker(void *arg); extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp); extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *); extern void vsw_process_dringdata_shm(void *, void *); /* * Tunables used in this file. 
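 *
 * These are ordinary global variables in the vsw module, so on a live
 * system they can typically be overridden at boot time from /etc/system
 * (for example, a hypothetical "set vsw:vsw_ldc_tx_retries = 10");
 * their defaults are defined elsewhere in the vsw driver sources.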
*/ extern int vsw_num_handshakes; extern int vsw_ldc_tx_delay; extern int vsw_ldc_tx_retries; extern int vsw_ldc_retries; extern int vsw_ldc_delay; extern boolean_t vsw_ldc_rxthr_enabled; extern boolean_t vsw_ldc_txthr_enabled; extern uint32_t vsw_num_descriptors; extern uint8_t vsw_dring_mode; extern uint32_t vsw_max_tx_qcount; extern boolean_t vsw_obp_ver_proto_workaround; extern uint32_t vsw_publish_macaddr_count; extern uint32_t vsw_nrbufs_factor; #define LDC_ENTER_LOCK(ldcp) \ mutex_enter(&((ldcp)->ldc_cblock));\ mutex_enter(&((ldcp)->ldc_rxlock));\ mutex_enter(&((ldcp)->ldc_txlock)); #define LDC_EXIT_LOCK(ldcp) \ mutex_exit(&((ldcp)->ldc_txlock));\ mutex_exit(&((ldcp)->ldc_rxlock));\ mutex_exit(&((ldcp)->ldc_cblock)); #define VSW_VER_EQ(ldcp, major, minor) \ ((ldcp)->lane_out.ver_major == (major) && \ (ldcp)->lane_out.ver_minor == (minor)) #define VSW_VER_LT(ldcp, major, minor) \ (((ldcp)->lane_out.ver_major < (major)) || \ ((ldcp)->lane_out.ver_major == (major) && \ (ldcp)->lane_out.ver_minor < (minor))) #define VSW_VER_GTEQ(ldcp, major, minor) \ (((ldcp)->lane_out.ver_major > (major)) || \ ((ldcp)->lane_out.ver_major == (major) && \ (ldcp)->lane_out.ver_minor >= (minor))) #define VSW_VER_LTEQ(ldcp, major, minor) \ (((ldcp)->lane_out.ver_major < (major)) || \ ((ldcp)->lane_out.ver_major == (major) && \ (ldcp)->lane_out.ver_minor <= (minor))) /* * VIO Protocol Version Info: * * The version specified below represents the version of protocol currently * supported in the driver. It means the driver can negotiate with peers with * versions <= this version. Here is a summary of the feature(s) that are * supported at each version of the protocol: * * 1.0 Basic VIO protocol. * 1.1 vDisk protocol update (no virtual network update). * 1.2 Support for priority frames (priority-ether-types). * 1.3 VLAN and HybridIO support. * 1.4 Jumbo Frame support. * 1.5 Link State Notification support with optional support * for Physical Link information. * 1.6 Support for RxDringData mode. */ static ver_sup_t vsw_versions[] = { {1, 6} }; /* * For the moment the state dump routines have their own * private flag. */ #define DUMP_STATE 0 #if DUMP_STATE #define DUMP_TAG(tag) \ { \ D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \ D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \ D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \ } #define DUMP_TAG_PTR(tag) \ { \ D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \ D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \ D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \ } #define DUMP_FLAGS(flags) dump_flags(flags); #define DISPLAY_STATE() display_state() #else #define DUMP_TAG(tag) #define DUMP_TAG_PTR(tag) #define DUMP_FLAGS(state) #define DISPLAY_STATE() #endif /* DUMP_STATE */ /* * Attach the specified port. * * Returns 0 on success, 1 on failure. */ int vsw_port_attach(vsw_port_t *port) { vsw_t *vswp = port->p_vswp; vsw_port_list_t *plist = &vswp->plist; vsw_port_t *p, **pp; int nids = port->num_ldcs; uint64_t *ldcids; int rv; D1(vswp, "%s: enter : port %d", __func__, port->p_instance); /* port already exists? 
*/
	READ_ENTER(&plist->lockrw);
	for (p = plist->head; p != NULL; p = p->p_next) {
		if (p->p_instance == port->p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	D2(vswp, "%s: %d nids", __func__, nids);
	ldcids = port->ldc_ids;
	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
		DERR(vswp, "%s: ldc_attach failed", __func__);
		goto exit_error;
	}

	if (vswp->switching_setup_done == B_TRUE) {
		/*
		 * If the underlying network device has been setup,
		 * then open a mac client and program the mac address
		 * for this port.
		 */
		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
		if (rv != 0) {
			goto exit_error;
		}
	}

	/* create the fdb entry for this port/mac address */
	vsw_fdbe_add(vswp, port);

	vsw_create_vlans(port, VSW_VNETPORT);

	WRITE_ENTER(&plist->lockrw);

	/* link it into the list of ports for this vsw instance */
	pp = (vsw_port_t **)(&plist->head);
	port->p_next = *pp;
	*pp = port;
	plist->num_ports++;

	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_ldc_init(port->ldcp);

	/* announce macaddr of vnet to the physical switch */
	if (vsw_publish_macaddr_count != 0) {	/* enabled */
		vsw_publish_macaddr(vswp, port);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);

exit_error:

	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	rw_destroy(&port->maccl_rwlock);
	mutex_destroy(&port->tx_lock);
	mutex_destroy(&port->mca_lock);
	kmem_free(port, sizeof (vsw_port_t));
	return (1);
}

/*
 * Detach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*port = NULL;
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	WRITE_ENTER(&plist->lockrw);

	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	if (vsw_plist_del_node(vswp, port)) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	/* cleanup any HybridIO for this port */
	vsw_hio_stop_port(port);

	/*
	 * No longer need to hold writer lock on port list now
	 * that we have unlinked the target port from the list.
	 */
	RW_EXIT(&plist->lockrw);

	/* Cleanup and close the mac client */
	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);

	/* Remove the fdb entry for this port/mac address */
	vsw_fdbe_del(vswp, &(port->p_macaddr));
	vsw_destroy_vlans(port, VSW_VNETPORT);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(port);

	vsw_port_delete(port);

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);

	return (0);
}

/*
 * Detach all active ports.
 */
void
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port = NULL;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&plist->lockrw);

	while ((port = plist->head) != NULL) {
		(void) vsw_plist_del_node(vswp, port);

		/* cleanup any HybridIO for this port */
		vsw_hio_stop_port(port);

		/* Cleanup and close the mac client */
		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);

		/* Remove the fdb entry for this port/mac address */
		vsw_fdbe_del(vswp, &(port->p_macaddr));
		vsw_destroy_vlans(port, VSW_VNETPORT);

		/* Remove any multicast addresses..
*/ vsw_del_mcst_port(port); /* * No longer need to hold the lock on the port list * now that we have unlinked the target port from the * list. */ RW_EXIT(&plist->lockrw); vsw_port_delete(port); WRITE_ENTER(&plist->lockrw); } RW_EXIT(&plist->lockrw); D1(vswp, "%s: exit", __func__); } /* * Delete the specified port. */ static void vsw_port_delete(vsw_port_t *port) { vsw_t *vswp = port->p_vswp; D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); vsw_ldc_uninit(port->ldcp); /* * Wait for any pending ctrl msg tasks which reference this * port to finish. */ vsw_drain_port_taskq(port); /* * Wait for any active callbacks to finish */ vsw_ldc_drain(port->ldcp); vsw_ldc_detach(port->ldcp); rw_destroy(&port->maccl_rwlock); mutex_destroy(&port->mca_lock); mutex_destroy(&port->tx_lock); cv_destroy(&port->state_cv); mutex_destroy(&port->state_lock); if (port->num_ldcs != 0) { kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t)); port->num_ldcs = 0; } if (port->nvids != 0) { kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids); } kmem_free(port, sizeof (vsw_port_t)); D1(vswp, "%s: exit", __func__); } /* * Attach a logical domain channel (ldc) under a specified port. * * Returns 0 on success, 1 on failure. */ static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) { vsw_t *vswp = port->p_vswp; vsw_ldc_t *ldcp = NULL; ldc_attr_t attr; ldc_status_t istatus; int status = DDI_FAILURE; char kname[MAXNAMELEN]; enum { PROG_init = 0x0, PROG_callback = 0x1, PROG_tx_thread = 0x2} progress; progress = PROG_init; D1(vswp, "%s: enter", __func__); ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); if (ldcp == NULL) { DERR(vswp, "%s: kmem_zalloc failed", __func__); return (1); } ldcp->ldc_id = ldc_id; mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL); mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); ldcp->msg_thr_flags = 0; mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL); ldcp->rcv_thr_flags = 0; mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL); mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); /* required for handshake with peer */ ldcp->local_session = (uint64_t)ddi_get_lbolt(); ldcp->peer_session = 0; ldcp->session_status = 0; ldcp->hss_id = 1; /* Initial handshake session id */ ldcp->hphase = VSW_MILESTONE0; (void) atomic_swap_32(&port->p_hio_capable, B_FALSE); /* only set for outbound lane, inbound set by peer */ vsw_set_lane_attr(vswp, &ldcp->lane_out); attr.devclass = LDC_DEV_NT_SVC; attr.instance = ddi_get_instance(vswp->dip); attr.mode = LDC_MODE_UNRELIABLE; attr.mtu = VSW_LDC_MTU; status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); if (status != 0) { DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", __func__, ldc_id, status); goto ldc_attach_fail; } if (vsw_ldc_txthr_enabled) { ldcp->tx_thr_flags = 0; ldcp->tx_mhead = ldcp->tx_mtail = NULL; mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL); ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ, vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri); progress |= PROG_tx_thread; if (ldcp->tx_thread == NULL) { DWARN(vswp, "%s(%lld): Failed to create worker thread", __func__, ldc_id); goto ldc_attach_fail; } } status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); if (status != 0) { DERR(vswp, "%s(%lld): 
ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}

	/*
	 * allocate a message for ldc_read()s, big enough to hold ctrl and
	 * data msgs, including raw data msgs used to recv priority frames.
	 */
	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	vsw_reset_vnet_proto_ops(ldcp);

	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
	    kname, &ldcp->ldc_stats);
	if (ldcp->ksp == NULL) {
		DERR(vswp, "%s: kstats setup failed", __func__);
		goto ldc_attach_fail;
	}

	/* link it into this port */
	port->ldcp = ldcp;

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:

	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
		kmem_free(ldcp->ldcmsg, ldcp->msglen);
	}

	if (progress & PROG_tx_thread) {
		if (ldcp->tx_thread != NULL) {
			vsw_stop_tx_thread(ldcp);
		}
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
	}
	if (ldcp->ksp != NULL) {
		vgen_destroy_kstats(ldcp->ksp);
	}
	mutex_destroy(&ldcp->msg_thr_lock);
	mutex_destroy(&ldcp->rcv_thr_lock);
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);
	cv_destroy(&ldcp->msg_thr_cv);
	cv_destroy(&ldcp->rcv_thr_cv);
	cv_destroy(&ldcp->drain_cv);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}

/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 */
static void
vsw_ldc_detach(vsw_ldc_t *ldcp)
{
	int		rv;
	vsw_t		*vswp = ldcp->ldc_port->p_vswp;
	int		retries = 0;

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/* Stop msg/rcv thread */
	if (ldcp->rcv_thread != NULL) {
		vsw_stop_rcv_thread(ldcp);
	} else if (ldcp->msg_thread != NULL) {
		vsw_stop_msg_thread(ldcp);
	}
	kmem_free(ldcp->ldcmsg, ldcp->msglen);

	/* Stop the tx thread */
	if (ldcp->tx_thread != NULL) {
		vsw_stop_tx_thread(ldcp);
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
		if (ldcp->tx_mhead != NULL) {
			freemsgchain(ldcp->tx_mhead);
			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
			ldcp->tx_cnt = 0;
		}
	}

	/* Destroy kstats */
	vgen_destroy_kstats(ldcp->ksp);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * Close the channel, retry on EAGAIN.
	 */
	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
		if (++retries > vsw_ldc_retries) {
			break;
		}
		drv_usecwait(vsw_ldc_delay);
	}
	if (rv != 0) {
		cmn_err(CE_NOTE,
		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
		    vswp->instance, rv, ldcp->ldc_id);
	}

	(void) ldc_fini(ldcp->ldc_handle);
	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = 0;
	ldcp->ldc_vswp = NULL;

	mutex_destroy(&ldcp->msg_thr_lock);
	mutex_destroy(&ldcp->rcv_thr_lock);
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->status_lock);
	cv_destroy(&ldcp->msg_thr_cv);
	cv_destroy(&ldcp->rcv_thr_cv);
	cv_destroy(&ldcp->drain_cv);

	kmem_free(ldcp, sizeof (vsw_ldc_t));
}

/*
 * Open and attempt to bring up the channel.
Note that channel * can only be brought up if peer has also opened channel. * * Returns 0 if can open and bring up channel, otherwise * returns 1. */ static int vsw_ldc_init(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_vswp; ldc_status_t istatus = 0; int rv; D1(vswp, "%s: enter", __func__); LDC_ENTER_LOCK(ldcp); /* don't start at 0 in case clients don't like that */ ldcp->next_ident = 1; rv = ldc_open(ldcp->ldc_handle); if (rv != 0) { DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", __func__, ldcp->ldc_id, rv); LDC_EXIT_LOCK(ldcp); return (1); } if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { DERR(vswp, "%s: unable to get status", __func__); LDC_EXIT_LOCK(ldcp); return (1); } else if (istatus != LDC_OPEN && istatus != LDC_READY) { DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", __func__, ldcp->ldc_id, istatus); LDC_EXIT_LOCK(ldcp); return (1); } mutex_enter(&ldcp->status_lock); ldcp->ldc_status = istatus; mutex_exit(&ldcp->status_lock); rv = ldc_up(ldcp->ldc_handle); if (rv != 0) { /* * Not a fatal error for ldc_up() to fail, as peer * end point may simply not be ready yet. */ D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, ldcp->ldc_id, rv); LDC_EXIT_LOCK(ldcp); return (1); } /* * ldc_up() call is non-blocking so need to explicitly * check channel status to see if in fact the channel * is UP. */ mutex_enter(&ldcp->status_lock); if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { DERR(vswp, "%s: unable to get status", __func__); mutex_exit(&ldcp->status_lock); LDC_EXIT_LOCK(ldcp); return (1); } if (ldcp->ldc_status == LDC_UP) { D2(vswp, "%s: channel %ld now UP (%ld)", __func__, ldcp->ldc_id, istatus); mutex_exit(&ldcp->status_lock); LDC_EXIT_LOCK(ldcp); vsw_process_conn_evt(ldcp, VSW_CONN_UP); return (0); } mutex_exit(&ldcp->status_lock); LDC_EXIT_LOCK(ldcp); D1(vswp, "%s: exit", __func__); return (0); } /* disable callbacks on the channel */ static void vsw_ldc_uninit(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_vswp; int rv; D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); LDC_ENTER_LOCK(ldcp); rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); if (rv != 0) { cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling " "interrupts (rv = %d)\n", ldcp->ldc_id, rv); } mutex_enter(&ldcp->status_lock); ldcp->ldc_status = LDC_INIT; mutex_exit(&ldcp->status_lock); LDC_EXIT_LOCK(ldcp); D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); } /* * Wait until the callback(s) associated with the ldcs under the specified * port have completed. * * Prior to this function being invoked each channel under this port * should have been quiesced via ldc_set_cb_mode(DISABLE). * * A short explaination of what we are doing below.. * * The simplest approach would be to have a reference counter in * the ldc structure which is increment/decremented by the callbacks as * they use the channel. The drain function could then simply disable any * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately * there is a tiny window here - before the callback is able to get the lock * on the channel it is interrupted and this function gets to execute. It * sees that the ref count is zero and believes its free to delete the * associated data structures. * * We get around this by taking advantage of the fact that before the ldc * framework invokes a callback it sets a flag to indicate that there is a * callback active (or about to become active). If when we attempt to * unregister a callback when this active flag is set then the unregister * will fail with EWOULDBLOCK. 
* * If the unregister fails we do a cv_timedwait. We will either be signaled * by the callback as it is exiting (note we have to wait a short period to * allow the callback to return fully to the ldc framework and it to clear * the active flag), or by the timer expiring. In either case we again attempt * the unregister. We repeat this until we can succesfully unregister the * callback. * * The reason we use a cv_timedwait rather than a simple cv_wait is to catch * the case where the callback has finished but the ldc framework has not yet * cleared the active flag. In this case we would never get a cv_signal. */ static void vsw_ldc_drain(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_port->p_vswp; D1(vswp, "%s: enter", __func__); /* * If we can unregister the channel callback then we * know that there is no callback either running or * scheduled to run for this channel so move on to next * channel in the list. */ mutex_enter(&ldcp->drain_cv_lock); /* prompt active callbacks to quit */ ldcp->drain_state = VSW_LDC_DRAINING; if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { D2(vswp, "%s: unreg callback for chan %ld", __func__, ldcp->ldc_id); mutex_exit(&ldcp->drain_cv_lock); } else { /* * If we end up here we know that either 1) a callback * is currently executing, 2) is about to start (i.e. * the ldc framework has set the active flag but * has not actually invoked the callback yet, or 3) * has finished and has returned to the ldc framework * but the ldc framework has not yet cleared the * active bit. * * Wait for it to finish. */ while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) { (void) cv_timedwait(&ldcp->drain_cv, &ldcp->drain_cv_lock, ddi_get_lbolt() + hz); } mutex_exit(&ldcp->drain_cv_lock); D2(vswp, "%s: unreg callback for chan %ld after " "timeout", __func__, ldcp->ldc_id); } D1(vswp, "%s: exit", __func__); } /* * Wait until all tasks which reference this port have completed. * * Prior to this function being invoked each channel under this port * should have been quiesced via ldc_set_cb_mode(DISABLE). */ static void vsw_drain_port_taskq(vsw_port_t *port) { vsw_t *vswp = port->p_vswp; D1(vswp, "%s: enter", __func__); /* * Mark the port as in the process of being detached, and * dispatch a marker task to the queue so we know when all * relevant tasks have completed. */ mutex_enter(&port->state_lock); port->state = VSW_PORT_DETACHING; if ((vswp->taskq_p == NULL) || (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, port, DDI_NOSLEEP) != DDI_SUCCESS)) { cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task", vswp->instance); mutex_exit(&port->state_lock); return; } /* * Wait for the marker task to finish. */ while (port->state != VSW_PORT_DETACHABLE) cv_wait(&port->state_cv, &port->state_lock); mutex_exit(&port->state_lock); D1(vswp, "%s: exit", __func__); } static void vsw_marker_task(void *arg) { vsw_port_t *port = arg; vsw_t *vswp = port->p_vswp; D1(vswp, "%s: enter", __func__); mutex_enter(&port->state_lock); /* * No further tasks should be dispatched which reference * this port so ok to mark it as safe to detach. 
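 * (vsw_drain_port_taskq() is blocked in cv_wait() on state_cv waiting
 * for exactly this transition.)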
*/
	port->state = VSW_PORT_DETACHABLE;

	cv_signal(&port->state_cv);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port;

	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			D2(vswp, "vsw_lookup_port: found p_instance\n");
			return (port);
		}
	}

	return (NULL);
}

void
vsw_vlan_unaware_port_reset(vsw_port_t *portp)
{
	vsw_ldc_t	*ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If the peer is vlan unaware (ver < 1.3), reset channel and
	 * terminate the connection. See comments in
	 * vsw_set_vnet_proto_ops().
	 */
	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
	    portp->nvids != 0) {
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
	}

	mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
{
	vsw_ldc_t	*ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
	 * to trigger re-negotiation, which in turn triggers HybridIO
	 * setup/cleanup.
	 */
	if ((ldcp->hphase == VSW_MILESTONE4) &&
	    (portp->p_hio_capable == B_TRUE)) {
		if (immediate == B_TRUE) {
			(void) ldc_down(ldcp->ldc_handle);
		} else {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		}
	}

	mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_port_reset(vsw_port_t *portp)
{
	vsw_ldc_t	*ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * reset channel and terminate the connection.
	 */
	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

	mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_reset_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*portp;

	READ_ENTER(&plist->lockrw);
	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
			vsw_hio_stop_port(portp);
		}
		vsw_port_reset(portp);
	}
	RW_EXIT(&plist->lockrw);
}

static void
vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
{
	vnet_physlink_msg_t	msg;
	vnet_physlink_msg_t	*msgp = &msg;
	uint32_t		physlink_info = 0;

	if (plink_state == LINK_STATE_UP) {
		physlink_info |= VNET_PHYSLINK_STATE_UP;
	} else {
		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
	}

	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
	msgp->tag.vio_sid = ldcp->local_session;
	msgp->physlink_info = physlink_info;

	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
}

static void
vsw_port_physlink_update(vsw_port_t *portp)
{
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp;

	vswp = portp->p_vswp;
	ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If handshake has completed successfully and if the vnet device
	 * has negotiated to get physical link state updates, send a message
	 * with the current state.
	 */
	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
	}

	mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_physlink_update_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*portp;

	READ_ENTER(&plist->lockrw);
	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
		vsw_port_physlink_update(portp);
	}
	RW_EXIT(&plist->lockrw);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
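 * The caller is expected to already hold plist->lockrw as writer; this
 * routine does not acquire the lock itself.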
*/ static int vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) { vsw_port_list_t *plist = &vswp->plist; vsw_port_t *curr_p, *prev_p; if (plist->head == NULL) return (1); curr_p = prev_p = plist->head; while (curr_p != NULL) { if (curr_p == port) { if (prev_p == curr_p) { plist->head = curr_p->p_next; } else { prev_p->p_next = curr_p->p_next; } plist->num_ports--; break; } else { prev_p = curr_p; curr_p = curr_p->p_next; } } return (0); } /* * Interrupt handler for ldc messages. */ static uint_t vsw_ldc_cb(uint64_t event, caddr_t arg) { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; vsw_t *vswp = ldcp->ldc_vswp; D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); mutex_enter(&ldcp->ldc_cblock); ldcp->ldc_stats.callbacks++; mutex_enter(&ldcp->status_lock); if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == 0)) { mutex_exit(&ldcp->status_lock); mutex_exit(&ldcp->ldc_cblock); return (LDC_SUCCESS); } mutex_exit(&ldcp->status_lock); if (event & LDC_EVT_UP) { /* * Channel has come up. */ D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", __func__, ldcp->ldc_id, event, ldcp->ldc_status); vsw_process_conn_evt(ldcp, VSW_CONN_UP); ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); } if (event & LDC_EVT_READ) { /* * Data available for reading. */ D2(vswp, "%s: id(ld) event(%llx) data READ", __func__, ldcp->ldc_id, event); vsw_process_evt_read(ldcp); ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); goto vsw_cb_exit; } if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) { D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)", __func__, ldcp->ldc_id, event, ldcp->ldc_status); vsw_process_conn_evt(ldcp, VSW_CONN_RESET); } /* * Catch either LDC_EVT_WRITE which we don't support or any * unknown event. */ if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) { DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", __func__, ldcp->ldc_id, event, ldcp->ldc_status); } vsw_cb_exit: mutex_exit(&ldcp->ldc_cblock); /* * Let the drain function know we are finishing if it * is waiting. */ mutex_enter(&ldcp->drain_cv_lock); if (ldcp->drain_state == VSW_LDC_DRAINING) cv_signal(&ldcp->drain_cv); mutex_exit(&ldcp->drain_cv_lock); return (LDC_SUCCESS); } /* * Reinitialise data structures associated with the channel. */ static void vsw_ldc_reinit(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_vswp; vsw_port_t *port; D1(vswp, "%s: enter", __func__); port = ldcp->ldc_port; D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, ldcp->lane_in.lstate, ldcp->lane_out.lstate); vsw_free_lane_resources(ldcp, INBOUND); vsw_free_lane_resources(ldcp, OUTBOUND); ldcp->lane_in.lstate = 0; ldcp->lane_out.lstate = 0; /* * Remove parent port from any multicast groups * it may have registered with. Client must resend * multicast add command after handshake completes. */ vsw_del_mcst_port(port); ldcp->peer_session = 0; ldcp->session_status = 0; ldcp->hcnt = 0; ldcp->hphase = VSW_MILESTONE0; vsw_reset_vnet_proto_ops(ldcp); D1(vswp, "%s: exit", __func__); } /* * Process a connection event. */ void vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt) { vsw_t *vswp = ldcp->ldc_vswp; vsw_conn_evt_t *conn = NULL; D1(vswp, "%s: enter", __func__); /* * Check if either a reset or restart event is pending * or in progress. If so just return. * * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT * being received by the callback handler, or a ECONNRESET error * code being returned from a ldc_read() or ldc_write() call. 
* * A VSW_CONN_RESTART event occurs when some error checking code * decides that there is a problem with data from the channel, * and that the handshake should be restarted. */ if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) && (ldstub((uint8_t *)&ldcp->reset_active))) return; /* * If it is an LDC_UP event we first check the recorded * state of the channel. If this is UP then we know that * the channel moving to the UP state has already been dealt * with and don't need to dispatch a new task. * * The reason for this check is that when we do a ldc_up(), * depending on the state of the peer, we may or may not get * a LDC_UP event. As we can't depend on getting a LDC_UP evt * every time we do ldc_up() we explicitly check the channel * status to see has it come up (ldc_up() is asynch and will * complete at some undefined time), and take the appropriate * action. * * The flip side of this is that we may get a LDC_UP event * when we have already seen that the channel is up and have * dealt with that. */ mutex_enter(&ldcp->status_lock); if (evt == VSW_CONN_UP) { if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) { mutex_exit(&ldcp->status_lock); return; } } mutex_exit(&ldcp->status_lock); /* * The transaction group id allows us to identify and discard * any tasks which are still pending on the taskq and refer * to the handshake session we are about to restart or reset. * These stale messages no longer have any real meaning. */ (void) atomic_inc_32(&ldcp->hss_id); ASSERT(vswp->taskq_p != NULL); if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) { cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for" " connection event", vswp->instance); goto err_exit; } conn->evt = evt; conn->ldcp = ldcp; if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn, DDI_NOSLEEP) != DDI_SUCCESS) { cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task", vswp->instance); kmem_free(conn, sizeof (vsw_conn_evt_t)); goto err_exit; } D1(vswp, "%s: exit", __func__); return; err_exit: /* * Have mostly likely failed due to memory shortage. Clear the flag so * that future requests will at least be attempted and will hopefully * succeed. */ if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) ldcp->reset_active = 0; } /* * Deal with events relating to a connection. Invoked from a taskq. */ static void vsw_conn_task(void *arg) { vsw_conn_evt_t *conn = (vsw_conn_evt_t *)arg; vsw_ldc_t *ldcp = NULL; vsw_port_t *portp; vsw_t *vswp = NULL; uint16_t evt; ldc_status_t curr_status; ldcp = conn->ldcp; evt = conn->evt; vswp = ldcp->ldc_vswp; portp = ldcp->ldc_port; D1(vswp, "%s: enter", __func__); /* can safely free now have copied out data */ kmem_free(conn, sizeof (vsw_conn_evt_t)); if (ldcp->rcv_thread != NULL) { vsw_stop_rcv_thread(ldcp); } else if (ldcp->msg_thread != NULL) { vsw_stop_msg_thread(ldcp); } mutex_enter(&ldcp->status_lock); if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { cmn_err(CE_WARN, "!vsw%d: Unable to read status of " "channel %ld", vswp->instance, ldcp->ldc_id); mutex_exit(&ldcp->status_lock); return; } /* * If we wish to restart the handshake on this channel, then if * the channel is UP we bring it DOWN to flush the underlying * ldc queue. */ if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP)) (void) ldc_down(ldcp->ldc_handle); if ((portp->p_hio_capable) && (portp->p_hio_enabled)) { vsw_hio_stop(vswp, ldcp); } /* * re-init all the associated data structures. 
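 * (vsw_ldc_reinit() below frees any mapped lane resources, clears the
 * lane state and drops the handshake back to milestone 0.)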
*/ vsw_ldc_reinit(ldcp); /* * Bring the channel back up (note it does no harm to * do this even if the channel is already UP, Just * becomes effectively a no-op). */ (void) ldc_up(ldcp->ldc_handle); /* * Check if channel is now UP. This will only happen if * peer has also done a ldc_up(). */ if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { cmn_err(CE_WARN, "!vsw%d: Unable to read status of " "channel %ld", vswp->instance, ldcp->ldc_id); mutex_exit(&ldcp->status_lock); return; } ldcp->ldc_status = curr_status; /* channel UP so restart handshake by sending version info */ if (curr_status == LDC_UP) { if (ldcp->hcnt++ > vsw_num_handshakes) { cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted" " handshake attempts (%d) on channel %ld", vswp->instance, ldcp->hcnt, ldcp->ldc_id); mutex_exit(&ldcp->status_lock); return; } if (vsw_obp_ver_proto_workaround == B_FALSE && (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, DDI_NOSLEEP) != DDI_SUCCESS)) { cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task", vswp->instance); /* * Don't count as valid restart attempt if couldn't * send version msg. */ if (ldcp->hcnt > 0) ldcp->hcnt--; } } /* * Mark that the process is complete by clearing the flag. * * Note is it possible that the taskq dispatch above may have failed, * most likely due to memory shortage. We still clear the flag so * future attempts will at least be attempted and will hopefully * succeed. */ if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) ldcp->reset_active = 0; mutex_exit(&ldcp->status_lock); D1(vswp, "%s: exit", __func__); } /* * returns 0 if legal for event signified by flag to have * occured at the time it did. Otherwise returns 1. */ int vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) { vsw_t *vswp = ldcp->ldc_vswp; uint64_t state; uint64_t phase; if (dir == INBOUND) state = ldcp->lane_in.lstate; else state = ldcp->lane_out.lstate; phase = ldcp->hphase; switch (flag) { case VSW_VER_INFO_RECV: if (phase > VSW_MILESTONE0) { DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; case VSW_VER_ACK_RECV: case VSW_VER_NACK_RECV: if (!(state & VSW_VER_INFO_SENT)) { DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or " "VER_NACK when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_VER_INFO_SENT; break; case VSW_ATTR_INFO_RECV: if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; case VSW_ATTR_ACK_RECV: case VSW_ATTR_NACK_RECV: if (!(state & VSW_ATTR_INFO_SENT)) { DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" " or ATTR_NACK when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_ATTR_INFO_SENT; break; case VSW_DRING_INFO_RECV: if (phase < VSW_MILESTONE1) { DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; case VSW_DRING_ACK_RECV: case VSW_DRING_NACK_RECV: if (!(state & VSW_DRING_INFO_SENT)) { DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK " " or DRING_NACK when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_DRING_INFO_SENT; break; case VSW_RDX_INFO_RECV: if (phase < 
VSW_MILESTONE3) { DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; case VSW_RDX_ACK_RECV: case VSW_RDX_NACK_RECV: if (!(state & VSW_RDX_INFO_SENT)) { DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or " "RDX_NACK when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_RDX_INFO_SENT; break; case VSW_MCST_INFO_RECV: if (phase < VSW_MILESTONE3) { DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; default: DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", ldcp->ldc_id, flag); return (1); } if (dir == INBOUND) ldcp->lane_in.lstate = state; else ldcp->lane_out.lstate = state; D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); return (0); } void vsw_next_milestone(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_vswp; vsw_port_t *portp = ldcp->ldc_port; lane_t *lane_out = &ldcp->lane_out; lane_t *lane_in = &ldcp->lane_in; D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__, ldcp->ldc_id, ldcp->hphase); DUMP_FLAGS(lane_in->lstate); DUMP_FLAGS(lane_out->lstate); switch (ldcp->hphase) { case VSW_MILESTONE0: /* * If we haven't started to handshake with our peer, * start to do so now. */ if (lane_out->lstate == 0) { D2(vswp, "%s: (chan %lld) starting handshake " "with peer", __func__, ldcp->ldc_id); vsw_process_conn_evt(ldcp, VSW_CONN_UP); } /* * Only way to pass this milestone is to have successfully * negotiated version info. */ if ((lane_in->lstate & VSW_VER_ACK_SENT) && (lane_out->lstate & VSW_VER_ACK_RECV)) { D2(vswp, "%s: (chan %lld) leaving milestone 0", __func__, ldcp->ldc_id); vsw_set_vnet_proto_ops(ldcp); /* * Next milestone is passed when attribute * information has been successfully exchanged. */ ldcp->hphase = VSW_MILESTONE1; vsw_send_attr(ldcp); } break; case VSW_MILESTONE1: /* * Only way to pass this milestone is to have successfully * negotiated attribute information, in both directions. */ if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) && (lane_out->lstate & VSW_ATTR_ACK_RECV))) { break; } ldcp->hphase = VSW_MILESTONE2; /* * If the peer device has said it wishes to * use descriptor rings then we send it our ring * info, otherwise we just set up a private ring * which we use an internal buffer */ if ((VSW_VER_GTEQ(ldcp, 1, 2) && (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) || (VSW_VER_LT(ldcp, 1, 2) && (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) { vsw_send_dring_info(ldcp); break; } /* * The peer doesn't operate in dring mode; we * can simply fallthru to the RDX phase from * here. */ /*FALLTHRU*/ case VSW_MILESTONE2: /* * If peer has indicated in its attribute message that * it wishes to use descriptor rings then the only way * to pass this milestone is for us to have received * valid dring info. * * If peer is not using descriptor rings then just fall * through. */ if ((VSW_VER_GTEQ(ldcp, 1, 2) && (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) || (VSW_VER_LT(ldcp, 1, 2) && (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) { if (!(lane_in->lstate & VSW_DRING_ACK_SENT)) break; } D2(vswp, "%s: (chan %lld) leaving milestone 2", __func__, ldcp->ldc_id); ldcp->hphase = VSW_MILESTONE3; vsw_send_rdx(ldcp); break; case VSW_MILESTONE3: /* * Pass this milestone when all paramaters have been * successfully exchanged and RDX sent in both directions. * * Mark the relevant lane as available to transmit data. 
In
		 * RxDringData mode, lane_in is associated with transmit and
		 * lane_out is associated with receive. It is the reverse in
		 * TxDring mode.
		 */
		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);

			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, lane_in->lstate,
			    lane_out->lstate);

			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
				lane_in->lstate |= VSW_LANE_ACTIVE;
			} else {
				lane_out->lstate |= VSW_LANE_ACTIVE;
			}

			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;

			DISPLAY_STATE();

			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}

			if (ldcp->pls_negotiated == B_TRUE) {
				/*
				 * The vnet device has negotiated to get phys
				 * link updates. Now that the handshake with
				 * the vnet device is complete, send an initial
				 * update with the current physical link state.
				 */
				vsw_send_physlink_msg(ldcp,
				    vswp->phys_link_state);
			}

		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, lane_in->lstate, lane_out->lstate);
		}
		break;

	case VSW_MILESTONE4:
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);
}

/*
 * Check if major version is supported.
 *
 * Returns 0 if it finds a supported major number, and if necessary
 * adjusts the minor field.
 *
 * Returns 1 if it can't match the major number exactly. Sets major/minor
 * to the next lowest supported values, or to zero if no other values are
 * possible.
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
	int	i;

	D1(NULL, "vsw_supported_version: enter");

	for (i = 0; i < VSW_NUM_VER; i++) {
		if (vsw_versions[i].ver_major == vp->ver_major) {
			/*
			 * Matching or lower major version found. Update
			 * minor number if necessary.
			 */
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value from %d "
				    "to %d", __func__, vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}

			return (0);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get resent with
		 * these values.
		 */
		if (vsw_versions[i].ver_major < vp->ver_major) {
			D2(NULL, "%s: adjusting major and minor "
			    "values to %d, %d\n", __func__,
			    vsw_versions[i].ver_major,
			    vsw_versions[i].ver_minor);
			vp->ver_major = vsw_versions[i].ver_major;
			vp->ver_minor = vsw_versions[i].ver_minor;

			return (1);
		}
	}

	/* No match was possible, zero out fields */
	vp->ver_major = 0;
	vp->ver_minor = 0;

	D1(NULL, "vsw_supported_version: exit");

	return (1);
}

/*
 * Set vnet-protocol-version dependent functions based on version.
 */
static void
vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	lane_t	*lp = &ldcp->lane_out;

	/*
	 * Setup the appropriate dring data processing routine and any
	 * associated thread based on the version.
	 *
	 * In versions < 1.6, we support only TxDring mode. In this mode, the
	 * msg worker thread processes all types of VIO msgs (ctrl and data).
	 *
	 * In versions >= 1.6, we also support RxDringData mode. In this mode,
	 * the rcv worker thread processes dring data messages (msgtype:
	 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA).
The * rest of the data messages (including acks) and ctrl messages are * handled directly by the callback (intr) thread. * * However, for versions >= 1.6, we could still fallback to TxDring * mode. This could happen if RxDringData mode has been disabled (see * below) on this guest or on the peer guest. This info is determined * as part of attr exchange phase of handshake. Hence, we setup these * pointers for v1.6 after attr msg phase completes during handshake. */ if (VSW_VER_GTEQ(ldcp, 1, 6)) { /* * Set data dring mode for vsw_send_attr(). We setup msg worker * thread in TxDring mode or rcv worker thread in RxDringData * mode when attr phase of handshake completes. */ if (vsw_mapin_avail(ldcp) == B_TRUE) { lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING); } else { lp->dring_mode = VIO_TX_DRING; } } else { lp->dring_mode = VIO_TX_DRING; } /* * Setup the MTU for attribute negotiation based on the version. */ if (VSW_VER_GTEQ(ldcp, 1, 4)) { /* * If the version negotiated with peer is >= 1.4(Jumbo Frame * Support), set the mtu in our attributes to max_frame_size. */ lp->mtu = vswp->max_frame_size; } else if (VSW_VER_EQ(ldcp, 1, 3)) { /* * If the version negotiated with peer is == 1.3 (Vlan Tag * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ. */ lp->mtu = ETHERMAX + VLAN_TAGSZ; } else { vsw_port_t *portp = ldcp->ldc_port; /* * Pre-1.3 peers expect max frame size of ETHERMAX. * We can negotiate that size with those peers provided only * pvid is defined for our peer and there are no vids. Then we * can send/recv only untagged frames of max size ETHERMAX. * Note that pvid of the peer can be different, as vsw has to * serve the vnet in that vlan even if itself is not assigned * to that vlan. */ if (portp->nvids == 0) { lp->mtu = ETHERMAX; } } /* * Setup version dependent data processing functions. */ if (VSW_VER_GTEQ(ldcp, 1, 2)) { /* Versions >= 1.2 */ if (VSW_PRI_ETH_DEFINED(vswp)) { /* * enable priority routines and pkt mode only if * at least one pri-eth-type is specified in MD. */ ldcp->tx = vsw_ldctx_pri; ldcp->rx_pktdata = vsw_process_pkt_data; /* set xfer mode for vsw_send_attr() */ lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2; } else { /* no priority eth types defined in MD */ ldcp->tx = vsw_ldctx; ldcp->rx_pktdata = vsw_process_pkt_data_nop; /* set xfer mode for vsw_send_attr() */ lp->xfer_mode = VIO_DRING_MODE_V1_2; } } else { /* Versions prior to 1.2 */ vsw_reset_vnet_proto_ops(ldcp); } } /* * Reset vnet-protocol-version dependent functions to v1.0. */ static void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp) { lane_t *lp = &ldcp->lane_out; ldcp->tx = vsw_ldctx; ldcp->rx_pktdata = vsw_process_pkt_data_nop; /* set xfer mode for vsw_send_attr() */ lp->xfer_mode = VIO_DRING_MODE_V1_0; } static void vsw_process_evt_read(vsw_ldc_t *ldcp) { if (ldcp->msg_thread != NULL) { /* * TxDring mode; wakeup message worker * thread to process the VIO messages. */ mutex_exit(&ldcp->ldc_cblock); mutex_enter(&ldcp->msg_thr_lock); if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) { ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD; cv_signal(&ldcp->msg_thr_cv); } mutex_exit(&ldcp->msg_thr_lock); mutex_enter(&ldcp->ldc_cblock); } else { /* * We invoke vsw_process_pkt() in the context of the LDC * callback (vsw_ldc_cb()) during handshake, until the dring * mode is negotiated. After the dring mode is negotiated, the * msgs are processed by the msg worker thread (above case) if * the dring mode is TxDring. Otherwise (in RxDringData mode) * we continue to process the msgs directly in the callback * context. 
*/ vsw_process_pkt(ldcp); } } /* * Main routine for processing messages received over LDC. */ void vsw_process_pkt(void *arg) { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; vsw_t *vswp = ldcp->ldc_vswp; size_t msglen; vio_msg_tag_t *tagp; uint64_t *ldcmsg; int rv = 0; D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); ASSERT(MUTEX_HELD(&ldcp->ldc_cblock)); ldcmsg = ldcp->ldcmsg; /* * If channel is up read messages until channel is empty. */ do { msglen = ldcp->msglen; rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen); if (rv != 0) { DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n", __func__, ldcp->ldc_id, rv, msglen); } /* channel has been reset */ if (rv == ECONNRESET) { vsw_process_conn_evt(ldcp, VSW_CONN_RESET); break; } if (msglen == 0) { D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, ldcp->ldc_id); break; } D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, ldcp->ldc_id, msglen); /* * Figure out what sort of packet we have gotten by * examining the msg tag, and then switch it appropriately. */ tagp = (vio_msg_tag_t *)ldcmsg; switch (tagp->vio_msgtype) { case VIO_TYPE_CTRL: vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen); break; case VIO_TYPE_DATA: vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen); break; case VIO_TYPE_ERR: vsw_process_err_pkt(ldcp, ldcmsg, tagp); break; default: DERR(vswp, "%s: Unknown tag(%lx) ", __func__, "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id); break; } } while (msglen); D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); } /* * Dispatch a task to process a VIO control message. */ static void vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp, int msglen) { vsw_ctrl_task_t *ctaskp = NULL; vsw_port_t *port = ldcp->ldc_port; vsw_t *vswp = port->p_vswp; D1(vswp, "%s: enter", __func__); /* * We need to handle RDX ACK messages in-band as once they * are exchanged it is possible that we will get an * immediate (legitimate) data packet. */ if ((tagp->vio_subtype_env == VIO_RDX) && (tagp->vio_subtype == VIO_SUBTYPE_ACK)) { if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV)) return; ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV; D2(vswp, "%s (%ld) handling RDX_ACK in place " "(ostate 0x%llx : hphase %d)", __func__, ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase); vsw_next_milestone(ldcp); return; } ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); if (ctaskp == NULL) { DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } ctaskp->ldcp = ldcp; bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen); ctaskp->hss_id = ldcp->hss_id; /* * Dispatch task to processing taskq if port is not in * the process of being detached. */ mutex_enter(&port->state_lock); if (port->state == VSW_PORT_INIT) { if ((vswp->taskq_p == NULL) || (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) { mutex_exit(&port->state_lock); DERR(vswp, "%s: unable to dispatch task to taskq", __func__); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); return; } } else { kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); DWARN(vswp, "%s: port %d detaching, not dispatching " "task", __func__, port->p_instance); } mutex_exit(&port->state_lock); D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, ldcp->ldc_id); D1(vswp, "%s: exit", __func__); } /* * Process a VIO ctrl message. Invoked from taskq. 
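 * The task is dispatched from vsw_dispatch_ctrl_task(); packets belonging
 * to an earlier handshake session (stale hss_id) or carrying an unexpected
 * peer session id are discarded below.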
*/ static void vsw_process_ctrl_pkt(void *arg) { vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; vsw_ldc_t *ldcp = ctaskp->ldcp; vsw_t *vswp = ldcp->ldc_vswp; vio_msg_tag_t tag; uint16_t env; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); env = tag.vio_subtype_env; /* stale pkt check */ if (ctaskp->hss_id < ldcp->hss_id) { DWARN(vswp, "%s: discarding stale packet belonging to earlier" " (%ld) handshake session", __func__, ctaskp->hss_id); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); return; } /* session id check */ if (ldcp->session_status & VSW_PEER_SESSION) { if (ldcp->peer_session != tag.vio_sid) { DERR(vswp, "%s (chan %d): invalid session id (%llx)", __func__, ldcp->ldc_id, tag.vio_sid); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } } /* * Switch on vio_subtype envelope, then let lower routines * decide if its an INFO, ACK or NACK packet. */ switch (env) { case VIO_VER_INFO: vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); break; case VIO_DRING_REG: vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); break; case VIO_DRING_UNREG: vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); break; case VIO_ATTR_INFO: vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); break; case VNET_MCAST_INFO: vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); break; case VIO_RDX: vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); break; case VIO_DDS_INFO: vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp); break; case VNET_PHYSLINK_INFO: vsw_process_physlink_msg(ldcp, &ctaskp->pktp); break; default: DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env); } kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } /* * Version negotiation. We can end up here either because our peer * has responded to a handshake message we have sent it, or our peer * has initiated a handshake with us. If its the former then can only * be ACK or NACK, if its the later can only be INFO. * * If its an ACK we move to the next stage of the handshake, namely * attribute exchange. If its a NACK we see if we can specify another * version, if we can't we stop. * * If it is an INFO we reset all params associated with communication * in that direction over this channel (remember connection is * essentially 2 independent simplex channels). */ void vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) { vio_ver_msg_t *ver_pkt; vsw_t *vswp = ldcp->ldc_vswp; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); /* * We know this is a ctrl/version packet so * cast it into the correct structure. */ ver_pkt = (vio_ver_msg_t *)pkt; switch (ver_pkt->tag.vio_subtype) { case VIO_SUBTYPE_INFO: D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); /* * Record the session id, which we will use from now * until we see another VER_INFO msg. Even then the * session id in most cases will be unchanged, execpt * if channel was reset. */ if ((ldcp->session_status & VSW_PEER_SESSION) && (ldcp->peer_session != ver_pkt->tag.vio_sid)) { DERR(vswp, "%s: updating session id for chan %lld " "from %llx to %llx", __func__, ldcp->ldc_id, ldcp->peer_session, ver_pkt->tag.vio_sid); } ldcp->peer_session = ver_pkt->tag.vio_sid; ldcp->session_status |= VSW_PEER_SESSION; /* Legal message at this time ? */ if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) return; /* * First check the device class. Currently only expect * to be talking to a network device. In the future may * also talk to another switch. 
*/ if (ver_pkt->dev_class != VDEV_NETWORK) { DERR(vswp, "%s: illegal device class %d", __func__, ver_pkt->dev_class); ver_pkt->tag.vio_sid = ldcp->local_session; ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); (void) vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t), B_TRUE); ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; vsw_next_milestone(ldcp); return; } else { ldcp->dev_class = ver_pkt->dev_class; } /* * Now check the version. */ if (vsw_supported_version(ver_pkt) == 0) { /* * Support this major version and possibly * adjusted minor version. */ D2(vswp, "%s: accepted ver %d:%d", __func__, ver_pkt->ver_major, ver_pkt->ver_minor); /* Store accepted values */ ldcp->lane_in.ver_major = ver_pkt->ver_major; ldcp->lane_in.ver_minor = ver_pkt->ver_minor; ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; if (vsw_obp_ver_proto_workaround == B_TRUE) { /* * Send a version info message * using the accepted version that * we are about to ack. Also note that * we send our ver info before we ack. * Otherwise, as soon as receiving the * ack, obp sends attr info msg, which * breaks vsw_check_flag() invoked * from vsw_process_ctrl_attr_pkt(); * as we also need VSW_VER_ACK_RECV to * be set in lane_out.lstate, before * we can receive attr info. */ vsw_send_ver(ldcp); } } else { /* * NACK back with the next lower major/minor * pairing we support (if don't suuport any more * versions then they will be set to zero. */ D2(vswp, "%s: replying with ver %d:%d", __func__, ver_pkt->ver_major, ver_pkt->ver_minor); /* Store updated values */ ldcp->lane_in.ver_major = ver_pkt->ver_major; ldcp->lane_in.ver_minor = ver_pkt->ver_minor; ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; } DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); ver_pkt->tag.vio_sid = ldcp->local_session; (void) vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t), B_TRUE); vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_ACK: D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV)) return; /* Store updated values */ ldcp->lane_out.ver_major = ver_pkt->ver_major; ldcp->lane_out.ver_minor = ver_pkt->ver_minor; ldcp->lane_out.lstate |= VSW_VER_ACK_RECV; vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV)) return; /* * If our peer sent us a NACK with the ver fields set to * zero then there is nothing more we can do. Otherwise see * if we support either the version suggested, or a lesser * one. */ if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { DERR(vswp, "%s: peer unable to negotiate any " "further.", __func__); ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; vsw_next_milestone(ldcp); return; } /* * Check to see if we support this major version or * a lower one. If we don't then maj/min will be set * to zero. 
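 *
 * Rough sketch of the selection done by vsw_supported_version()
 * (simplified, not the actual function body; vsw_versions[] is assumed
 * to be ordered highest first, as suggested by vsw_send_ver() offering
 * vsw_versions[0]):
 *
 *	for (each entry v in vsw_versions[]) {
 *		if (v.ver_major == ver_pkt->ver_major) {
 *			clamp ver_pkt->ver_minor to v.ver_minor if needed;
 *			return (0);			accepted
 *		}
 *		if (v.ver_major < ver_pkt->ver_major) {
 *			rewrite ver_pkt with v;		counter-offer
 *			return (1);
 *		}
 *	}
 *	ver_pkt->ver_major = ver_pkt->ver_minor = 0;	nothing to offer
 *	return (1);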
*/ (void) vsw_supported_version(ver_pkt); if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { /* Nothing more we can do */ DERR(vswp, "%s: version negotiation failed.\n", __func__); ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; vsw_next_milestone(ldcp); } else { /* found a supported major version */ ldcp->lane_out.ver_major = ver_pkt->ver_major; ldcp->lane_out.ver_minor = ver_pkt->ver_minor; D2(vswp, "%s: resending with updated values (%x, %x)", __func__, ver_pkt->ver_major, ver_pkt->ver_minor); ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; ver_pkt->tag.vio_sid = ldcp->local_session; ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); (void) vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t), B_TRUE); vsw_next_milestone(ldcp); } break; default: DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, ver_pkt->tag.vio_subtype); } D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); } static int vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg) { vsw_t *vswp = ldcp->ldc_vswp; vsw_port_t *port = ldcp->ldc_port; struct ether_addr ea; uint64_t macaddr = 0; lane_t *lane_out = &ldcp->lane_out; lane_t *lane_in = &ldcp->lane_in; uint32_t mtu; int i; uint8_t dring_mode; D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) { return (1); } if ((msg->xfer_mode != VIO_DESC_MODE) && (msg->xfer_mode != lane_out->xfer_mode)) { D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode); return (1); } /* Only support MAC addresses at moment. */ if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) { D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n", __func__, msg->addr_type, msg->addr); return (1); } /* * MAC address supplied by device should match that stored * in the vsw-port OBP node. Need to decide what to do if they * don't match, for the moment just warn but don't fail. */ vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet); if (ether_cmp(&ea, &port->p_macaddr) != 0) { DERR(NULL, "%s: device supplied address " "0x%llx doesn't match node address 0x%llx\n", __func__, msg->addr, port->p_macaddr); } /* * Ack freq only makes sense in pkt mode, in shared * mode the ring descriptors say whether or not to * send back an ACK. */ if ((VSW_VER_GTEQ(ldcp, 1, 2) && (msg->xfer_mode & VIO_DRING_MODE_V1_2)) || (VSW_VER_LT(ldcp, 1, 2) && (msg->xfer_mode == VIO_DRING_MODE_V1_0))) { if (msg->ack_freq > 0) { D2(NULL, "%s: non zero ack freq in SHM mode\n", __func__); return (1); } } /* * Process dring mode attribute. */ if (VSW_VER_GTEQ(ldcp, 1, 6)) { /* * Versions >= 1.6: * Though we are operating in v1.6 mode, it is possible that * RxDringData mode has been disabled either on this guest or * on the peer guest. If so, we revert to pre v1.6 behavior of * TxDring mode. But this must be agreed upon in both * directions of attr exchange. We first determine the mode * that can be negotiated. */ if ((msg->options & VIO_RX_DRING_DATA) != 0 && vsw_mapin_avail(ldcp) == B_TRUE) { /* * The peer is capable of handling RxDringData AND we * are also capable of it; we enable RxDringData mode * on this channel. */ dring_mode = VIO_RX_DRING_DATA; } else if ((msg->options & VIO_TX_DRING) != 0) { /* * If the peer is capable of TxDring mode, we * negotiate TxDring mode on this channel. */ dring_mode = VIO_TX_DRING; } else { /* * We support only VIO_TX_DRING and VIO_RX_DRING_DATA * modes. We don't support VIO_RX_DRING mode. 
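 *
 * Worked example of the negotiation above: a peer advertising
 * (VIO_TX_DRING | VIO_RX_DRING_DATA) gets RxDringData only if
 * vsw_mapin_avail() says the LDC direct-map window can hold our
 * receive buffer area, and falls back to TxDring otherwise; a peer
 * offering nothing but VIO_RX_DRING is NACKed here.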
*/ return (1); } /* * If we have received an ack for the attr info that we sent, * then check if the dring mode matches what the peer had ack'd * (saved in lane_out). If they don't match, we fail the * handshake. */ if (lane_out->lstate & VSW_ATTR_ACK_RECV) { if (msg->options != lane_out->dring_mode) { /* send NACK */ return (1); } } else { /* * Save the negotiated dring mode in our attr * parameters, so it gets sent in the attr info from us * to the peer. */ lane_out->dring_mode = dring_mode; } /* save the negotiated dring mode in the msg to be replied */ msg->options = dring_mode; } /* * Process MTU attribute. */ if (VSW_VER_GTEQ(ldcp, 1, 4)) { /* * Versions >= 1.4: * Validate mtu of the peer is at least ETHERMAX. Then, the mtu * is negotiated down to the minimum of our mtu and peer's mtu. */ if (msg->mtu < ETHERMAX) { return (1); } mtu = MIN(msg->mtu, vswp->max_frame_size); /* * If we have received an ack for the attr info * that we sent, then check if the mtu computed * above matches the mtu that the peer had ack'd * (saved in local hparams). If they don't * match, we fail the handshake. */ if (lane_out->lstate & VSW_ATTR_ACK_RECV) { if (mtu != lane_out->mtu) { /* send NACK */ return (1); } } else { /* * Save the mtu computed above in our * attr parameters, so it gets sent in * the attr info from us to the peer. */ lane_out->mtu = mtu; } /* save the MIN mtu in the msg to be replied */ msg->mtu = mtu; } else { /* Versions < 1.4, mtu must match */ if (msg->mtu != lane_out->mtu) { D2(NULL, "%s: invalid MTU (0x%llx)\n", __func__, msg->mtu); return (1); } } /* * Otherwise store attributes for this lane and update * lane state. */ lane_in->mtu = msg->mtu; lane_in->addr = msg->addr; lane_in->addr_type = msg->addr_type; lane_in->xfer_mode = msg->xfer_mode; lane_in->ack_freq = msg->ack_freq; lane_in->physlink_update = msg->physlink_update; lane_in->dring_mode = msg->options; /* * Check if the client has requested physlink state updates. * If there is a physical device bound to this vswitch (L2 * mode), set the ack bits to indicate it is supported. * Otherwise, set the nack bits. */ if (VSW_VER_GTEQ(ldcp, 1, 5)) { /* Protocol ver >= 1.5 */ /* Does the vnet need phys link state updates ? */ if ((lane_in->physlink_update & PHYSLINK_UPDATE_STATE_MASK) == PHYSLINK_UPDATE_STATE) { if (vswp->smode & VSW_LAYER2) { /* is a net-dev assigned to us ? */ msg->physlink_update = PHYSLINK_UPDATE_STATE_ACK; ldcp->pls_negotiated = B_TRUE; } else { /* not in L2 mode */ msg->physlink_update = PHYSLINK_UPDATE_STATE_NACK; ldcp->pls_negotiated = B_FALSE; } } else { msg->physlink_update = PHYSLINK_UPDATE_NONE; ldcp->pls_negotiated = B_FALSE; } } else { /* * physlink_update bits are ignored * if set by clients < v1.5 protocol. */ msg->physlink_update = PHYSLINK_UPDATE_NONE; ldcp->pls_negotiated = B_FALSE; } macaddr = lane_in->addr; for (i = ETHERADDRL - 1; i >= 0; i--) { port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; macaddr >>= 8; } /* * Setup device specific xmit routines. Note this could be changed * further in vsw_send_dring_info() for versions >= 1.6 if operating in * RxDringData mode. 
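 *
 * Summary of how this resolves (see the assignments below and the
 * later override in vsw_send_dring_info()):
 *
 *	negotiated xfer_mode	port->transmit
 *	--------------------	--------------
 *	dring mode		vsw_dringsend, replaced by
 *				vsw_dringsend_shm once RxDringData
 *				is negotiated (v1.6+)
 *	VIO_DESC_MODE		vsw_descrsend, using a private ring
 *				purely as a staging buffer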
*/ mutex_enter(&port->tx_lock); if ((VSW_VER_GTEQ(ldcp, 1, 2) && (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) || (VSW_VER_LT(ldcp, 1, 2) && (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) { D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); port->transmit = vsw_dringsend; } else if (lane_in->xfer_mode == VIO_DESC_MODE) { D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); vsw_create_privring(ldcp); port->transmit = vsw_descrsend; lane_out->xfer_mode = VIO_DESC_MODE; } /* * HybridIO is supported only vnet, not by OBP. * So, set hio_capable to true only when in DRING mode. */ if (VSW_VER_GTEQ(ldcp, 1, 3) && (lane_in->xfer_mode != VIO_DESC_MODE)) { (void) atomic_swap_32(&port->p_hio_capable, B_TRUE); } else { (void) atomic_swap_32(&port->p_hio_capable, B_FALSE); } mutex_exit(&port->tx_lock); return (0); } static int vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg) { vsw_t *vswp = ldcp->ldc_vswp; lane_t *lane_out = &ldcp->lane_out; lane_t *lane_in = &ldcp->lane_in; D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) { return (1); } /* * Process dring mode attribute. */ if (VSW_VER_GTEQ(ldcp, 1, 6)) { /* * Versions >= 1.6: * The ack msg sent by the peer contains the negotiated dring * mode between our capability (that we had sent in our attr * info) and the peer's capability. */ if (lane_in->lstate & VSW_ATTR_ACK_SENT) { /* * If we have sent an ack for the attr info msg from * the peer, check if the dring mode that was * negotiated then (saved in lane_out) matches the * mode that the peer has ack'd. If they don't match, * we fail the handshake. */ if (lane_out->dring_mode != msg->options) { return (1); } } else { if ((msg->options & lane_out->dring_mode) == 0) { /* * Peer ack'd with a mode that we don't * support; we fail the handshake. */ return (1); } if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA)) == (VIO_TX_DRING|VIO_RX_DRING_DATA)) { /* * Peer must ack with only one negotiated mode. * Otherwise fail handshake. */ return (1); } /* * Save the negotiated mode, so we can validate it when * we receive attr info from the peer. */ lane_out->dring_mode = msg->options; } } /* * Process MTU attribute. */ if (VSW_VER_GTEQ(ldcp, 1, 4)) { /* * Versions >= 1.4: * The ack msg sent by the peer contains the minimum of * our mtu (that we had sent in our attr info) and the * peer's mtu. * * If we have sent an ack for the attr info msg from * the peer, check if the mtu that was computed then * (saved in lane_out params) matches the mtu that the * peer has ack'd. If they don't match, we fail the * handshake. */ if (lane_in->lstate & VSW_ATTR_ACK_SENT) { if (lane_out->mtu != msg->mtu) { return (1); } } else { /* * If the mtu ack'd by the peer is > our mtu * fail handshake. Otherwise, save the mtu, so * we can validate it when we receive attr info * from our peer. */ if (msg->mtu <= lane_out->mtu) { lane_out->mtu = msg->mtu; } else { return (1); } } } return (0); } /* * Process an attribute packet. We can end up here either because our peer * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our * peer has sent us an attribute INFO message * * If its an ACK we then move to the next stage of the handshake which * is to send our descriptor ring info to our peer. If its a NACK then * there is nothing more we can (currently) do. * * If we get a valid/acceptable INFO packet (and we have already negotiated * a version) we ACK back and set channel state to ATTR_RECV, otherwise we * NACK back and reset channel state to INACTIV. 
* * FUTURE: in time we will probably negotiate over attributes, but for * the moment unacceptable attributes are regarded as a fatal error. * */ void vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) { vnet_attr_msg_t *attr_pkt; vsw_t *vswp = ldcp->ldc_vswp; lane_t *lane_out = &ldcp->lane_out; lane_t *lane_in = &ldcp->lane_in; int rv; D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); /* * We know this is a ctrl/attr packet so * cast it into the correct structure. */ attr_pkt = (vnet_attr_msg_t *)pkt; switch (attr_pkt->tag.vio_subtype) { case VIO_SUBTYPE_INFO: rv = vsw_process_attr_info(ldcp, attr_pkt); if (rv != 0) { vsw_free_lane_resources(ldcp, INBOUND); attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; } else { attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; lane_in->lstate |= VSW_ATTR_ACK_SENT; } attr_pkt->tag.vio_sid = ldcp->local_session; DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); (void) vsw_send_msg(ldcp, (void *)attr_pkt, sizeof (vnet_attr_msg_t), B_TRUE); vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_ACK: rv = vsw_process_attr_ack(ldcp, attr_pkt); if (rv != 0) { return; } lane_out->lstate |= VSW_ATTR_ACK_RECV; vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) return; lane_out->lstate |= VSW_ATTR_NACK_RECV; vsw_next_milestone(ldcp); break; default: DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, attr_pkt->tag.vio_subtype); } D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); } static int vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp) { int rv; vsw_t *vswp = ldcp->ldc_vswp; lane_t *lp = &ldcp->lane_out; dring_info_t *dp = NULL; D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV); if (rv != 0) { return (1); } if (VSW_VER_GTEQ(ldcp, 1, 6) && (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) { /* * The earlier version of Solaris vnet driver doesn't set the * option (VIO_TX_DRING in its case) correctly in its dring reg * message. We workaround that here by doing the check only * for versions >= v1.6. */ DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), " "negotiated mode (%d)\n", __func__, ldcp->ldc_id, ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode); return (1); } /* * Map dring exported by the peer. */ dp = vsw_map_dring(ldcp, (void *)tagp); if (dp == NULL) { return (1); } /* * Map data buffers exported by the peer if we are in RxDringData mode. */ if (lp->dring_mode == VIO_RX_DRING_DATA) { rv = vsw_map_data(ldcp, dp, (void *)tagp); if (rv != 0) { vsw_unmap_dring(ldcp); return (1); } } return (0); } static int vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp) { vsw_t *vswp = ldcp->ldc_vswp; dring_info_t *dp; D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) { return (1); } dp = ldcp->lane_out.dringp; /* save dring_ident acked by peer */ dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident; return (0); } /* * Process a dring info packet. We can end up here either because our peer * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our * peer has sent us a dring INFO message. * * If we get a valid/acceptable INFO packet (and we have already negotiated * a version) we ACK back and update the lane state, otherwise we NACK back. * * FUTURE: nothing to stop client from sending us info on multiple dring's * but for the moment we will just use the first one we are given. 
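 *
 * Sequence sketch, matching the code below: on INFO we import the
 * peer's ring with vsw_map_dring() (plus vsw_map_data() for the
 * shared buffer area in RxDringData mode) and ACK back the ident we
 * assigned to it; on ACK we record the ident our peer assigned to the
 * ring we exported:
 *
 *	dp = ldcp->lane_out.dringp;
 *	dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;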
* */ void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt) { int rv; int msgsize; dring_info_t *dp; vio_msg_tag_t *tagp = (vio_msg_tag_t *)pkt; vsw_t *vswp = ldcp->ldc_vswp; lane_t *lane_out = &ldcp->lane_out; lane_t *lane_in = &ldcp->lane_in; D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); switch (tagp->vio_subtype) { case VIO_SUBTYPE_INFO: rv = vsw_process_dring_reg_info(ldcp, tagp); if (rv != 0) { vsw_free_lane_resources(ldcp, INBOUND); tagp->vio_subtype = VIO_SUBTYPE_NACK; lane_in->lstate |= VSW_DRING_NACK_SENT; } else { tagp->vio_subtype = VIO_SUBTYPE_ACK; lane_in->lstate |= VSW_DRING_ACK_SENT; } tagp->vio_sid = ldcp->local_session; DUMP_TAG_PTR(tagp); if (lane_out->dring_mode == VIO_RX_DRING_DATA) { dp = lane_in->dringp; msgsize = VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies); } else { msgsize = sizeof (vio_dring_reg_msg_t); } (void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE); vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_ACK: rv = vsw_process_dring_reg_ack(ldcp, tagp); if (rv != 0) { return; } lane_out->lstate |= VSW_DRING_ACK_RECV; vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV)) return; lane_out->lstate |= VSW_DRING_NACK_RECV; vsw_next_milestone(ldcp); break; default: DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, tagp->vio_subtype); } D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); } /* * Process a request from peer to unregister a dring. * * For the moment we just restart the handshake if our * peer endpoint attempts to unregister a dring. */ void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt) { vsw_t *vswp = ldcp->ldc_vswp; vio_dring_unreg_msg_t *dring_pkt; /* * We know this is a ctrl/dring packet so * cast it into the correct structure. */ dring_pkt = (vio_dring_unreg_msg_t *)pkt; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); switch (dring_pkt->tag.vio_subtype) { case VIO_SUBTYPE_INFO: D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); DWARN(vswp, "%s: restarting handshake..", __func__); break; case VIO_SUBTYPE_ACK: D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); DWARN(vswp, "%s: restarting handshake..", __func__); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); DWARN(vswp, "%s: restarting handshake..", __func__); break; default: DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, dring_pkt->tag.vio_subtype); } vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } #define SND_MCST_NACK(ldcp, pkt) \ pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ pkt->tag.vio_sid = ldcp->local_session; \ (void) vsw_send_msg(ldcp, (void *)pkt, \ sizeof (vnet_mcast_msg_t), B_TRUE); /* * Process a multicast request from a vnet. * * Vnet's specify a multicast address that they are interested in. This * address is used as a key into the hash table which forms the multicast * forwarding database (mFDB). * * The table keys are the multicast addresses, while the table entries * are pointers to lists of ports which wish to receive packets for the * specified multicast address. * * When a multicast packet is being switched we use the address as a key * into the hash table, and then walk the appropriate port list forwarding * the pkt to each port in turn. * * If a vnet is no longer interested in a particular multicast grouping * we simply find the correct location in the hash table and then delete * the relevant port from the port list. 
* * To deal with the case whereby a port is being deleted without first * removing itself from the lists in the hash table, we maintain a list * of multicast addresses the port has registered an interest in, within * the port structure itself. We then simply walk that list of addresses * using them as keys into the hash table and remove the port from the * appropriate lists. */ static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) { vnet_mcast_msg_t *mcst_pkt; vsw_port_t *port = ldcp->ldc_port; vsw_t *vswp = ldcp->ldc_vswp; int i; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); /* * We know this is a ctrl/mcast packet so * cast it into the correct structure. */ mcst_pkt = (vnet_mcast_msg_t *)pkt; switch (mcst_pkt->tag.vio_subtype) { case VIO_SUBTYPE_INFO: D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); /* * Check if in correct state to receive a multicast * message (i.e. handshake complete). If not reset * the handshake. */ if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) return; /* * Before attempting to add or remove address check * that they are valid multicast addresses. * If not, then NACK back. */ for (i = 0; i < mcst_pkt->count; i++) { if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { DERR(vswp, "%s: invalid multicast address", __func__); SND_MCST_NACK(ldcp, mcst_pkt); return; } } /* * Now add/remove the addresses. If this fails we * NACK back. */ if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { SND_MCST_NACK(ldcp, mcst_pkt); return; } mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; mcst_pkt->tag.vio_sid = ldcp->local_session; DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); (void) vsw_send_msg(ldcp, (void *)mcst_pkt, sizeof (vnet_mcast_msg_t), B_TRUE); break; case VIO_SUBTYPE_ACK: DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); /* * We shouldn't ever get a multicast ACK message as * at the moment we never request multicast addresses * to be set on some other device. This may change in * the future if we have cascading switches. */ if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) return; /* Do nothing */ break; case VIO_SUBTYPE_NACK: DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); /* * We shouldn't get a multicast NACK packet for the * same reasons as we shouldn't get a ACK packet. */ if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) return; /* Do nothing */ break; default: DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, mcst_pkt->tag.vio_subtype); } D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) { vio_rdx_msg_t *rdx_pkt; vsw_t *vswp = ldcp->ldc_vswp; /* * We know this is a ctrl/rdx packet so * cast it into the correct structure. */ rdx_pkt = (vio_rdx_msg_t *)pkt; D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); switch (rdx_pkt->tag.vio_subtype) { case VIO_SUBTYPE_INFO: D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV)) return; rdx_pkt->tag.vio_sid = ldcp->local_session; rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT; (void) vsw_send_msg(ldcp, (void *)rdx_pkt, sizeof (vio_rdx_msg_t), B_TRUE); vsw_next_milestone(ldcp); break; case VIO_SUBTYPE_ACK: /* * Should be handled in-band by callback handler. 
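 *
 * (RDX ACKs are consumed in-band by vsw_dispatch_ctrl_task() before
 * anything is queued to the taskq, so reaching this case means the
 * peer is misbehaving; we restart the connection.)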
*/ DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV)) return; ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV; vsw_next_milestone(ldcp); break; default: DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, rdx_pkt->tag.vio_subtype); } D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } static void vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt) { vnet_physlink_msg_t *msgp; vsw_t *vswp = ldcp->ldc_vswp; msgp = (vnet_physlink_msg_t *)pkt; D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); switch (msgp->tag.vio_subtype) { case VIO_SUBTYPE_INFO: /* vsw shouldn't recv physlink info */ DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__); break; case VIO_SUBTYPE_ACK: D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); break; default: DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, msgp->tag.vio_subtype); } D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } static void vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp, uint32_t msglen) { uint16_t env = tagp->vio_subtype_env; vsw_t *vswp = ldcp->ldc_vswp; lane_t *lp = &ldcp->lane_out; uint8_t dring_mode = lp->dring_mode; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); /* session id check */ if (ldcp->session_status & VSW_PEER_SESSION) { if (ldcp->peer_session != tagp->vio_sid) { DERR(vswp, "%s (chan %d): invalid session id (%llx)", __func__, ldcp->ldc_id, tagp->vio_sid); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } } /* * It is an error for us to be getting data packets * before the handshake has completed. */ if (ldcp->hphase != VSW_MILESTONE4) { DERR(vswp, "%s: got data packet before handshake complete " "hphase %d (%x: %x)", __func__, ldcp->hphase, ldcp->lane_in.lstate, ldcp->lane_out.lstate); DUMP_FLAGS(ldcp->lane_in.lstate); DUMP_FLAGS(ldcp->lane_out.lstate); vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } if (dring_mode == VIO_TX_DRING) { /* * To reduce the locking contention, release the ldc_cblock * here and re-acquire it once we are done receiving packets. * We do this only in TxDring mode to allow further callbaks to * continue while the msg worker thread processes the messages. * In RxDringData mode, we process the messages in the callback * itself and wake up rcv worker thread to process only data * info messages. */ mutex_exit(&ldcp->ldc_cblock); mutex_enter(&ldcp->ldc_rxlock); } /* * Switch on vio_subtype envelope, then let lower routines * decide if its an INFO, ACK or NACK packet. */ if (env == VIO_DRING_DATA) { ldcp->rx_dringdata(ldcp, dpkt); } else if (env == VIO_PKT_DATA) { ldcp->rx_pktdata(ldcp, dpkt, msglen); } else if (env == VIO_DESC_DATA) { vsw_process_data_ibnd_pkt(ldcp, dpkt); } else { DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env); } if (dring_mode == VIO_TX_DRING) { mutex_exit(&ldcp->ldc_rxlock); mutex_enter(&ldcp->ldc_cblock); } D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } /* * dummy pkt data handler function for vnet protocol version 1.0 */ static void vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen) { _NOTE(ARGUNUSED(arg1, arg2, msglen)) } /* * This function handles raw pkt data messages received over the channel. * Currently, only priority-eth-type frames are received through this mechanism. 
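 *
 * (Which ethertypes count as "priority" is configured per vsw
 * instance and kept in vswp->pri_types[]/pri_num_types; the
 * transmit-side classification is in vsw_get_pri_packets() below.)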
* In this case, the frame(data) is present within the message itself which * is copied into an mblk before switching it. */ static void vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen) { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg1; vio_raw_data_msg_t *dpkt = (vio_raw_data_msg_t *)arg2; uint32_t size; mblk_t *mp; vio_mblk_t *vmp; vsw_t *vswp = ldcp->ldc_vswp; vgen_stats_t *statsp = &ldcp->ldc_stats; lane_t *lp = &ldcp->lane_out; size = msglen - VIO_PKT_DATA_HDRSIZE; if (size < ETHERMIN || size > lp->mtu) { (void) atomic_inc_32(&statsp->rx_pri_fail); DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__, ldcp->ldc_id, size); return; } vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ); if (vmp == NULL) { mp = allocb(size + VLAN_TAGSZ, BPRI_MED); if (mp == NULL) { (void) atomic_inc_32(&statsp->rx_pri_fail); DWARN(vswp, "%s(%lld) allocb failure, " "unable to process priority frame\n", __func__, ldcp->ldc_id); return; } } else { mp = vmp->mp; } /* skip over the extra space for vlan tag */ mp->b_rptr += VLAN_TAGSZ; /* copy the frame from the payload of raw data msg into the mblk */ bcopy(dpkt->data, mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; if (vmp != NULL) { vmp->state = VIO_MBLK_HAS_DATA; } /* update stats */ (void) atomic_inc_64(&statsp->rx_pri_packets); (void) atomic_add_64(&statsp->rx_pri_bytes, size); /* * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed. */ (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp); /* switch the frame to destination */ vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL); } /* * Process an in-band descriptor message (most likely from * OBP). */ static void vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) { vnet_ibnd_desc_t *ibnd_desc; dring_info_t *dp = NULL; vsw_private_desc_t *priv_addr = NULL; vsw_t *vswp = ldcp->ldc_vswp; mblk_t *mp = NULL; size_t nbytes = 0; size_t off = 0; uint64_t idx = 0; uint32_t num = 1, len, datalen = 0; uint64_t ncookies = 0; int i, rv; int j = 0; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); ibnd_desc = (vnet_ibnd_desc_t *)pkt; switch (ibnd_desc->hdr.tag.vio_subtype) { case VIO_SUBTYPE_INFO: D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) return; /* * Data is padded to align on a 8 byte boundary, * nbytes is actual data length, i.e. minus that * padding. */ datalen = ibnd_desc->nbytes; D2(vswp, "%s(%lld): processing inband desc : " ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); ncookies = ibnd_desc->ncookies; /* * allocb(9F) returns an aligned data block. We * need to ensure that we ask ldc for an aligned * number of bytes also. 
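 *
 * Worked example of the round-up below: for datalen = 61,
 * (nbytes & 0x7) == 5, so off = 3 and ldc_mem_copy() is asked for 64
 * bytes; since b_wptr is later set from datalen, the padding bytes
 * never become part of the switched frame.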
*/ nbytes = datalen; if (nbytes & 0x7) { off = 8 - (nbytes & 0x7); nbytes += off; } /* alloc extra space for VLAN_TAG */ mp = allocb(datalen + 8, BPRI_MED); if (mp == NULL) { DERR(vswp, "%s(%lld): allocb failed", __func__, ldcp->ldc_id); ldcp->ldc_stats.rx_allocb_fail++; return; } /* skip over the extra space for VLAN_TAG */ mp->b_rptr += 8; rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, LDC_COPY_IN); if (rv != 0) { DERR(vswp, "%s(%d): unable to copy in data from " "%d cookie(s)", __func__, ldcp->ldc_id, ncookies); freemsg(mp); ldcp->ldc_stats.ierrors++; return; } D2(vswp, "%s(%d): copied in %ld bytes using %d cookies", __func__, ldcp->ldc_id, nbytes, ncookies); /* point to the actual end of data */ mp->b_wptr = mp->b_rptr + datalen; ldcp->ldc_stats.ipackets++; ldcp->ldc_stats.rbytes += datalen; /* * We ACK back every in-band descriptor message we process */ ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; (void) vsw_send_msg(ldcp, (void *)ibnd_desc, sizeof (vnet_ibnd_desc_t), B_TRUE); /* * there is extra space alloc'd for VLAN_TAG */ (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp); /* send the packet to be switched */ vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL); break; case VIO_SUBTYPE_ACK: D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); /* Verify the ACK is valid */ idx = ibnd_desc->hdr.desc_handle; if (idx >= vsw_num_descriptors) { cmn_err(CE_WARN, "!vsw%d: corrupted ACK received " "(idx %ld)", vswp->instance, idx); return; } if ((dp = ldcp->lane_out.dringp) == NULL) { DERR(vswp, "%s: no dring found", __func__); return; } len = dp->num_descriptors; /* * If the descriptor we are being ACK'ed for is not the * one we expected, then pkts were lost somwhere, either * when we tried to send a msg, or a previous ACK msg from * our peer. In either case we now reclaim the descriptors * in the range from the last ACK we received up to the * current ACK. */ if (idx != dp->last_ack_recv) { DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)", __func__, dp->last_ack_recv, idx); num = idx >= dp->last_ack_recv ? idx - dp->last_ack_recv + 1: (len - dp->last_ack_recv + 1) + idx; } /* * When we sent the in-band message to our peer we * marked the copy in our private ring as READY. We now * check that the descriptor we are being ACK'ed for is in * fact READY, i.e. it is one we have shared with our peer. * * If its not we flag an error, but still reset the descr * back to FREE. */ for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) { priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; mutex_enter(&priv_addr->dstate_lock); if (priv_addr->dstate != VIO_DESC_READY) { DERR(vswp, "%s: (%ld) desc at index %ld not " "READY (0x%lx)", __func__, ldcp->ldc_id, idx, priv_addr->dstate); DERR(vswp, "%s: bound %d: ncookies %ld : " "datalen %ld", __func__, priv_addr->bound, priv_addr->ncookies, priv_addr->datalen); } D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, ldcp->ldc_id, idx); /* release resources associated with sent msg */ priv_addr->datalen = 0; priv_addr->dstate = VIO_DESC_FREE; mutex_exit(&priv_addr->dstate_lock); } /* update to next expected value */ dp->last_ack_recv = (idx + 1) % dp->num_descriptors; break; case VIO_SUBTYPE_NACK: DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); /* * We should only get a NACK if our peer doesn't like * something about a message we have sent it. 
If this * happens we just release the resources associated with * the message. (We are relying on higher layers to decide * whether or not to resend. */ /* limit check */ idx = ibnd_desc->hdr.desc_handle; if (idx >= vsw_num_descriptors) { DERR(vswp, "%s: corrupted NACK received (idx %lld)", __func__, idx); return; } if ((dp = ldcp->lane_out.dringp) == NULL) { DERR(vswp, "%s: no dring found", __func__); return; } priv_addr = (vsw_private_desc_t *)dp->priv_addr; /* move to correct location in ring */ priv_addr += idx; /* release resources associated with sent msg */ mutex_enter(&priv_addr->dstate_lock); priv_addr->datalen = 0; priv_addr->dstate = VIO_DESC_FREE; mutex_exit(&priv_addr->dstate_lock); break; default: DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); } D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); } static void vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp) { _NOTE(ARGUNUSED(epkt)) vsw_t *vswp = ldcp->ldc_vswp; uint16_t env = tagp->vio_subtype_env; D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); /* * Error vio_subtypes have yet to be defined. So for * the moment we can't do anything. */ D2(vswp, "%s: (%x) vio_subtype env", __func__, env); D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); } /* transmit the packet over the given port */ int vsw_portsend(vsw_port_t *port, mblk_t *mp) { mblk_t *mpt; int count; vsw_ldc_t *ldcp = port->ldcp; int status = 0; count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt); if (count != 0) { status = ldcp->tx(ldcp, mp, mpt, count); } return (status); } /* * Break up frames into 2 seperate chains: normal and * priority, based on the frame type. The number of * priority frames is also counted and returned. * * Params: * vswp: pointer to the instance of vsw * np: head of packet chain to be broken * npt: tail of packet chain to be broken * * Returns: * np: head of normal data packets * npt: tail of normal data packets * hp: head of high priority packets * hpt: tail of high priority packets */ static uint32_t vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt, mblk_t **hp, mblk_t **hpt) { mblk_t *tmp = NULL; mblk_t *smp = NULL; mblk_t *hmp = NULL; /* high prio pkts head */ mblk_t *hmpt = NULL; /* high prio pkts tail */ mblk_t *nmp = NULL; /* normal pkts head */ mblk_t *nmpt = NULL; /* normal pkts tail */ uint32_t count = 0; int i; struct ether_header *ehp; uint32_t num_types; uint16_t *types; tmp = *np; while (tmp != NULL) { smp = tmp; tmp = tmp->b_next; smp->b_next = NULL; smp->b_prev = NULL; ehp = (struct ether_header *)smp->b_rptr; num_types = vswp->pri_num_types; types = vswp->pri_types; for (i = 0; i < num_types; i++) { if (ehp->ether_type == types[i]) { /* high priority frame */ if (hmp != NULL) { hmpt->b_next = smp; hmpt = smp; } else { hmp = hmpt = smp; } count++; break; } } if (i == num_types) { /* normal data frame */ if (nmp != NULL) { nmpt->b_next = smp; nmpt = smp; } else { nmp = nmpt = smp; } } } *hp = hmp; *hpt = hmpt; *np = nmp; *npt = nmpt; return (count); } /* * Wrapper function to transmit normal and/or priority frames over the channel. 
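 *
 * Flow sketch: vsw_get_pri_packets() peels off frames whose ethertype
 * matches an entry in vswp->pri_types[]; those are sent immediately
 * via vsw_ldcsend_pkt() as raw VIO_PKT_DATA messages, and whatever
 * remains goes to vsw_ldctx() for the normal dring/descriptor path.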
*/ static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count) { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; mblk_t *tmp; mblk_t *smp; mblk_t *hmp; /* high prio pkts head */ mblk_t *hmpt; /* high prio pkts tail */ mblk_t *nmp; /* normal pkts head */ mblk_t *nmpt; /* normal pkts tail */ uint32_t n = 0; vsw_t *vswp = ldcp->ldc_vswp; ASSERT(VSW_PRI_ETH_DEFINED(vswp)); ASSERT(count != 0); nmp = mp; nmpt = mpt; /* gather any priority frames from the chain of packets */ n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt); /* transmit priority frames */ tmp = hmp; while (tmp != NULL) { smp = tmp; tmp = tmp->b_next; smp->b_next = NULL; vsw_ldcsend_pkt(ldcp, smp); } count -= n; if (count == 0) { /* no normal data frames to process */ return (0); } return (vsw_ldctx(ldcp, nmp, nmpt, count)); } /* * Wrapper function to transmit normal frames over the channel. */ static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count) { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; mblk_t *tmp = NULL; ASSERT(count != 0); /* * If the TX thread is enabled, then queue the * ordinary frames and signal the tx thread. */ if (ldcp->tx_thread != NULL) { mutex_enter(&ldcp->tx_thr_lock); if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) { /* * If we reached queue limit, * do not queue new packets, * drop them. */ ldcp->ldc_stats.tx_qfull += count; mutex_exit(&ldcp->tx_thr_lock); freemsgchain(mp); goto exit; } if (ldcp->tx_mhead == NULL) { ldcp->tx_mhead = mp; ldcp->tx_mtail = mpt; cv_signal(&ldcp->tx_thr_cv); } else { ldcp->tx_mtail->b_next = mp; ldcp->tx_mtail = mpt; } ldcp->tx_cnt += count; mutex_exit(&ldcp->tx_thr_lock); } else { while (mp != NULL) { tmp = mp->b_next; mp->b_next = mp->b_prev = NULL; (void) vsw_ldcsend(ldcp, mp, 1); mp = tmp; } } exit: return (0); } /* * This function transmits the frame in the payload of a raw data * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to * send special frames with high priorities, without going through * the normal data path which uses descriptor ring mechanism. */ static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp) { vio_raw_data_msg_t *pkt; mblk_t *bp; mblk_t *nmp = NULL; vio_mblk_t *vmp; caddr_t dst; uint32_t mblksz; uint32_t size; uint32_t nbytes; int rv; vsw_t *vswp = ldcp->ldc_vswp; vgen_stats_t *statsp = &ldcp->ldc_stats; if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == 0)) { (void) atomic_inc_32(&statsp->tx_pri_fail); DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate); goto send_pkt_exit; } size = msgsize(mp); /* frame size bigger than available payload len of raw data msg ? 
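 *
 * Layout of a raw data message, which bounds the check below
 * (VIO_PKT_DATA_HDRSIZE covers the VIO header of vio_raw_data_msg_t):
 *
 *	| VIO header  | inline ethernet frame ................. |
 *	|<- hdrsize ->|<-- at most msglen - hdrsize bytes ----->|
 *
 * Oversize frames are never fragmented across messages; they are
 * dropped here and counted in tx_pri_fail.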
*/ if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) { (void) atomic_inc_32(&statsp->tx_pri_fail); DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__, ldcp->ldc_id, size); goto send_pkt_exit; } if (size < ETHERMIN) size = ETHERMIN; /* alloc space for a raw data message */ vmp = vio_allocb(vswp->pri_tx_vmp); if (vmp == NULL) { (void) atomic_inc_32(&statsp->tx_pri_fail); DWARN(vswp, "vio_allocb failed\n"); goto send_pkt_exit; } else { nmp = vmp->mp; } pkt = (vio_raw_data_msg_t *)nmp->b_rptr; /* copy frame into the payload of raw data message */ dst = (caddr_t)pkt->data; for (bp = mp; bp != NULL; bp = bp->b_cont) { mblksz = MBLKL(bp); bcopy(bp->b_rptr, dst, mblksz); dst += mblksz; } vmp->state = VIO_MBLK_HAS_DATA; /* setup the raw data msg */ pkt->tag.vio_msgtype = VIO_TYPE_DATA; pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; pkt->tag.vio_subtype_env = VIO_PKT_DATA; pkt->tag.vio_sid = ldcp->local_session; nbytes = VIO_PKT_DATA_HDRSIZE + size; /* send the msg over ldc */ rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE); if (rv != 0) { (void) atomic_inc_32(&statsp->tx_pri_fail); DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__, ldcp->ldc_id); goto send_pkt_exit; } /* update stats */ (void) atomic_inc_64(&statsp->tx_pri_packets); (void) atomic_add_64(&statsp->tx_pri_packets, size); send_pkt_exit: if (nmp != NULL) freemsg(nmp); freemsg(mp); } /* * Transmit the packet over the given LDC channel. * * The 'retries' argument indicates how many times a packet * is retried before it is dropped. Note, the retry is done * only for a resource related failure, for all other failures * the packet is dropped immediately. */ static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries) { int i; int rc; int status = 0; vsw_port_t *port = ldcp->ldc_port; dring_info_t *dp = NULL; lane_t *lp = &ldcp->lane_out; for (i = 0; i < retries; ) { /* * Send the message out using the appropriate * transmit function which will free mblock when it * is finished with it. */ mutex_enter(&port->tx_lock); if (port->transmit != NULL) { status = (*port->transmit)(ldcp, mp); } if (status == LDC_TX_SUCCESS) { mutex_exit(&port->tx_lock); break; } i++; /* increment the counter here */ /* If its the last retry, then update the oerror */ if ((i == retries) && (status == LDC_TX_NORESOURCES)) { ldcp->ldc_stats.oerrors++; } mutex_exit(&port->tx_lock); if (status != LDC_TX_NORESOURCES) { /* * No retrying required for errors un-related * to resources. */ break; } if (((dp = ldcp->lane_out.dringp) != NULL) && ((VSW_VER_GTEQ(ldcp, 1, 2) && (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) || ((VSW_VER_LT(ldcp, 1, 2) && (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) { /* Need to reclaim in TxDring mode. */ if (lp->dring_mode == VIO_TX_DRING) { rc = vsw_reclaim_dring(dp, dp->end_idx); } } else { /* * If there is no dring or the xfer_mode is * set to DESC_MODE(ie., OBP), then simply break here. */ break; } /* * Delay only if none were reclaimed * and its not the last retry. */ if ((rc == 0) && (i < retries)) { delay(drv_usectohz(vsw_ldc_tx_delay)); } } freemsg(mp); return (status); } /* * Send an in-band descriptor message over ldc. 
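 *
 * In-band (VIO_DESC_MODE) transmit sketch, as implemented below: the
 * frame is copied into a free slot of the private ring, the slot's
 * LDC memory cookies are embedded in a vnet_ibnd_desc_t INFO message,
 * and the peer pulls the data across with ldc_mem_copy() before
 * ACKing; the ACK (handled in vsw_process_data_ibnd_pkt()) lets us
 * mark the slot VIO_DESC_FREE again.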
*/ static int vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) { vsw_t *vswp = ldcp->ldc_vswp; vnet_ibnd_desc_t ibnd_msg; vsw_private_desc_t *priv_desc = NULL; dring_info_t *dp = NULL; size_t n, size = 0; caddr_t bufp; mblk_t *bp; int idx, i; int status = LDC_TX_SUCCESS; static int warn_msg = 1; lane_t *lp = &ldcp->lane_out; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); ASSERT(mp != NULL); if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == 0)) { DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", __func__, ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate); ldcp->ldc_stats.oerrors++; return (LDC_TX_FAILURE); } /* * The dring here is as an internal buffer, * rather than a transfer channel. */ if ((dp = ldcp->lane_out.dringp) == NULL) { DERR(vswp, "%s(%lld): no dring for outbound lane", __func__, ldcp->ldc_id); DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__, ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate); ldcp->ldc_stats.oerrors++; return (LDC_TX_FAILURE); } size = msgsize(mp); if (size > (size_t)lp->mtu) { DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, ldcp->ldc_id, size); ldcp->ldc_stats.oerrors++; return (LDC_TX_FAILURE); } /* * Find a free descriptor in our buffer ring */ if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { if (warn_msg) { DERR(vswp, "%s(%lld): no descriptor available for ring " "at 0x%llx", __func__, ldcp->ldc_id, dp); warn_msg = 0; } /* nothing more we can do */ status = LDC_TX_NORESOURCES; goto vsw_descrsend_free_exit; } else { D2(vswp, "%s(%lld): free private descriptor found at pos " "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc); warn_msg = 1; } /* copy data into the descriptor */ bufp = priv_desc->datap; for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { n = MBLKL(bp); bcopy(bp->b_rptr, bufp, n); bufp += n; } priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; /* create and send the in-band descp msg */ ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; /* * Copy the mem cookies describing the data from the * private region of the descriptor ring into the inband * descriptor. 
*/ for (i = 0; i < priv_desc->ncookies; i++) { bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], sizeof (ldc_mem_cookie_t)); } ibnd_msg.hdr.desc_handle = idx; ibnd_msg.ncookies = priv_desc->ncookies; ibnd_msg.nbytes = size; ldcp->ldc_stats.opackets++; ldcp->ldc_stats.obytes += size; (void) vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vnet_ibnd_desc_t), B_TRUE); vsw_descrsend_free_exit: D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); return (status); } static void vsw_send_ver(void *arg) { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; vsw_t *vswp = ldcp->ldc_vswp; lane_t *lp = &ldcp->lane_out; vio_ver_msg_t ver_msg; D1(vswp, "%s enter", __func__); ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; ver_msg.tag.vio_subtype_env = VIO_VER_INFO; ver_msg.tag.vio_sid = ldcp->local_session; if (vsw_obp_ver_proto_workaround == B_FALSE) { ver_msg.ver_major = vsw_versions[0].ver_major; ver_msg.ver_minor = vsw_versions[0].ver_minor; } else { /* use the major,minor that we've ack'd */ lane_t *lpi = &ldcp->lane_in; ver_msg.ver_major = lpi->ver_major; ver_msg.ver_minor = lpi->ver_minor; } ver_msg.dev_class = VDEV_NETWORK_SWITCH; lp->lstate |= VSW_VER_INFO_SENT; lp->ver_major = ver_msg.ver_major; lp->ver_minor = ver_msg.ver_minor; DUMP_TAG(ver_msg.tag); (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE); D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id); } static void vsw_send_attr(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_vswp; lane_t *lp = &ldcp->lane_out; vnet_attr_msg_t attr_msg; D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); /* * Subtype is set to INFO by default */ attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; attr_msg.tag.vio_sid = ldcp->local_session; /* payload copied from default settings for lane */ attr_msg.mtu = lp->mtu; attr_msg.addr_type = lp->addr_type; attr_msg.xfer_mode = lp->xfer_mode; attr_msg.ack_freq = lp->xfer_mode; attr_msg.options = lp->dring_mode; READ_ENTER(&vswp->if_lockrw); attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet); RW_EXIT(&vswp->if_lockrw); ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; DUMP_TAG(attr_msg.tag); (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE); D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); } static void vsw_send_dring_info(vsw_ldc_t *ldcp) { int msgsize; void *msg; vsw_t *vswp = ldcp->ldc_vswp; vsw_port_t *port = ldcp->ldc_port; lane_t *lp = &ldcp->lane_out; vgen_stats_t *statsp = &ldcp->ldc_stats; D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); /* dring mode has been negotiated in attr phase; save in stats */ statsp->dring_mode = lp->dring_mode; if (lp->dring_mode == VIO_RX_DRING_DATA) { /* * Change the transmit routine for RxDringData mode. 
*/ port->transmit = vsw_dringsend_shm; msg = (void *) vsw_create_rx_dring_info(ldcp); if (msg == NULL) { return; } msgsize = VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies); ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ, vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri); ldcp->rx_dringdata = vsw_process_dringdata_shm; } else { msg = (void *) vsw_create_tx_dring_info(ldcp); if (msg == NULL) { return; } msgsize = sizeof (vio_dring_reg_msg_t); ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ, vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri); ldcp->rx_dringdata = vsw_process_dringdata; } lp->lstate |= VSW_DRING_INFO_SENT; DUMP_TAG_PTR((vio_msg_tag_t *)msg); (void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE); kmem_free(msg, msgsize); D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); } static void vsw_send_rdx(vsw_ldc_t *ldcp) { vsw_t *vswp = ldcp->ldc_vswp; vio_rdx_msg_t rdx_msg; D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; rdx_msg.tag.vio_subtype_env = VIO_RDX; rdx_msg.tag.vio_sid = ldcp->local_session; ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT; DUMP_TAG(rdx_msg.tag); (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE); D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); } /* * Remove the specified address from the list of address maintained * in this port node. */ mcst_addr_t * vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) { vsw_t *vswp = NULL; vsw_port_t *port = NULL; mcst_addr_t *prev_p = NULL; mcst_addr_t *curr_p = NULL; D1(NULL, "%s: enter : devtype %d : addr 0x%llx", __func__, devtype, addr); if (devtype == VSW_VNETPORT) { port = (vsw_port_t *)arg; mutex_enter(&port->mca_lock); prev_p = curr_p = port->mcap; } else { vswp = (vsw_t *)arg; mutex_enter(&vswp->mca_lock); prev_p = curr_p = vswp->mcap; } while (curr_p != NULL) { if (curr_p->addr == addr) { D2(NULL, "%s: address found", __func__); /* match found */ if (prev_p == curr_p) { /* list head */ if (devtype == VSW_VNETPORT) port->mcap = curr_p->nextp; else vswp->mcap = curr_p->nextp; } else { prev_p->nextp = curr_p->nextp; } break; } else { prev_p = curr_p; curr_p = curr_p->nextp; } } if (devtype == VSW_VNETPORT) mutex_exit(&port->mca_lock); else mutex_exit(&vswp->mca_lock); D1(NULL, "%s: exit", __func__); return (curr_p); } /* * Create a ring consisting of just a private portion and link * it into the list of rings for the outbound lane. * * These type of rings are used primarily for temporary data * storage (i.e. as data buffers). */ void vsw_create_privring(vsw_ldc_t *ldcp) { dring_info_t *dp; vsw_t *vswp = ldcp->ldc_vswp; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); ldcp->lane_out.dringp = dp; /* no public section */ dp->pub_addr = NULL; dp->priv_addr = kmem_zalloc( (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP); dp->num_descriptors = vsw_num_descriptors; if (vsw_setup_tx_dring(ldcp, dp)) { DERR(vswp, "%s: setup of ring failed", __func__); vsw_destroy_tx_dring(ldcp); return; } /* haven't used any descriptors yet */ dp->end_idx = 0; dp->restart_reqd = B_TRUE; D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } /* * Set the default lane attributes. These are copied into * the attr msg we send to our peer. If they are not acceptable * then (currently) the handshake ends. 
*/ static void vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) { bzero(lp, sizeof (lane_t)); READ_ENTER(&vswp->if_lockrw); ether_copy(&(vswp->if_addr), &(lp->addr)); RW_EXIT(&vswp->if_lockrw); lp->mtu = vswp->max_frame_size; lp->addr_type = ADDR_TYPE_MAC; lp->xfer_mode = VIO_DRING_MODE_V1_0; lp->ack_freq = 0; /* for shared mode */ lp->seq_num = VNET_ISS; } /* * Map the descriptor ring exported by the peer. */ static dring_info_t * vsw_map_dring(vsw_ldc_t *ldcp, void *pkt) { dring_info_t *dp = NULL; lane_t *lp = &ldcp->lane_out; if (lp->dring_mode == VIO_RX_DRING_DATA) { /* * In RxDringData mode, dring that we map in * becomes our transmit descriptor ring. */ dp = vsw_map_tx_dring(ldcp, pkt); } else { /* * In TxDring mode, dring that we map in * becomes our receive descriptor ring. */ dp = vsw_map_rx_dring(ldcp, pkt); } return (dp); } /* * Common dring mapping function used in both TxDring and RxDringData modes. */ dring_info_t * vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt) { int rv; dring_info_t *dp; ldc_mem_info_t minfo; vsw_t *vswp = ldcp->ldc_vswp; /* * If the dring params are unacceptable then we NACK back. */ if ((dring_pkt->num_descriptors == 0) || (dring_pkt->descriptor_size == 0) || (dring_pkt->ncookies != 1)) { DERR(vswp, "%s (%lld): invalid dring info", __func__, ldcp->ldc_id); return (NULL); } dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); dp->num_descriptors = dring_pkt->num_descriptors; dp->descriptor_size = dring_pkt->descriptor_size; dp->options = dring_pkt->options; dp->dring_ncookies = dring_pkt->ncookies; /* * Note: should only get one cookie. Enforced in * the ldc layer. */ bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0], sizeof (ldc_mem_cookie_t)); rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0], dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size, LDC_DIRECT_MAP, &(dp->dring_handle)); if (rv != 0) { goto fail; } rv = ldc_mem_dring_info(dp->dring_handle, &minfo); if (rv != 0) { goto fail; } /* store the address of the ring */ dp->pub_addr = minfo.vaddr; /* cache the dring mtype */ dp->dring_mtype = minfo.mtype; /* no private section as we are importing */ dp->priv_addr = NULL; /* * Using simple mono increasing int for ident at the moment. */ dp->ident = ldcp->next_ident; ldcp->next_ident++; /* * Acknowledge it; we send back a unique dring identifier that * the sending side will use in future to refer to this * descriptor ring. */ dring_pkt->dring_ident = dp->ident; return (dp); fail: if (dp->dring_handle != 0) { (void) ldc_mem_dring_unmap(dp->dring_handle); } kmem_free(dp, sizeof (*dp)); return (NULL); } /* * Unmap the descriptor ring exported by the peer. */ static void vsw_unmap_dring(vsw_ldc_t *ldcp) { lane_t *lane_out = &ldcp->lane_out; if (lane_out->dring_mode == VIO_RX_DRING_DATA) { vsw_unmap_tx_dring(ldcp); } else { vsw_unmap_rx_dring(ldcp); } } /* * Map the shared memory data buffer area exported by the peer. * Used in RxDringData mode only. 
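 *
 * Message layout assumed by the parsing below:
 *
 *	vio_dring_reg_msg_t		ring geometry + dring cookie
 *	vio_dring_reg_ext_msg_t		data_area_size, data_ncookies,
 *					data_cookie[]
 *
 * The data area is mapped once with LDC_DIRECT_MAP/LDC_MEM_R, so
 * received frames can be read straight out of the peer's transmit
 * buffers rather than being pulled in per packet with ldc_mem_copy().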
*/ static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt) { int rv; vio_dring_reg_ext_msg_t *emsg; vio_dring_reg_msg_t *msg = pkt; uint8_t *buf = (uint8_t *)msg->cookie; vsw_t *vswp = ldcp->ldc_vswp; ldc_mem_info_t minfo; /* skip over dring cookies */ ASSERT(msg->ncookies == 1); buf += (msg->ncookies * sizeof (ldc_mem_cookie_t)); emsg = (vio_dring_reg_ext_msg_t *)buf; if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) { return (1); } /* save # of data area cookies */ dp->data_ncookies = emsg->data_ncookies; /* save data area size */ dp->data_sz = emsg->data_area_size; /* allocate ldc mem handle for data area */ rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle); if (rv != 0) { cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n"); DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n", __func__, ldcp->ldc_id, rv); return (1); } /* map the data area */ rv = ldc_mem_map(dp->data_handle, emsg->data_cookie, emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R, (caddr_t *)&dp->data_addr, NULL); if (rv != 0) { cmn_err(CE_WARN, "ldc_mem_map failed\n"); DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n", __func__, ldcp->ldc_id, rv); return (1); } /* get the map info */ rv = ldc_mem_info(dp->data_handle, &minfo); if (rv != 0) { cmn_err(CE_WARN, "ldc_mem_info failed\n"); DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n", __func__, ldcp->ldc_id, rv); return (1); } if (minfo.mtype != LDC_DIRECT_MAP) { DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n", __func__, ldcp->ldc_id, minfo.mtype); return (1); } /* allocate memory for data area cookies */ dp->data_cookie = kmem_zalloc(emsg->data_ncookies * sizeof (ldc_mem_cookie_t), KM_SLEEP); /* save data area cookies */ bcopy(emsg->data_cookie, dp->data_cookie, emsg->data_ncookies * sizeof (ldc_mem_cookie_t)); return (0); } /* * Reset and free all the resources associated with the channel. */ static void vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) { lane_t *lp; D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); if (dir == INBOUND) { D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" " of channel %lld", __func__, ldcp->ldc_id); lp = &ldcp->lane_in; } else { D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" " of channel %lld", __func__, ldcp->ldc_id); lp = &ldcp->lane_out; } lp->lstate = VSW_LANE_INACTIV; lp->seq_num = VNET_ISS; if (dir == INBOUND) { /* Unmap the remote dring which is imported from the peer */ vsw_unmap_dring(ldcp); } else { /* Destroy the local dring which is exported to the peer */ vsw_destroy_dring(ldcp); } D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); } /* * Destroy the descriptor ring. */ static void vsw_destroy_dring(vsw_ldc_t *ldcp) { lane_t *lp = &ldcp->lane_out; if (lp->dring_mode == VIO_RX_DRING_DATA) { vsw_destroy_rx_dring(ldcp); } else { vsw_destroy_tx_dring(ldcp); } } /* * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data. * This thread is woken up by the vsw_portsend to transmit * packets. */ static void vsw_ldc_tx_worker(void *arg) { callb_cpr_t cprinfo; vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; vsw_t *vswp = ldcp->ldc_vswp; mblk_t *mp; mblk_t *tmp; D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id); CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr, "vnet_tx_thread"); mutex_enter(&ldcp->tx_thr_lock); while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) { CALLB_CPR_SAFE_BEGIN(&cprinfo); /* * Wait until the data is received or a stop * request is received. 
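 *
 * This is the usual condition-variable idiom: the predicate is
 * re-checked in a loop around cv_wait() (wakeups can be spurious),
 * and the wait sits inside the CALLB_CPR_SAFE_BEGIN/END bracket so
 * the thread can be suspended for CPR while it is idle.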
/*
 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
 * This thread is woken up by the vsw_portsend to transmit
 * packets.
 */
static void
vsw_ldc_tx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	mblk_t		*mp;
	mblk_t		*tmp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vnet_tx_thread");
	mutex_enter(&ldcp->tx_thr_lock);
	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until the data is received or a stop
		 * request is received.
		 */
		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
		    (ldcp->tx_mhead == NULL)) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):tx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		mp = ldcp->tx_mhead;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		ldcp->tx_cnt = 0;
		mutex_exit(&ldcp->tx_thr_lock);
		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
		    __func__, ldcp->ldc_id);
		while (mp != NULL) {
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
			mp = tmp;
		}
		mutex_enter(&ldcp->tx_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 */
	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
	ldcp->tx_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}

/* vsw_stop_tx_thread -- Co-ordinate with the tx thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
	kt_did_t	tid = 0;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);

	/*
	 * Send a stop request by setting the stop flag and
	 * wait until the tx thread stops.
	 */
	mutex_enter(&ldcp->tx_thr_lock);
	if (ldcp->tx_thread != NULL) {
		tid = ldcp->tx_thread->t_did;
		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->tx_thr_cv);
	}
	mutex_exit(&ldcp->tx_thr_lock);

	if (tid != 0) {
		thread_join(tid);
	}

	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}
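
/*
 * Check whether the channel has enough direct map-in space for
 * RxDringData mode: one data block of RXDRING_DBLK_SZ(max_frame_size)
 * bytes is needed for each of the VSW_RXDRING_NRBUFS receive buffers.
 * For illustration only (hypothetical values, not the actual tunables):
 * 2048 buffers of 2048 bytes each would require 4MB of map-in space,
 * which is compared against the channel's direct_map_size_max.
 */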
static int
vsw_mapin_avail(vsw_ldc_t *ldcp)
{
	int		rv;
	ldc_info_t	info;
	uint64_t	mapin_sz_req;
	uint64_t	dblk_sz;
	vsw_t		*vswp = ldcp->ldc_vswp;

	rv = ldc_info(ldcp->ldc_handle, &info);
	if (rv != 0) {
		return (B_FALSE);
	}

	dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
	mapin_sz_req = (VSW_RXDRING_NRBUFS * dblk_sz);

	if (info.direct_map_size_max >= mapin_sz_req) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Debugging routines
 */
static void
display_state(void)
{
	vsw_t		*vswp;
	vsw_port_list_t	*plist;
	vsw_port_t	*port;
	vsw_ldc_t	*ldcp;
	extern vsw_t	*vsw_head;

	cmn_err(CE_NOTE, "***** system state *****");

	for (vswp = vsw_head; vswp; vswp = vswp->next) {
		plist = &vswp->plist;
		READ_ENTER(&plist->lockrw);
		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
		    vswp->instance, plist->num_ports);

		for (port = plist->head; port != NULL; port = port->p_next) {
			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
			    port->p_instance, port->num_ldcs);
			ldcp = port->ldcp;
			cmn_err(CE_CONT, "chan %lu : dev %d : "
			    "status %d : phase %u\n",
			    ldcp->ldc_id, ldcp->dev_class,
			    ldcp->ldc_status, ldcp->hphase);
			cmn_err(CE_CONT, "chan %lu : lsession %lu : "
			    "psession %lu\n", ldcp->ldc_id,
			    ldcp->local_session, ldcp->peer_session);

			cmn_err(CE_CONT, "Inbound lane:\n");
			display_lane(&ldcp->lane_in);
			cmn_err(CE_CONT, "Outbound lane:\n");
			display_lane(&ldcp->lane_out);
		}
		RW_EXIT(&plist->lockrw);
	}
	cmn_err(CE_NOTE, "***** system state *****");
}

static void
display_lane(lane_t *lp)
{
	dring_info_t	*drp = lp->dringp;

	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
	    lp->addr_type, lp->addr, lp->xfer_mode);
	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);

	cmn_err(CE_CONT, "Dring info:\n");
	cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
	    drp->num_descriptors, drp->descriptor_size);
	cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
	cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
	    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
	cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
	    drp->ident, drp->end_idx);
	display_ring(drp);
}

static void
display_ring(dring_info_t *dringp)
{
	uint64_t		i;
	uint64_t		priv_count = 0;
	uint64_t		pub_count = 0;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;

	for (i = 0; i < vsw_num_descriptors; i++) {
		if (dringp->pub_addr != NULL) {
			pub_addr = (vnet_public_desc_t *)
			    dringp->pub_addr + i;

			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
				pub_count++;
		}

		if (dringp->priv_addr != NULL) {
			priv_addr = (vsw_private_desc_t *)
			    dringp->priv_addr + i;

			if (priv_addr->dstate == VIO_DESC_FREE)
				priv_count++;
		}
	}
	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
	    i, priv_count, pub_count);
}

static void
dump_flags(uint64_t state)
{
	int	i;

	typedef struct flag_name {
		int	flag_val;
		char	*flag_name;
	} flag_name_t;

	flag_name_t flags[] = {
		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};

	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
		if (state & flags[i].flag_val)
			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
	}
}