/* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* * This file contains code imported from the OFED rds source file ib.c * Oracle elects to have and use the contents of ib.c under and governed * by the OpenIB.org BSD license (see below for full license text). However, * the following notice accompanied the original version of this file: */ /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #include #include #include #include #include #include unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT; struct list rdsv3_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ kmutex_t ib_nodev_conns_lock; list_t ib_nodev_conns; extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags); extern void rdsv3_ib_frag_destructor(void *buf, void *arg); void rdsv3_ib_add_one(ib_device_t *device) { struct rdsv3_ib_device *rds_ibdev; ibt_hca_attr_t *dev_attr; char name[64]; RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device); /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return; dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr), KM_NOSLEEP); if (!dev_attr) return; if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) { RDSV3_DPRINTF2("rdsv3_ib_add_one", "Query device failed for %s", device->name); goto free_attr; } /* We depend on Reserved Lkey */ if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) { RDSV3_DPRINTF2("rdsv3_ib_add_one", "Reserved Lkey support is required: %s", device->name); goto free_attr; } rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP); if (!rds_ibdev) goto free_attr; rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device); rds_ibdev->hca_attr = *dev_attr; rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL); mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp; rds_ibdev->max_responder_resources = (uint_t)dev_attr->hca_max_rdma_in_qp; rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); if (IS_ERR(rds_ibdev->pd)) goto free_dev; if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) { goto free_dev; } if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) { rdsv3_ib_destroy_mr_pool(rds_ibdev); goto free_dev; } (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx", 
(longlong_t)htonll(dev_attr->hca_node_guid)); rds_ibdev->ib_frag_slab = kmem_cache_create(name, sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor, rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0); if (rds_ibdev->ib_frag_slab == NULL) { RDSV3_DPRINTF2("rdsv3_ib_add_one", "kmem_cache_create for ib_frag_slab failed for device: %s", device->name); rdsv3_ib_destroy_mr_pool(rds_ibdev); rdsv3_ib_destroy_inc_pool(rds_ibdev); goto free_dev; } rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl, (uint64_t)rds_ibdev->hca_attr.hca_node_guid); if (rds_ibdev->aft_hcagp == NULL) { rdsv3_ib_destroy_mr_pool(rds_ibdev); rdsv3_ib_destroy_inc_pool(rds_ibdev); kmem_cache_destroy(rds_ibdev->ib_frag_slab); goto free_dev; } rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn, (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU, rds_ibdev->aft_hcagp); if (rds_ibdev->fmr_soft_cq == NULL) { rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); rdsv3_ib_destroy_mr_pool(rds_ibdev); rdsv3_ib_destroy_inc_pool(rds_ibdev); kmem_cache_destroy(rds_ibdev->ib_frag_slab); goto free_dev; } rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist, (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU, rds_ibdev->aft_hcagp); if (rds_ibdev->inc_soft_cq == NULL) { rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); rdsv3_ib_destroy_mr_pool(rds_ibdev); rdsv3_ib_destroy_inc_pool(rds_ibdev); kmem_cache_destroy(rds_ibdev->ib_frag_slab); goto free_dev; } list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), offsetof(struct rdsv3_ib_ipaddr, list)); list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection), offsetof(struct rdsv3_ib_connection, ib_node)); list_insert_tail(&rdsv3_ib_devices, rds_ibdev); ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev); RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device); goto free_attr; err_pd: (void) ib_dealloc_pd(rds_ibdev->pd); free_dev: 
	mutex_destroy(&rds_ibdev->spinlock);
	rw_destroy(&rds_ibdev->rwlock);
	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
free_attr:
	kmem_free(dev_attr, sizeof (*dev_attr));
}

/*
 * ib_client "remove" callback: undoes everything rdsv3_ib_add_one() set
 * up for this HCA -- frees cached IP addresses, tears down connections,
 * soft-CQ threads, MR/inc pools, the frag cache, the affinity group and
 * the PD, then unlinks and frees the rdsv3_ib_device itself.
 * No-op if no client data was ever attached to the device.
 */
void
rdsv3_ib_remove_one(struct ib_device *device)
{
	struct rdsv3_ib_device *rds_ibdev;
	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;

	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);

	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
	if (!rds_ibdev)
		return;

	/* Free every cached IP address bound to this device. */
	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next,
	    &rds_ibdev->ipaddr_list, list) {
		list_remove_node(&i_ipaddr->list);
		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
	}

	rdsv3_ib_destroy_conns(rds_ibdev);

	/* Soft-CQ threads may not exist if add_one failed part-way. */
	if (rds_ibdev->fmr_soft_cq)
		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
	if (rds_ibdev->inc_soft_cq)
		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);

	rdsv3_ib_destroy_mr_pool(rds_ibdev);
	rdsv3_ib_destroy_inc_pool(rds_ibdev);

	kmem_cache_destroy(rds_ibdev->ib_frag_slab);

	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);

#if 0
	/* Disabled retry loop kept from the original port for reference. */
	while (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
		    "%s-%d Failed to dealloc pd %p",
		    __func__, __LINE__, rds_ibdev->pd);
#endif
		delay(drv_usectohz(1000));
	}
#else
	/* Best effort: a failed PD dealloc is only logged, not retried. */
	if (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
#endif
	}
#endif

	list_destroy(&rds_ibdev->ipaddr_list);
	list_destroy(&rds_ibdev->conn_list);
	list_remove_node(&rds_ibdev->list);
	mutex_destroy(&rds_ibdev->spinlock);
	rw_destroy(&rds_ibdev->rwlock);
	kmem_free(rds_ibdev, sizeof (*rds_ibdev));

	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
}

/*
 * IB client registration record; the lock_lint variant uses positional
 * initializers because lock_lint cannot parse designated initializers.
 */
#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
	.name = "rdsv3_ib",
	.add = rdsv3_ib_add_one,
	.remove = rdsv3_ib_remove_one,
	.clnt_hdl = NULL,
	.state = IB_CLNT_UNINITIALIZED
};
#else
struct ib_client rdsv3_ib_client = {
	"rdsv3_ib",
	rdsv3_ib_add_one,
	rdsv3_ib_remove_one,
	NULL,
	NULL,
	IB_CLNT_UNINITIALIZED
};
#endif

/*
 * Per-connection visitor for the RDS_INFO_IB_CONNECTIONS request:
 * fills one rds_info_rdma_connection record per IB connection.
 */
static int
rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
void *buffer) { struct rds_info_rdma_connection *iinfo = buffer; struct rdsv3_ib_connection *ic; RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", conn, buffer); /* We will only ever look at IB transports */ if (conn->c_trans != &rdsv3_ib_transport) return (0); iinfo->src_addr = conn->c_laddr; iinfo->dst_addr = conn->c_faddr; (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid)); (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid)); if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { struct rdsv3_ib_device *rds_ibdev; struct rdma_dev_addr *dev_addr; ic = conn->c_transport_data; dev_addr = &ic->i_cm_id->route.addr.dev_addr; ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid); ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid); rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; } RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", conn, buffer); return (1); } static void rds_ib_ic_info(struct rsock *sock, unsigned int len, struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) { RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d", sock, iter, lens, len); rdsv3_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor, sizeof (struct rds_info_rdma_connection)); } /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. * * If it were me, I'd advocate for something more flexible. Sending and * receiving should be device-agnostic. Transports would try and maintain * connections between peers who have messages queued. Userspace would be * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". 
 */
/*
 * Check whether a local address is usable by RDS/IB: create a temporary
 * RDMA CM id and try to bind it to the address, then reject anything
 * that did not land on an IB CA (filters out iWARP devices).
 * Returns 0 if the address is bindable on an IB device, else
 * -EADDRNOTAVAIL (or the rdma_bind_addr error, mapped below).
 */
static int
rds_ib_laddr_check(uint32_be_t addr)
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;

	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));

	/*
	 * Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
	/*
	 * NOTE(review): this assumes the sol_ofs rdma_create_id() returns
	 * NULL on failure (unlike Linux, which returns ERR_PTR) -- confirm
	 * against the shim before changing this check.
	 */
	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
	if (!cm_id)
		return (-EADDRNOTAVAIL);

	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	/*
	 * due to this, we will claim to support iWARP devices unless we
	 * check node_type.
	 */
	/* Short-circuit: cm_id->device is only dereferenced when ret == 0. */
	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	RDSV3_DPRINTF5("rds_ib_laddr_check",
	    "addr %u.%u.%u.%u ret %d node type %d",
	    NIPQUAD(addr), ret,
	    cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);

	return (ret);
}

/*
 * Module teardown: deregister the info handler, drop no-device
 * connections, unregister from the IB framework and the RDS transport
 * layer, then free the per-CPU statistics and global lists (the reverse
 * of rdsv3_ib_init()).
 */
void
rdsv3_ib_exit(void)
{
	RDSV3_DPRINTF4("rds_ib_exit", "Enter");

	rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
	rdsv3_ib_destroy_nodev_conns();
	ib_unregister_client(&rdsv3_ib_client);
	rdsv3_ib_sysctl_exit();
	rdsv3_ib_recv_exit();
	rdsv3_trans_unregister(&rdsv3_ib_transport);
	/*
	 * NOTE(review): rdsv3_ib_stats is freed but not reset to NULL here,
	 * while rdsv3_ib_init() ASSERTs it is NULL -- verify init can never
	 * run again after exit, or clear the pointer.
	 */
	kmem_free(rdsv3_ib_stats,
	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);

	RDSV3_DPRINTF4("rds_ib_exit", "Return");
}

/*
 * RDS transport operations vector for InfiniBand; positional variant
 * below exists only to keep lock_lint happy.
 */
#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
	.laddr_check = rds_ib_laddr_check,
	.xmit_complete = rdsv3_ib_xmit_complete,
	.xmit = rdsv3_ib_xmit,
	.xmit_cong_map = NULL,
	.xmit_rdma = rdsv3_ib_xmit_rdma,
	.recv = rdsv3_ib_recv,
	.conn_alloc = rdsv3_ib_conn_alloc,
	.conn_free = rdsv3_ib_conn_free,
	.conn_connect = rdsv3_ib_conn_connect,
	.conn_shutdown = rdsv3_ib_conn_shutdown,
	.inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
	.inc_free = rdsv3_ib_inc_free,
	.cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
	.cm_handle_connect =
	    rdsv3_ib_cm_handle_connect,
	.cm_connect_complete = rdsv3_ib_cm_connect_complete,
	.stats_info_copy = rdsv3_ib_stats_info_copy,
	.exit = rdsv3_ib_exit,
	.get_mr = rdsv3_ib_get_mr,
	.sync_mr = rdsv3_ib_sync_mr,
	.free_mr = rdsv3_ib_free_mr,
	.flush_mrs = rdsv3_ib_flush_mrs,
	.t_name = "infiniband",
	.t_type = RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif

/*
 * Module init: set up the global device/connection lists and lock,
 * allocate per-CPU statistics, register with the IB framework, then
 * bring up sysctl, recv, the transport registration, and finally the
 * info handler.  On any failure the goto chain unwinds every step
 * completed so far, in reverse order.
 * Returns 0 on success or the failing step's error code.
 */
int
rdsv3_ib_init(void)
{
	int ret;

	RDSV3_DPRINTF4("rds_ib_init", "Enter");

	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
	    offsetof(struct rdsv3_ib_device, list));
	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
	    offsetof(struct rdsv3_ib_connection, ib_node));
	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);

	/* allocate space for ib statistics */
	ASSERT(rdsv3_ib_stats == NULL);
	rdsv3_ib_stats = kmem_zalloc(nr_cpus *
	    sizeof (struct rdsv3_ib_statistics), KM_SLEEP);

	rdsv3_ib_client.dip = rdsv3_dev_info;
	ret = ib_register_client(&rdsv3_ib_client);
	if (ret)
		goto out;

	ret = rdsv3_ib_sysctl_init();
	if (ret)
		goto out_ibreg;

	ret = rdsv3_ib_recv_init();
	if (ret)
		goto out_sysctl;

	ret = rdsv3_trans_register(&rdsv3_ib_transport);
	if (ret)
		goto out_recv;

	rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);

	RDSV3_DPRINTF4("rds_ib_init", "Return");

	return (0);

	/* Unwind in reverse order of setup. */
out_recv:
	rdsv3_ib_recv_exit();
out_sysctl:
	rdsv3_ib_sysctl_exit();
out_ibreg:
	ib_unregister_client(&rdsv3_ib_client);
out:
	kmem_free(rdsv3_ib_stats,
	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);
	return (ret);
}