/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uint_t rdsv3_one_sec_in_hz; ddi_taskq_t *rdsv3_taskq = NULL; extern kmem_cache_t *rdsv3_alloc_cache; extern unsigned int ip_ocsum(ushort_t *address, int halfword_count, unsigned int sum); /* * Check if the IP interface named by `lifrp' is RDS-capable. */ boolean_t rdsv3_capable_interface(struct lifreq *lifrp) { char ifname[LIFNAMSIZ]; char drv[MAXLINKNAMELEN]; uint_t ppa; char *cp; RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter"); if (lifrp->lifr_type == IFT_IB) return (B_TRUE); /* * Strip off the logical interface portion before getting * intimate with the name. */ (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); if ((cp = strchr(ifname, ':')) != NULL) *cp = '\0'; if (strcmp("lo0", ifname) == 0) { /* * loopback is considered RDS-capable */ return (B_TRUE); } return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && rdsv3_if_lookup_by_name(drv)); } int rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs) { struct lifnum lifn; struct lifconf lifc; struct lifreq *lp, *rlp, lifr; int rval = 0; int numifs; int bufsize, rbufsize; void *buf, *rbuf; int i, j, n, rc; *ipaddrs = NULL; *size = 0; *nifs = 0; RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter"); retry_count: /* snapshot the current number of interfaces */ lifn.lifn_family = PF_UNSPEC; lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; lifn.lifn_count = 0; rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED()); if (rval != 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "ksocket_ioctl returned: %d", rval); return (rval); } numifs = lifn.lifn_count; if (numifs <= 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found"); return (0); } /* allocate extra room in case more interfaces appear */ numifs += 10; /* get the interface names and ip addresses */ bufsize = numifs * sizeof (struct lifreq); buf = kmem_alloc(bufsize, KM_SLEEP); lifc.lifc_family = AF_UNSPEC; lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; lifc.lifc_len = bufsize; lifc.lifc_buf = buf; rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED()); if (rc != 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed"); kmem_free(buf, bufsize); return (rc); } /* if our extra room is used up, try again */ if (bufsize <= lifc.lifc_len) { kmem_free(buf, bufsize); buf = NULL; goto retry_count; } /* calc actual number of ifconfs */ n = lifc.lifc_len / sizeof (struct lifreq); /* * Count the RDS interfaces */ for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) { /* * Copy as the SIOCGLIFFLAGS ioctl is destructive */ bcopy(lp, &lifr, sizeof (struct lifreq)); /* * fetch the flags using the socket of the correct family */ switch (lifr.lifr_addr.ss_family) { case AF_INET: rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, &rval, CRED()); break; default: continue; } if (rc != 0) continue; /* * If we got the flags, skip uninteresting * interfaces based on flags */ if ((lifr.lifr_flags & IFF_UP) != IFF_UP) continue; if (lifr.lifr_flags & (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) continue; if (!rdsv3_capable_interface(&lifr)) continue; j++; } if (j <= 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces"); kmem_free(buf, bufsize); return (rval); } numifs = j; /* This is the buffer we pass back */ rbufsize = numifs * sizeof (struct lifreq); rbuf = kmem_alloc(rbufsize, KM_SLEEP); rlp = (struct lifreq *)rbuf; /* * Examine the array of interfaces and filter uninteresting ones */ for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) { /* * Copy the address as the SIOCGLIFFLAGS ioctl is destructive */ bcopy(lp, &lifr, sizeof (struct lifreq)); /* * fetch the flags using the socket of the correct family */ switch (lifr.lifr_addr.ss_family) { case AF_INET: rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, &rval, CRED()); break; default: continue; } if (rc != 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "ksocket_ioctl failed" " for %s", lifr.lifr_name); continue; } /* * If we got the flags, skip uninteresting * interfaces based on flags */ if ((lifr.lifr_flags & IFF_UP) != IFF_UP) continue; if (lifr.lifr_flags & (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) continue; if (!rdsv3_capable_interface(&lifr)) continue; /* save the record */ bcopy(lp, rlp, sizeof (struct lifreq)); rlp->lifr_addr.ss_family = AF_INET_OFFLOAD; rlp++; } kmem_free(buf, bufsize); *ipaddrs = rbuf; *size = rbufsize; *nifs = numifs; RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return"); return (rval); } /* * Check if the IP interface named by `ifrp' is RDS-capable. */ boolean_t rdsv3_capable_interface_old(struct ifreq *ifrp) { char ifname[IFNAMSIZ]; char drv[MAXLINKNAMELEN]; uint_t ppa; char *cp; RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter"); /* * Strip off the logical interface portion before getting * intimate with the name. */ (void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ); if ((cp = strchr(ifname, ':')) != NULL) *cp = '\0'; RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname); if ((strcmp("lo0", ifname) == 0) || (strncmp("ibd", ifname, 3) == 0)) { /* * loopback and IB are considered RDS-capable */ return (B_TRUE); } return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && rdsv3_if_lookup_by_name(drv)); } int rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs) { uint_t ifn; struct ifconf ifc; struct ifreq *lp, *rlp, ifr; int rval = 0; int numifs; int bufsize, rbufsize; void *buf, *rbuf; int i, j, n, rc; *ipaddrs = NULL; *size = 0; *nifs = 0; RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter"); retry_count: rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval, CRED()); if (rval != 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval); return (rval); } numifs = ifn; if (numifs <= 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found"); return (0); } /* allocate extra room in case more interfaces appear */ numifs += 10; /* get the interface names and ip addresses */ bufsize = numifs * sizeof (struct ifreq); buf = kmem_alloc(bufsize, KM_SLEEP); ifc.ifc_len = bufsize; ifc.ifc_buf = buf; rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED()); if (rc != 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "SIOCGLIFCONF failed: %d", rc); kmem_free(buf, bufsize); return (rc); } /* if our extra room is used up, try again */ if (bufsize <= ifc.ifc_len) { kmem_free(buf, bufsize); buf = NULL; goto retry_count; } /* calc actual number of ifconfs */ n = ifc.ifc_len / sizeof (struct ifreq); /* * Count the RDS interfaces */ for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) { /* * Copy as the SIOCGIFFLAGS ioctl is destructive */ bcopy(lp, &ifr, sizeof (struct ifreq)); /* * fetch the flags using the socket of the correct family */ switch (ifr.ifr_addr.sa_family) { case AF_INET: rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, &rval, CRED()); break; default: continue; } if (rc != 0) continue; RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "1. ifr_name: %s, flags: %d", ifr.ifr_name, (ushort_t)ifr.ifr_flags); /* * If we got the flags, skip uninteresting * interfaces based on flags */ if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) continue; RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "2. ifr_name: %s, flags: %d", ifr.ifr_name, (ushort_t)ifr.ifr_flags); if (((ushort_t)ifr.ifr_flags) & (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) continue; RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "3. ifr_name: %s, flags: %d", ifr.ifr_name, (ushort_t)ifr.ifr_flags); if (!rdsv3_capable_interface_old(&ifr)) continue; RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "4. ifr_name: %s, flags: %d", ifr.ifr_name, (ushort_t)ifr.ifr_flags); j++; } if (j <= 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces"); kmem_free(buf, bufsize); return (rval); } numifs = j; /* This is the buffer we pass back */ rbufsize = numifs * sizeof (struct ifreq); rbuf = kmem_alloc(rbufsize, KM_SLEEP); rlp = (struct ifreq *)rbuf; /* * Examine the array of interfaces and filter uninteresting ones */ for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) { /* * Copy the address as the SIOCGIFFLAGS ioctl is destructive */ bcopy(lp, &ifr, sizeof (struct ifreq)); /* * fetch the flags using the socket of the correct family */ switch (ifr.ifr_addr.sa_family) { case AF_INET: rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, &rval, CRED()); break; default: continue; } if (rc != 0) { RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "ksocket_ioctl failed: %d for %s", rc, ifr.ifr_name); continue; } /* * If we got the flags, skip uninteresting * interfaces based on flags */ if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) continue; if (((ushort_t)ifr.ifr_flags) & (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) continue; if (!rdsv3_capable_interface_old(&ifr)) continue; /* save the record */ bcopy(lp, rlp, sizeof (struct ifreq)); rlp->ifr_addr.sa_family = AF_INET_OFFLOAD; rlp++; } kmem_free(buf, bufsize); *ipaddrs = rbuf; *size = rbufsize; *nifs = numifs; RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return"); return (rval); } boolean_t rdsv3_isloopback(ipaddr_t addr) { ip_stack_t *ipst; ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; ASSERT(ipst != NULL); if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) { netstack_rele(ipst->ips_netstack); return (B_FALSE); } netstack_rele(ipst->ips_netstack); return (B_TRUE); } /* * Work Queue Implementation */ #define RDSV3_WQ_THREAD_IDLE 0 #define RDSV3_WQ_THREAD_RUNNING 1 #define RDSV3_WQ_THREAD_FLUSHING 2 #define RDSV3_WQ_THREAD_EXITING 3 /* worker thread */ void rdsv3_worker_thread(void *arg) { rdsv3_workqueue_struct_t *wq = arg; rdsv3_work_t *work; RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq); mutex_enter(&wq->wq_lock); work = list_remove_head(&wq->wq_queue); while (work) { mutex_exit(&wq->wq_lock); /* process work */ work->func(work); mutex_enter(&wq->wq_lock); work = list_remove_head(&wq->wq_queue); } /* No more work, go home, until called again */ if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) { wq->wq_state = RDSV3_WQ_THREAD_IDLE; } mutex_exit(&wq->wq_lock); RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq); } /* XXX */ void rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq) { RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq); mutex_enter(&wq->wq_lock); switch (wq->wq_state) { case RDSV3_WQ_THREAD_IDLE: /* nothing to do */ ASSERT(list_is_empty(&wq->wq_queue)); break; case RDSV3_WQ_THREAD_RUNNING: wq->wq_state = RDSV3_WQ_THREAD_FLUSHING; /* FALLTHRU */ case RDSV3_WQ_THREAD_FLUSHING: /* already flushing, wait until the flushing is complete */ do { mutex_exit(&wq->wq_lock); delay(drv_usectohz(1000000)); mutex_enter(&wq->wq_lock); } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); break; case RDSV3_WQ_THREAD_EXITING: mutex_exit(&wq->wq_lock); rdsv3_worker_thread(wq); return; } mutex_exit(&wq->wq_lock); RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq); } void rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp) { RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp); mutex_enter(&wq->wq_lock); if (list_link_active(&wp->work_item)) { /* This is already in the queue, ignore this call */ mutex_exit(&wq->wq_lock); RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp); return; } switch (wq->wq_state) { case RDSV3_WQ_THREAD_RUNNING: list_insert_tail(&wq->wq_queue, wp); mutex_exit(&wq->wq_lock); break; case RDSV3_WQ_THREAD_FLUSHING: do { mutex_exit(&wq->wq_lock); delay(drv_usectohz(1000000)); mutex_enter(&wq->wq_lock); } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) { list_insert_tail(&wq->wq_queue, wp); mutex_exit(&wq->wq_lock); break; } /* FALLTHRU */ case RDSV3_WQ_THREAD_IDLE: list_insert_tail(&wq->wq_queue, wp); wq->wq_state = RDSV3_WQ_THREAD_RUNNING; mutex_exit(&wq->wq_lock); (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq, DDI_SLEEP); break; case RDSV3_WQ_THREAD_EXITING: mutex_exit(&wq->wq_lock); break; } RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp); } /* timeout handler for delayed work queuing */ void rdsv3_work_timeout_handler(void *arg) { rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg; RDSV3_DPRINTF4("rdsv3_work_timeout_handler", "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work); mutex_enter(&dwp->lock); dwp->timeid = 0; mutex_exit(&dwp->lock); mutex_enter(&dwp->wq->wq_lock); dwp->wq->wq_pending--; if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) { mutex_exit(&dwp->wq->wq_lock); return; } mutex_exit(&dwp->wq->wq_lock); rdsv3_queue_work(dwp->wq, &dwp->work); RDSV3_DPRINTF4("rdsv3_work_timeout_handler", "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work); } void rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq, rdsv3_delayed_work_t *dwp, uint_t delay) { RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Enter(wq: %p, wp: %p)", wq, dwp); if (delay == 0) { rdsv3_queue_work(wq, &dwp->work); return; } mutex_enter(&wq->wq_lock); if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) { mutex_exit(&wq->wq_lock); RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp); return; } wq->wq_pending++; mutex_exit(&wq->wq_lock); mutex_enter(&dwp->lock); if (dwp->timeid == 0) { dwp->wq = wq; dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp, jiffies + (delay * rdsv3_one_sec_in_hz)); mutex_exit(&dwp->lock); } else { mutex_exit(&dwp->lock); RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p", dwp); mutex_enter(&wq->wq_lock); wq->wq_pending--; mutex_exit(&wq->wq_lock); } RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Return(wq: %p, wp: %p)", wq, dwp); } void rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp) { RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", "Enter(wq: %p, dwp: %p)", dwp->wq, dwp); mutex_enter(&dwp->lock); if (dwp->timeid != 0) { (void) untimeout(dwp->timeid); dwp->timeid = 0; } else { RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp); mutex_exit(&dwp->lock); return; } mutex_exit(&dwp->lock); mutex_enter(&dwp->wq->wq_lock); dwp->wq->wq_pending--; mutex_exit(&dwp->wq->wq_lock); RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", "Return(wq: %p, dwp: %p)", dwp->wq, dwp); } void rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq) { RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter"); ASSERT(wq); mutex_enter(&wq->wq_lock); wq->wq_state = RDSV3_WQ_THREAD_EXITING; while (wq->wq_pending > 0) { mutex_exit(&wq->wq_lock); delay(drv_usectohz(1000000)); mutex_enter(&wq->wq_lock); }; mutex_exit(&wq->wq_lock); rdsv3_flush_workqueue(wq); list_destroy(&wq->wq_queue); mutex_destroy(&wq->wq_lock); kmem_free(wq, sizeof (rdsv3_workqueue_struct_t)); ASSERT(rdsv3_taskq); ddi_taskq_destroy(rdsv3_taskq); wq = NULL; rdsv3_taskq = NULL; RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return"); } /* ARGSUSED */ void rdsv3_rdma_init_worker(struct rdsv3_work_s *work) { rdsv3_rdma_init(); } #define RDSV3_NUM_TASKQ_THREADS 1 rdsv3_workqueue_struct_t * rdsv3_create_task_workqueue(char *name) { rdsv3_workqueue_struct_t *wq; RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)", rdsv3_dev_info); rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name, RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0); if (rdsv3_taskq == NULL) { RDSV3_DPRINTF2(__FILE__, "ddi_taskq_create failed for rdsv3_taskq"); return (NULL); } wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP); if (wq == NULL) { RDSV3_DPRINTF2(__FILE__, "kmem_zalloc failed for wq"); ddi_taskq_destroy(rdsv3_taskq); return (NULL); } list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s), offsetof(struct rdsv3_work_s, work_item)); mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL); wq->wq_state = RDSV3_WQ_THREAD_IDLE; wq->wq_pending = 0; rdsv3_one_sec_in_hz = drv_usectohz(1000000); RDSV3_DPRINTF2("create_singlethread_workqueue", "Return"); return (wq); } /* * Implementation for struct sock */ void rdsv3_sock_exit_data(struct rsock *sk) { struct rdsv3_sock *rs = sk->sk_protinfo; RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); ASSERT(rs != NULL); ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD)); rs->rs_sk = NULL; list_destroy(&rs->rs_send_queue); list_destroy(&rs->rs_notify_queue); list_destroy(&rs->rs_recv_queue); rw_destroy(&rs->rs_recv_lock); mutex_destroy(&rs->rs_lock); mutex_destroy(&rs->rs_rdma_lock); avl_destroy(&rs->rs_rdma_keys); mutex_destroy(&rs->rs_conn_lock); mutex_destroy(&rs->rs_congested_lock); cv_destroy(&rs->rs_congested_cv); rdsv3_exit_waitqueue(sk->sk_sleep); kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t)); mutex_destroy(&sk->sk_lock); kmem_cache_free(rdsv3_alloc_cache, sk); RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); } /* XXX - figure out right values */ #define RDSV3_RECV_HIWATER (256 * 1024) #define RDSV3_RECV_LOWATER 128 #define RDSV3_XMIT_HIWATER (256 * 1024) #define RDSV3_XMIT_LOWATER 1024 struct rsock * rdsv3_sk_alloc() { struct rsock *sk; sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP); if (sk == NULL) { RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed"); return (NULL); } bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock)); return (sk); } void rdsv3_sock_init_data(struct rsock *sk) { sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP); rdsv3_init_waitqueue(sk->sk_sleep); mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL); sk->sk_refcount = 1; sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1); sk->sk_sndbuf = RDSV3_XMIT_HIWATER; sk->sk_rcvbuf = RDSV3_RECV_HIWATER; } /* * Connection cache */ /* ARGSUSED */ int rdsv3_conn_constructor(void *buf, void *arg, int kmflags) { struct rdsv3_connection *conn = buf; bzero(conn, sizeof (struct rdsv3_connection)); conn->c_next_tx_seq = 1; mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL); conn->c_send_generation = 1; conn->c_senders = 0; list_create(&conn->c_send_queue, sizeof (struct rdsv3_message), offsetof(struct rdsv3_message, m_conn_item)); list_create(&conn->c_retrans, sizeof (struct rdsv3_message), offsetof(struct rdsv3_message, m_conn_item)); return (0); } /* ARGSUSED */ void rdsv3_conn_destructor(void *buf, void *arg) { struct rdsv3_connection *conn = buf; ASSERT(list_is_empty(&conn->c_send_queue)); ASSERT(list_is_empty(&conn->c_retrans)); list_destroy(&conn->c_send_queue); list_destroy(&conn->c_retrans); mutex_destroy(&conn->c_send_lock); mutex_destroy(&conn->c_lock); } int rdsv3_conn_compare(const void *conn1, const void *conn2) { uint32_be_t laddr1, faddr1, laddr2, faddr2; laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr; laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr; if (laddr1 == laddr2) { faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr; faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr; if (faddr1 == faddr2) return (0); if (faddr1 < faddr2) return (-1); return (1); } if (laddr1 < laddr2) return (-1); return (1); } /* rdsv3_ib_incoming cache */ /* ARGSUSED */ int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags) { list_create(&((struct rdsv3_ib_incoming *)buf)->ii_frags, sizeof (struct rdsv3_page_frag), offsetof(struct rdsv3_page_frag, f_item)); return (0); } /* ARGSUSED */ void rdsv3_ib_inc_destructor(void *buf, void *arg) { list_destroy(&((struct rdsv3_ib_incoming *)buf)->ii_frags); } /* ib_frag_slab cache */ /* ARGSUSED */ int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags) { struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf; struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg; ibt_iov_attr_t iov_attr; ibt_iov_t iov_arr[1]; ibt_all_wr_t wr; bzero(frag, sizeof (struct rdsv3_page_frag)); list_link_init(&frag->f_item); frag->f_page = kmem_alloc(PAGE_SIZE, kmflags); if (frag->f_page == NULL) { RDSV3_DPRINTF2("rdsv3_ib_frag_constructor", "kmem_alloc for %d failed", PAGE_SIZE); return (-1); } frag->f_offset = 0; iov_attr.iov_as = NULL; iov_attr.iov = &iov_arr[0]; iov_attr.iov_buf = NULL; iov_attr.iov_list_len = 1; iov_attr.iov_wr_nds = 1; iov_attr.iov_lso_hdr_sz = 0; iov_attr.iov_flags = IBT_IOV_SLEEP | IBT_IOV_RECV; iov_arr[0].iov_addr = frag->f_page; iov_arr[0].iov_len = PAGE_SIZE; wr.recv.wr_nds = 1; wr.recv.wr_sgl = &frag->f_sge; if (ibt_map_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev), &iov_attr, &wr, &frag->f_mapped) != IBT_SUCCESS) { RDSV3_DPRINTF2("rdsv3_ib_frag_constructor", "ibt_map_mem_iov failed"); kmem_free(frag->f_page, PAGE_SIZE); return (-1); } return (0); } /* ARGSUSED */ void rdsv3_ib_frag_destructor(void *buf, void *arg) { struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf; struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg; /* unmap the page */ if (ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev), frag->f_mapped) != IBT_SUCCESS) RDSV3_DPRINTF2("rdsv3_ib_frag_destructor", "ibt_unmap_mem_iov failed"); /* free the page */ kmem_free(frag->f_page, PAGE_SIZE); } /* loop.c */ extern kmutex_t loop_conns_lock; extern list_t loop_conns; struct rdsv3_loop_connection { struct list_node loop_node; struct rdsv3_connection *conn; }; void rdsv3_loop_init(void) { list_create(&loop_conns, sizeof (struct rdsv3_loop_connection), offsetof(struct rdsv3_loop_connection, loop_node)); mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL); } /* rdma.c */ /* IB Rkey is used here for comparison */ int rdsv3_mr_compare(const void *mr1, const void *mr2) { uint32_t key1 = *(uint32_t *)mr1; uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key; if (key1 < key2) return (-1); if (key1 > key2) return (1); return (0); } /* transport.c */ extern struct rdsv3_transport *transports[]; extern krwlock_t trans_sem; void rdsv3_trans_exit(void) { struct rdsv3_transport *trans; int i; RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter"); /* currently, only IB transport */ rw_enter(&trans_sem, RW_READER); trans = NULL; for (i = 0; i < RDS_TRANS_COUNT; i++) { if (transports[i]) { trans = transports[i]; break; } } rw_exit(&trans_sem); /* trans->exit() will remove the trans from the list */ if (trans) trans->exit(); rw_destroy(&trans_sem); RDSV3_DPRINTF2("rdsv3_trans_exit", "Return"); } void rdsv3_trans_init() { RDSV3_DPRINTF2("rdsv3_trans_init", "Enter"); rw_init(&trans_sem, NULL, RW_DRIVER, NULL); RDSV3_DPRINTF2("rdsv3_trans_init", "Return"); } int rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size, void *payload) { struct cmsghdr *cp; char *bp; size_t cmlen; size_t cmspace; size_t bufsz; RDSV3_DPRINTF4("rdsv3_put_cmsg", "Enter(msg: %p level: %d type: %d sz: %d)", msg, level, type, size); if (msg == NULL || msg->msg_controllen == 0) { return (0); } /* check for first cmsg or this is another cmsg to be appended */ if (msg->msg_control == NULL) msg->msg_controllen = 0; cmlen = CMSG_LEN(size); cmspace = CMSG_SPACE(size); bufsz = msg->msg_controllen + cmspace; /* extend the existing cmsg to append the next cmsg */ bp = kmem_alloc(bufsz, KM_SLEEP); if (msg->msg_control) { bcopy(msg->msg_control, bp, msg->msg_controllen); kmem_free(msg->msg_control, (size_t)msg->msg_controllen); } /* assign payload the proper cmsg location */ cp = (struct cmsghdr *)(bp + msg->msg_controllen); cp->cmsg_len = cmlen; cp->cmsg_level = level; cp->cmsg_type = type; bcopy(payload, CMSG_DATA(cp), cmlen - (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr))); msg->msg_control = bp; msg->msg_controllen = bufsz; RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len); return (0); } /* ARGSUSED */ int rdsv3_verify_bind_address(ipaddr_t addr) { return (1); } /* checksum */ uint16_t rdsv3_ip_fast_csum(void *hdr, size_t length) { return (0xffff & (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0))); } /* scatterlist implementation */ /* ARGSUSED */ caddr_t rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat, uint_t offset) { return (0); } uint_t rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat, uint_t num) { struct rdsv3_scatterlist *s, *first; ibt_iov_t *iov; ibt_wr_ds_t *sgl; ibt_iov_attr_t iov_attr; ibt_send_wr_t swr; uint_t i; RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num); s = first = &scat[0]; ASSERT(first->mihdl == NULL); iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP); sgl = kmem_zalloc((num * 2) * sizeof (ibt_wr_ds_t), KM_SLEEP); for (i = 0; i < num; i++, s++) { iov[i].iov_addr = s->vaddr; iov[i].iov_len = s->length; } iov_attr.iov_as = NULL; iov_attr.iov = iov; iov_attr.iov_buf = NULL; iov_attr.iov_list_len = num; iov_attr.iov_wr_nds = num * 2; iov_attr.iov_lso_hdr_sz = 0; iov_attr.iov_flags = IBT_IOV_SLEEP; swr.wr_sgl = sgl; i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev), &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl); kmem_free(iov, num * sizeof (ibt_iov_t)); if (i != IBT_SUCCESS) { RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg", "ibt_map_mem_iov returned: %d", i); return (0); } s = first; for (i = 0; i < num; i++, s++, sgl++) { s->sgl = sgl; } return (num); } void rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat, uint_t num) { /* Zero length messages have no scatter gather entries */ if (num != 0) { ASSERT(scat->mihdl != NULL); ASSERT(scat->sgl != NULL); (void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl); kmem_free(scat->sgl, (num * 2) * sizeof (ibt_wr_ds_t)); scat->sgl = NULL; scat->mihdl = NULL; } } int rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) { caddr_t addr; size_t size; ibt_mr_attr_t mr_attr; ibt_mr_desc_t mr_desc; ibt_mr_hdl_t mr_hdl; int ret; RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev); ASSERT(ic->i_mr == NULL); size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) * sizeof (struct rdsv3_header); addr = kmem_zalloc(size, KM_NOSLEEP); if (addr == NULL) return (-1); mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr; mr_attr.mr_len = size; mr_attr.mr_as = NULL; mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE; ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd), &mr_attr, &mr_hdl, &mr_desc); if (ret != IBT_SUCCESS) { RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs", "ibt_register_mr returned: " "%d", ret); return (-1); } ic->i_mr = (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr), KM_SLEEP); ic->i_mr->addr = addr; ic->i_mr->size = size; ic->i_mr->hdl = mr_hdl; ic->i_mr->lkey = mr_desc.md_lkey; ic->i_send_hdrs = (struct rdsv3_header *)addr; ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr; ic->i_recv_hdrs = (struct rdsv3_header *)(addr + (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr + (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); ic->i_ack = (struct rdsv3_header *)(addr + ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * sizeof (struct rdsv3_header))); ic->i_ack_dma = (uint64_t)(uintptr_t)(addr + ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * sizeof (struct rdsv3_header))); RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev); return (0); } void rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) { RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev); ASSERT(ic->i_mr != NULL); ic->i_send_hdrs = NULL; ic->i_send_hdrs_dma = 0; ic->i_recv_hdrs = NULL; ic->i_recv_hdrs_dma = 0; ic->i_ack = NULL; ic->i_ack_dma = 0; (void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl); kmem_free(ic->i_mr->addr, ic->i_mr->size); kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr)); ic->i_mr = NULL; RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev); } /* * atomic_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if @v was not @u, and zero otherwise. */ int atomic_add_unless(atomic_t *v, uint_t a, ulong_t u) { uint_t c, old; c = *v; while (c != u && (old = atomic_cas_uint(v, c, c + a)) != c) { c = old; } return ((ulong_t)c != u); }