/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file recv.c
 * Oracle elects to have and use the contents of rds_recv.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
34c0dd49bdSEiji Ota * 35c0dd49bdSEiji Ota * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 36c0dd49bdSEiji Ota * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 37c0dd49bdSEiji Ota * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 38c0dd49bdSEiji Ota * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 39c0dd49bdSEiji Ota * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 40c0dd49bdSEiji Ota * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 41c0dd49bdSEiji Ota * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 42c0dd49bdSEiji Ota * SOFTWARE. 43c0dd49bdSEiji Ota * 44c0dd49bdSEiji Ota */ 45c0dd49bdSEiji Ota #include <sys/rds.h> 46c0dd49bdSEiji Ota 47c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h> 48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdma.h> 49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 50c0dd49bdSEiji Ota 51c0dd49bdSEiji Ota void 52c0dd49bdSEiji Ota rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn, 53c0dd49bdSEiji Ota uint32_be_t saddr) 54c0dd49bdSEiji Ota { 55c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_inc_init", "Enter(inc: %p, conn: %p)", inc, conn); 56c0dd49bdSEiji Ota inc->i_refcount = 1; 57c0dd49bdSEiji Ota list_link_init(&inc->i_item); 58c0dd49bdSEiji Ota inc->i_conn = conn; 59c0dd49bdSEiji Ota inc->i_saddr = saddr; 60c0dd49bdSEiji Ota inc->i_rdma_cookie = 0; 61c0dd49bdSEiji Ota } 62c0dd49bdSEiji Ota 63c0dd49bdSEiji Ota void 64c0dd49bdSEiji Ota rdsv3_inc_addref(struct rdsv3_incoming *inc) 65c0dd49bdSEiji Ota { 66c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_inc_addref", 67c0dd49bdSEiji Ota "addref inc %p ref %d", inc, atomic_get(&inc->i_refcount)); 681a5e258fSJosef 'Jeff' Sipek atomic_inc_32(&inc->i_refcount); 69c0dd49bdSEiji Ota } 70c0dd49bdSEiji Ota 71c0dd49bdSEiji Ota void 72c0dd49bdSEiji Ota rdsv3_inc_put(struct rdsv3_incoming *inc) 73c0dd49bdSEiji Ota { 74c0dd49bdSEiji Ota 
RDSV3_DPRINTF4("rdsv3_inc_put", "put inc %p ref %d", 75c0dd49bdSEiji Ota inc, atomic_get(&inc->i_refcount)); 76c0dd49bdSEiji Ota if (atomic_dec_and_test(&inc->i_refcount)) { 77c0dd49bdSEiji Ota ASSERT(!list_link_active(&inc->i_item)); 78c0dd49bdSEiji Ota 79c0dd49bdSEiji Ota inc->i_conn->c_trans->inc_free(inc); 80c0dd49bdSEiji Ota } 81c0dd49bdSEiji Ota } 82c0dd49bdSEiji Ota 83c0dd49bdSEiji Ota /*ARGSUSED*/ 84c0dd49bdSEiji Ota static void 85c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(struct rdsv3_sock *rs, struct rsock *sk, 86c0dd49bdSEiji Ota struct rdsv3_cong_map *map, 87c0dd49bdSEiji Ota int delta, uint16_be_t port) 88c0dd49bdSEiji Ota { 89c0dd49bdSEiji Ota int now_congested; 90c0dd49bdSEiji Ota 91c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", 92c0dd49bdSEiji Ota "Enter(rs: %p, map: %p, delta: %d, port: %d)", 93c0dd49bdSEiji Ota rs, map, delta, port); 94c0dd49bdSEiji Ota 95c0dd49bdSEiji Ota if (delta == 0) 96c0dd49bdSEiji Ota return; 97c0dd49bdSEiji Ota 98c0dd49bdSEiji Ota rs->rs_rcv_bytes += delta; 99c0dd49bdSEiji Ota now_congested = rs->rs_rcv_bytes > rdsv3_sk_rcvbuf(rs); 100c0dd49bdSEiji Ota 101c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_rcvbuf_delta", 102c0dd49bdSEiji Ota "rs %p (%u.%u.%u.%u:%u) recv bytes %d buf %d " 103c0dd49bdSEiji Ota "now_cong %d delta %d", 104c0dd49bdSEiji Ota rs, NIPQUAD(rs->rs_bound_addr), 105c0dd49bdSEiji Ota (int)ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, 106c0dd49bdSEiji Ota rdsv3_sk_rcvbuf(rs), now_congested, delta); 107c0dd49bdSEiji Ota 108c0dd49bdSEiji Ota /* wasn't -> am congested */ 109c0dd49bdSEiji Ota if (!rs->rs_congested && now_congested) { 110c0dd49bdSEiji Ota rs->rs_congested = 1; 111c0dd49bdSEiji Ota rdsv3_cong_set_bit(map, port); 112c0dd49bdSEiji Ota rdsv3_cong_queue_updates(map); 113c0dd49bdSEiji Ota } 114c0dd49bdSEiji Ota /* was -> aren't congested */ 115c0dd49bdSEiji Ota /* 116c0dd49bdSEiji Ota * Require more free space before reporting uncongested to prevent 117c0dd49bdSEiji Ota * bouncing cong/uncong 
state too often 118c0dd49bdSEiji Ota */ 119c0dd49bdSEiji Ota else if (rs->rs_congested && 120c0dd49bdSEiji Ota (rs->rs_rcv_bytes < (rdsv3_sk_rcvbuf(rs)/2))) { 121c0dd49bdSEiji Ota rs->rs_congested = 0; 122c0dd49bdSEiji Ota rdsv3_cong_clear_bit(map, port); 123c0dd49bdSEiji Ota rdsv3_cong_queue_updates(map); 124c0dd49bdSEiji Ota } 125c0dd49bdSEiji Ota 126c0dd49bdSEiji Ota /* do nothing if no change in cong state */ 127c0dd49bdSEiji Ota 128c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", "Return(rs: %p)", rs); 129c0dd49bdSEiji Ota } 130c0dd49bdSEiji Ota 131c0dd49bdSEiji Ota /* 132c0dd49bdSEiji Ota * Process all extension headers that come with this message. 133c0dd49bdSEiji Ota */ 134c0dd49bdSEiji Ota static void 135c0dd49bdSEiji Ota rdsv3_recv_incoming_exthdrs(struct rdsv3_incoming *inc, struct rdsv3_sock *rs) 136c0dd49bdSEiji Ota { 137c0dd49bdSEiji Ota struct rdsv3_header *hdr = &inc->i_hdr; 138c0dd49bdSEiji Ota unsigned int pos = 0, type, len; 139c0dd49bdSEiji Ota union { 140c0dd49bdSEiji Ota struct rdsv3_ext_header_version version; 141c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma rdma; 142c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma_dest rdma_dest; 143c0dd49bdSEiji Ota } buffer; 144c0dd49bdSEiji Ota 145c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Enter"); 146c0dd49bdSEiji Ota while (1) { 147c0dd49bdSEiji Ota len = sizeof (buffer); 148c0dd49bdSEiji Ota type = rdsv3_message_next_extension(hdr, &pos, &buffer, &len); 149c0dd49bdSEiji Ota if (type == RDSV3_EXTHDR_NONE) 150c0dd49bdSEiji Ota break; 151c0dd49bdSEiji Ota RDSV3_DPRINTF4("recv_incoming_exthdrs", "type %d", type); 152c0dd49bdSEiji Ota /* Process extension header here */ 153c0dd49bdSEiji Ota switch (type) { 154c0dd49bdSEiji Ota case RDSV3_EXTHDR_RDMA: 155c0dd49bdSEiji Ota rdsv3_rdma_unuse(rs, ntohl(buffer.rdma.h_rdma_rkey), 156c0dd49bdSEiji Ota 0); 157c0dd49bdSEiji Ota break; 158c0dd49bdSEiji Ota 159c0dd49bdSEiji Ota case RDSV3_EXTHDR_RDMA_DEST: 160c0dd49bdSEiji Ota /* 
161c0dd49bdSEiji Ota * We ignore the size for now. We could stash it 162c0dd49bdSEiji Ota * somewhere and use it for error checking. 163c0dd49bdSEiji Ota */ 164c0dd49bdSEiji Ota inc->i_rdma_cookie = rdsv3_rdma_make_cookie( 165c0dd49bdSEiji Ota ntohl(buffer.rdma_dest.h_rdma_rkey), 166c0dd49bdSEiji Ota ntohl(buffer.rdma_dest.h_rdma_offset)); 167c0dd49bdSEiji Ota 168c0dd49bdSEiji Ota break; 169c0dd49bdSEiji Ota } 170c0dd49bdSEiji Ota } 171c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Return"); 172c0dd49bdSEiji Ota } 173c0dd49bdSEiji Ota 174c0dd49bdSEiji Ota /* 175c0dd49bdSEiji Ota * The transport must make sure that this is serialized against other 176c0dd49bdSEiji Ota * rx and conn reset on this specific conn. 177c0dd49bdSEiji Ota * 178c0dd49bdSEiji Ota * We currently assert that only one fragmented message will be sent 179c0dd49bdSEiji Ota * down a connection at a time. This lets us reassemble in the conn 180c0dd49bdSEiji Ota * instead of per-flow which means that we don't have to go digging through 181c0dd49bdSEiji Ota * flows to tear down partial reassembly progress on conn failure and 182c0dd49bdSEiji Ota * we save flow lookup and locking for each frag arrival. It does mean 183c0dd49bdSEiji Ota * that small messages will wait behind large ones. Fragmenting at all 184c0dd49bdSEiji Ota * is only to reduce the memory consumption of pre-posted buffers. 185c0dd49bdSEiji Ota * 186c0dd49bdSEiji Ota * The caller passes in saddr and daddr instead of us getting it from the 187c0dd49bdSEiji Ota * conn. This lets loopback, who only has one conn for both directions, 188c0dd49bdSEiji Ota * tell us which roles the addrs in the conn are playing for this message. 
189c0dd49bdSEiji Ota */ 190c0dd49bdSEiji Ota /* ARGSUSED */ 191c0dd49bdSEiji Ota void 192c0dd49bdSEiji Ota rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr, 193c0dd49bdSEiji Ota uint32_be_t daddr, struct rdsv3_incoming *inc, int gfp) 194c0dd49bdSEiji Ota { 195c0dd49bdSEiji Ota struct rdsv3_sock *rs = NULL; 196c0dd49bdSEiji Ota struct rsock *sk; 197c0dd49bdSEiji Ota 198c0dd49bdSEiji Ota inc->i_conn = conn; 199c0dd49bdSEiji Ota inc->i_rx_jiffies = jiffies; 200c0dd49bdSEiji Ota 201c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_incoming", 202c0dd49bdSEiji Ota "conn %p next %llu inc %p seq %llu len %u sport %u dport %u " 203c0dd49bdSEiji Ota "flags 0x%x rx_jiffies %lu", conn, 204c0dd49bdSEiji Ota (unsigned long long)conn->c_next_rx_seq, 205c0dd49bdSEiji Ota inc, 206c0dd49bdSEiji Ota (unsigned long long)ntohll(inc->i_hdr.h_sequence), 207c0dd49bdSEiji Ota ntohl(inc->i_hdr.h_len), 208c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_sport), 209c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_dport), 210c0dd49bdSEiji Ota inc->i_hdr.h_flags, 211c0dd49bdSEiji Ota inc->i_rx_jiffies); 212c0dd49bdSEiji Ota 213c0dd49bdSEiji Ota /* 214c0dd49bdSEiji Ota * Sequence numbers should only increase. Messages get their 215c0dd49bdSEiji Ota * sequence number as they're queued in a sending conn. They 216c0dd49bdSEiji Ota * can be dropped, though, if the sending socket is closed before 217c0dd49bdSEiji Ota * they hit the wire. So sequence numbers can skip forward 218c0dd49bdSEiji Ota * under normal operation. They can also drop back in the conn 219c0dd49bdSEiji Ota * failover case as previously sent messages are resent down the 220c0dd49bdSEiji Ota * new instance of a conn. We drop those, otherwise we have 221c0dd49bdSEiji Ota * to assume that the next valid seq does not come after a 222c0dd49bdSEiji Ota * hole in the fragment stream. 223c0dd49bdSEiji Ota * 224c0dd49bdSEiji Ota * The headers don't give us a way to realize if fragments of 225c0dd49bdSEiji Ota * a message have been dropped. 
We assume that frags that arrive 226c0dd49bdSEiji Ota * to a flow are part of the current message on the flow that is 227c0dd49bdSEiji Ota * being reassembled. This means that senders can't drop messages 228c0dd49bdSEiji Ota * from the sending conn until all their frags are sent. 229c0dd49bdSEiji Ota * 230c0dd49bdSEiji Ota * XXX we could spend more on the wire to get more robust failure 231c0dd49bdSEiji Ota * detection, arguably worth it to avoid data corruption. 232c0dd49bdSEiji Ota */ 233c0dd49bdSEiji Ota if (ntohll(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && 234c0dd49bdSEiji Ota (inc->i_hdr.h_flags & RDSV3_FLAG_RETRANSMITTED)) { 235c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_old_seq); 236c0dd49bdSEiji Ota goto out; 237c0dd49bdSEiji Ota } 238c0dd49bdSEiji Ota conn->c_next_rx_seq = ntohll(inc->i_hdr.h_sequence) + 1; 239c0dd49bdSEiji Ota 240c0dd49bdSEiji Ota if (rdsv3_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { 241c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_ping); 242c0dd49bdSEiji Ota (void) rdsv3_send_pong(conn, inc->i_hdr.h_sport); 243c0dd49bdSEiji Ota goto out; 244c0dd49bdSEiji Ota } 245c0dd49bdSEiji Ota 24680166370Sagiri rs = rdsv3_find_bound(conn, inc->i_hdr.h_dport); 2475d5562f5SEiji Ota if (!rs) { 248c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_no_sock); 249c0dd49bdSEiji Ota goto out; 250c0dd49bdSEiji Ota } 251c0dd49bdSEiji Ota 252c0dd49bdSEiji Ota /* Process extension headers */ 253c0dd49bdSEiji Ota rdsv3_recv_incoming_exthdrs(inc, rs); 254c0dd49bdSEiji Ota 255c0dd49bdSEiji Ota /* We can be racing with rdsv3_release() which marks the socket dead. 
*/ 256c0dd49bdSEiji Ota sk = rdsv3_rs_to_sk(rs); 257c0dd49bdSEiji Ota 258c0dd49bdSEiji Ota /* serialize with rdsv3_release -> sock_orphan */ 259c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER); 260c0dd49bdSEiji Ota if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD)) { 261c0dd49bdSEiji Ota int error, bytes; 262c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_incoming", 263c0dd49bdSEiji Ota "adding inc %p to rs %p's recv queue", inc, rs); 264c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_queued); 265c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 266c0dd49bdSEiji Ota ntohl(inc->i_hdr.h_len), 267c0dd49bdSEiji Ota inc->i_hdr.h_dport); 268c0dd49bdSEiji Ota rdsv3_inc_addref(inc); 269c0dd49bdSEiji Ota list_insert_tail(&rs->rs_recv_queue, inc); 270c0dd49bdSEiji Ota bytes = rs->rs_rcv_bytes; 271c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 272c0dd49bdSEiji Ota 273c0dd49bdSEiji Ota __rdsv3_wake_sk_sleep(sk); 274c0dd49bdSEiji Ota 275c0dd49bdSEiji Ota /* wake up anyone waiting in poll */ 276c0dd49bdSEiji Ota sk->sk_upcalls->su_recv(sk->sk_upper_handle, NULL, 277c0dd49bdSEiji Ota bytes, 0, &error, NULL); 278c0dd49bdSEiji Ota if (error != 0) { 279c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_recv_incoming", 280c0dd49bdSEiji Ota "su_recv returned: %d", error); 281c0dd49bdSEiji Ota } 282c0dd49bdSEiji Ota } else { 283c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_dead_sock); 284c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 285c0dd49bdSEiji Ota } 286c0dd49bdSEiji Ota 287c0dd49bdSEiji Ota out: 288c0dd49bdSEiji Ota if (rs) 289c0dd49bdSEiji Ota rdsv3_sock_put(rs); 290c0dd49bdSEiji Ota } 291c0dd49bdSEiji Ota 292c0dd49bdSEiji Ota /* 293c0dd49bdSEiji Ota * be very careful here. This is being called as the condition in 294c0dd49bdSEiji Ota * wait_event_*() needs to cope with being called many times. 
295c0dd49bdSEiji Ota */ 296c0dd49bdSEiji Ota static int 297c0dd49bdSEiji Ota rdsv3_next_incoming(struct rdsv3_sock *rs, struct rdsv3_incoming **inc) 298c0dd49bdSEiji Ota { 2995d5562f5SEiji Ota if (!*inc) { 300c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_READER); 301c0dd49bdSEiji Ota if (!list_is_empty(&rs->rs_recv_queue)) { 302c0dd49bdSEiji Ota *inc = list_head(&rs->rs_recv_queue); 303c0dd49bdSEiji Ota rdsv3_inc_addref(*inc); 304c0dd49bdSEiji Ota } 305c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 306c0dd49bdSEiji Ota } 307c0dd49bdSEiji Ota 308c0dd49bdSEiji Ota return (*inc != NULL); 309c0dd49bdSEiji Ota } 310c0dd49bdSEiji Ota 311c0dd49bdSEiji Ota static int 312c0dd49bdSEiji Ota rdsv3_still_queued(struct rdsv3_sock *rs, struct rdsv3_incoming *inc, 313c0dd49bdSEiji Ota int drop) 314c0dd49bdSEiji Ota { 315c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 316c0dd49bdSEiji Ota int ret = 0; 317c0dd49bdSEiji Ota 318c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_still_queued", "Enter rs: %p inc: %p drop: %d", 319c0dd49bdSEiji Ota rs, inc, drop); 320c0dd49bdSEiji Ota 321c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER); 322c0dd49bdSEiji Ota if (list_link_active(&inc->i_item)) { 323c0dd49bdSEiji Ota ret = 1; 324c0dd49bdSEiji Ota if (drop) { 325c0dd49bdSEiji Ota /* XXX make sure this i_conn is reliable */ 326c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 327c0dd49bdSEiji Ota -ntohl(inc->i_hdr.h_len), 328c0dd49bdSEiji Ota inc->i_hdr.h_dport); 329c0dd49bdSEiji Ota list_remove_node(&inc->i_item); 330c0dd49bdSEiji Ota rdsv3_inc_put(inc); 331c0dd49bdSEiji Ota } 332c0dd49bdSEiji Ota } 333c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 334c0dd49bdSEiji Ota 335c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_still_queued", 336c0dd49bdSEiji Ota "inc %p rs %p still %d dropped %d", inc, rs, ret, drop); 337c0dd49bdSEiji Ota return (ret); 338c0dd49bdSEiji Ota } 339c0dd49bdSEiji Ota 340c0dd49bdSEiji Ota /* 341c0dd49bdSEiji Ota * Pull errors off the error queue. 
342c0dd49bdSEiji Ota * If msghdr is NULL, we will just purge the error queue. 343c0dd49bdSEiji Ota */ 344c0dd49bdSEiji Ota int 345c0dd49bdSEiji Ota rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msghdr) 346c0dd49bdSEiji Ota { 347c0dd49bdSEiji Ota struct rdsv3_notifier *notifier; 348fe817b60SEiji Ota struct rds_rdma_notify cmsg; 349c0dd49bdSEiji Ota unsigned int count = 0, max_messages = ~0U; 350c0dd49bdSEiji Ota list_t copy; 351c0dd49bdSEiji Ota int err = 0; 352c0dd49bdSEiji Ota 353c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Enter(rs: %p)", rs); 354c0dd49bdSEiji Ota 355c0dd49bdSEiji Ota list_create(©, sizeof (struct rdsv3_notifier), 356c0dd49bdSEiji Ota offsetof(struct rdsv3_notifier, n_list)); 357c0dd49bdSEiji Ota 358c0dd49bdSEiji Ota 359c0dd49bdSEiji Ota /* 360c0dd49bdSEiji Ota * put_cmsg copies to user space and thus may sleep. We can't do this 361c0dd49bdSEiji Ota * with rs_lock held, so first grab as many notifications as we can 362c0dd49bdSEiji Ota * stuff 363c0dd49bdSEiji Ota * in the user provided cmsg buffer. We don't try to copy more, to avoid 364c0dd49bdSEiji Ota * losing notifications - except when the buffer is so small that 365c0dd49bdSEiji Ota * it wouldn't 366*48bbca81SDaniel Hoffman * even hold a single notification. Then we give as much of this 367c0dd49bdSEiji Ota * single 368c0dd49bdSEiji Ota * msg as we can squeeze in, and set MSG_CTRUNC. 
369c0dd49bdSEiji Ota */ 370c0dd49bdSEiji Ota if (msghdr) { 371c0dd49bdSEiji Ota max_messages = 372c0dd49bdSEiji Ota msghdr->msg_controllen / CMSG_SPACE(sizeof (cmsg)); 373c0dd49bdSEiji Ota if (!max_messages) 374c0dd49bdSEiji Ota max_messages = 1; 375c0dd49bdSEiji Ota } 376c0dd49bdSEiji Ota 377c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 378c0dd49bdSEiji Ota while (!list_is_empty(&rs->rs_notify_queue) && count < max_messages) { 379c0dd49bdSEiji Ota notifier = list_remove_head(&rs->rs_notify_queue); 380c0dd49bdSEiji Ota list_insert_tail(©, notifier); 381c0dd49bdSEiji Ota count++; 382c0dd49bdSEiji Ota } 383c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 384c0dd49bdSEiji Ota 385c0dd49bdSEiji Ota if (!count) 386c0dd49bdSEiji Ota return (0); 387c0dd49bdSEiji Ota 388c0dd49bdSEiji Ota while (!list_is_empty(©)) { 389c0dd49bdSEiji Ota notifier = list_remove_head(©); 390c0dd49bdSEiji Ota 391c0dd49bdSEiji Ota if (msghdr) { 392c0dd49bdSEiji Ota cmsg.user_token = notifier->n_user_token; 393c0dd49bdSEiji Ota cmsg.status = notifier->n_status; 394c0dd49bdSEiji Ota 395c0dd49bdSEiji Ota err = rdsv3_put_cmsg(msghdr, SOL_RDS, 396fe817b60SEiji Ota RDS_CMSG_RDMA_STATUS, sizeof (cmsg), &cmsg); 397c0dd49bdSEiji Ota if (err) 398c0dd49bdSEiji Ota break; 399c0dd49bdSEiji Ota } 400c0dd49bdSEiji Ota 401c0dd49bdSEiji Ota kmem_free(notifier, sizeof (struct rdsv3_notifier)); 402c0dd49bdSEiji Ota } 403c0dd49bdSEiji Ota 404c0dd49bdSEiji Ota /* 405c0dd49bdSEiji Ota * If we bailed out because of an error in put_cmsg, 406c0dd49bdSEiji Ota * we may be left with one or more notifications that we 407c0dd49bdSEiji Ota * didn't process. Return them to the head of the list. 
408c0dd49bdSEiji Ota */ 409c0dd49bdSEiji Ota if (!list_is_empty(©)) { 410c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 411c0dd49bdSEiji Ota list_splice(©, &rs->rs_notify_queue); 412c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 413c0dd49bdSEiji Ota } 414c0dd49bdSEiji Ota 415c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Return(rs: %p)", rs); 416c0dd49bdSEiji Ota 417c0dd49bdSEiji Ota return (err); 418c0dd49bdSEiji Ota } 419c0dd49bdSEiji Ota 420c0dd49bdSEiji Ota /* 421c0dd49bdSEiji Ota * Queue a congestion notification 422c0dd49bdSEiji Ota */ 423c0dd49bdSEiji Ota static int 424c0dd49bdSEiji Ota rdsv3_notify_cong(struct rdsv3_sock *rs, struct msghdr *msghdr) 425c0dd49bdSEiji Ota { 426c0dd49bdSEiji Ota uint64_t notify = rs->rs_cong_notify; 427c0dd49bdSEiji Ota int err; 428c0dd49bdSEiji Ota 429fe817b60SEiji Ota err = rdsv3_put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, 430c0dd49bdSEiji Ota sizeof (notify), ¬ify); 431c0dd49bdSEiji Ota if (err) 432c0dd49bdSEiji Ota return (err); 433c0dd49bdSEiji Ota 434c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 435c0dd49bdSEiji Ota rs->rs_cong_notify &= ~notify; 436c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 437c0dd49bdSEiji Ota 438c0dd49bdSEiji Ota return (0); 439c0dd49bdSEiji Ota } 440c0dd49bdSEiji Ota 441c0dd49bdSEiji Ota /* 442c0dd49bdSEiji Ota * Receive any control messages. 
443c0dd49bdSEiji Ota */ 444c0dd49bdSEiji Ota static int 445c0dd49bdSEiji Ota rdsv3_cmsg_recv(struct rdsv3_incoming *inc, struct msghdr *msg) 446c0dd49bdSEiji Ota { 4479b3d509cSEiji Ota int ret = 0; 4489b3d509cSEiji Ota if (inc->i_rdma_cookie) { 4499b3d509cSEiji Ota ret = rdsv3_put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, 4509b3d509cSEiji Ota sizeof (inc->i_rdma_cookie), &inc->i_rdma_cookie); 4519b3d509cSEiji Ota } 4529b3d509cSEiji Ota return (ret); 453c0dd49bdSEiji Ota } 454c0dd49bdSEiji Ota 455c0dd49bdSEiji Ota int 456c0dd49bdSEiji Ota rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio, 457c0dd49bdSEiji Ota struct nmsghdr *msg, size_t size, int msg_flags) 458c0dd49bdSEiji Ota { 459c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 460c0dd49bdSEiji Ota long timeo; 461b27516f5Sagiri int ret = 0; 462c0dd49bdSEiji Ota struct sockaddr_in *sin = NULL; 463c0dd49bdSEiji Ota struct rdsv3_incoming *inc = NULL; 464b27516f5Sagiri boolean_t nonblock = B_FALSE; 465c0dd49bdSEiji Ota 466c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recvmsg", 467c0dd49bdSEiji Ota "Enter(rs: %p size: %d msg_flags: 0x%x)", rs, size, msg_flags); 468c0dd49bdSEiji Ota 469b27516f5Sagiri if ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) || 470b27516f5Sagiri (msg_flags & MSG_DONTWAIT)) 471b27516f5Sagiri nonblock = B_TRUE; 472b27516f5Sagiri 473c0dd49bdSEiji Ota /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. 
*/ 474c0dd49bdSEiji Ota timeo = rdsv3_rcvtimeo(sk, nonblock); 475c0dd49bdSEiji Ota 476c0dd49bdSEiji Ota if (msg_flags & MSG_OOB) 477c0dd49bdSEiji Ota goto out; 478c0dd49bdSEiji Ota 479c0dd49bdSEiji Ota /* mark the first cmsg position */ 480c0dd49bdSEiji Ota if (msg) { 481c0dd49bdSEiji Ota msg->msg_control = NULL; 482c0dd49bdSEiji Ota } 483c0dd49bdSEiji Ota 484c0dd49bdSEiji Ota while (1) { 485c0dd49bdSEiji Ota /* 486c0dd49bdSEiji Ota * If there are pending notifications, do those - 487c0dd49bdSEiji Ota * and nothing else 488c0dd49bdSEiji Ota */ 489c0dd49bdSEiji Ota if (!list_is_empty(&rs->rs_notify_queue)) { 490c0dd49bdSEiji Ota ret = rdsv3_notify_queue_get(rs, msg); 491c0dd49bdSEiji Ota 492c0dd49bdSEiji Ota if (msg && msg->msg_namelen) { 493c0dd49bdSEiji Ota sin = kmem_zalloc(sizeof (struct sockaddr_in), 494c0dd49bdSEiji Ota KM_SLEEP); 495c0dd49bdSEiji Ota sin->sin_family = AF_INET_OFFLOAD; 496c0dd49bdSEiji Ota if (inc) { 497c0dd49bdSEiji Ota sin->sin_port = inc->i_hdr.h_sport; 498c0dd49bdSEiji Ota sin->sin_addr.s_addr = inc->i_saddr; 499c0dd49bdSEiji Ota } 500c0dd49bdSEiji Ota msg->msg_namelen = sizeof (struct sockaddr_in); 501c0dd49bdSEiji Ota msg->msg_name = sin; 502c0dd49bdSEiji Ota } 503c0dd49bdSEiji Ota break; 504c0dd49bdSEiji Ota } 505c0dd49bdSEiji Ota 506c0dd49bdSEiji Ota if (rs->rs_cong_notify) { 507c0dd49bdSEiji Ota ret = rdsv3_notify_cong(rs, msg); 508c0dd49bdSEiji Ota goto out; 509c0dd49bdSEiji Ota } 510c0dd49bdSEiji Ota 511c0dd49bdSEiji Ota if (!rdsv3_next_incoming(rs, &inc)) { 512c0dd49bdSEiji Ota if (nonblock) { 513c0dd49bdSEiji Ota ret = -EAGAIN; 514c0dd49bdSEiji Ota break; 515c0dd49bdSEiji Ota } 516c0dd49bdSEiji Ota 517c0dd49bdSEiji Ota RDSV3_DPRINTF3("rdsv3_recvmsg", 518c0dd49bdSEiji Ota "Before wait (rs: %p)", rs); 519c0dd49bdSEiji Ota 5206e18d381Sagiri #if 0 5216e18d381Sagiri ret = rdsv3_wait_sig(sk->sk_sleep, 5226e18d381Sagiri !(list_is_empty(&rs->rs_notify_queue) && 5236e18d381Sagiri !rs->rs_cong_notify && 5246e18d381Sagiri 
!rdsv3_next_incoming(rs, &inc))); 5256e18d381Sagiri if (ret == 0) { 5266e18d381Sagiri /* signal/timeout pending */ 5276e18d381Sagiri RDSV3_DPRINTF2("rdsv3_recvmsg", 5286e18d381Sagiri "woke due to signal"); 5296e18d381Sagiri ret = -ERESTART; 5306e18d381Sagiri } 5316e18d381Sagiri #else 532c0dd49bdSEiji Ota mutex_enter(&sk->sk_sleep->waitq_mutex); 5336e18d381Sagiri sk->sk_sleep->waitq_waiters++; 534c0dd49bdSEiji Ota while ((list_is_empty(&rs->rs_notify_queue) && 535c0dd49bdSEiji Ota !rs->rs_cong_notify && 536c0dd49bdSEiji Ota !rdsv3_next_incoming(rs, &inc))) { 537c0dd49bdSEiji Ota ret = cv_wait_sig(&sk->sk_sleep->waitq_cv, 538c0dd49bdSEiji Ota &sk->sk_sleep->waitq_mutex); 539c0dd49bdSEiji Ota if (ret == 0) { 540c0dd49bdSEiji Ota /* signal/timeout pending */ 541c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_recvmsg", 542c0dd49bdSEiji Ota "woke due to signal"); 5435e12ddadSEiji Ota ret = -EINTR; 544c0dd49bdSEiji Ota break; 545c0dd49bdSEiji Ota } 546c0dd49bdSEiji Ota } 5476e18d381Sagiri sk->sk_sleep->waitq_waiters--; 548c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 5496e18d381Sagiri #endif 550c0dd49bdSEiji Ota 551c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recvmsg", 552c0dd49bdSEiji Ota "recvmsg woke rs: %p inc %p ret %d", 553c0dd49bdSEiji Ota rs, inc, -ret); 554c0dd49bdSEiji Ota 555c0dd49bdSEiji Ota if (ret < 0) 556c0dd49bdSEiji Ota break; 557c0dd49bdSEiji Ota 558c0dd49bdSEiji Ota /* 559c0dd49bdSEiji Ota * if the wakeup was due to rs_notify_queue or 560c0dd49bdSEiji Ota * rs_cong_notify then we need to handle those first. 
561c0dd49bdSEiji Ota */ 562c0dd49bdSEiji Ota continue; 563c0dd49bdSEiji Ota } 564c0dd49bdSEiji Ota 565c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recvmsg", 566c0dd49bdSEiji Ota "copying inc %p from %u.%u.%u.%u:%u to user", inc, 567c0dd49bdSEiji Ota NIPQUAD(inc->i_conn->c_faddr), 568c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_sport)); 569cadbfdc3SEiji Ota 570c0dd49bdSEiji Ota ret = inc->i_conn->c_trans->inc_copy_to_user(inc, uio, size); 571c0dd49bdSEiji Ota if (ret < 0) 572c0dd49bdSEiji Ota break; 573c0dd49bdSEiji Ota 574c0dd49bdSEiji Ota /* 575c0dd49bdSEiji Ota * if the message we just copied isn't at the head of the 576c0dd49bdSEiji Ota * recv queue then someone else raced us to return it, try 577c0dd49bdSEiji Ota * to get the next message. 578c0dd49bdSEiji Ota */ 579c0dd49bdSEiji Ota if (!rdsv3_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { 580c0dd49bdSEiji Ota rdsv3_inc_put(inc); 581c0dd49bdSEiji Ota inc = NULL; 582c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_deliver_raced); 583c0dd49bdSEiji Ota continue; 584c0dd49bdSEiji Ota } 585c0dd49bdSEiji Ota 586c0dd49bdSEiji Ota if (ret < ntohl(inc->i_hdr.h_len)) { 587c0dd49bdSEiji Ota if (msg_flags & MSG_TRUNC) 588c0dd49bdSEiji Ota ret = ntohl(inc->i_hdr.h_len); 589c0dd49bdSEiji Ota msg->msg_flags |= MSG_TRUNC; 590c0dd49bdSEiji Ota } 591c0dd49bdSEiji Ota 592c0dd49bdSEiji Ota if (rdsv3_cmsg_recv(inc, msg)) { 593c0dd49bdSEiji Ota ret = -EFAULT; 594c0dd49bdSEiji Ota goto out; 595c0dd49bdSEiji Ota } 596c0dd49bdSEiji Ota 597c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_delivered); 598c0dd49bdSEiji Ota 599c0dd49bdSEiji Ota if (msg->msg_namelen) { 600c0dd49bdSEiji Ota sin = kmem_alloc(sizeof (struct sockaddr_in), KM_SLEEP); 601c0dd49bdSEiji Ota sin->sin_family = AF_INET_OFFLOAD; 602c0dd49bdSEiji Ota sin->sin_port = inc->i_hdr.h_sport; 603c0dd49bdSEiji Ota sin->sin_addr.s_addr = inc->i_saddr; 604c0dd49bdSEiji Ota (void) memset(sin->sin_zero, 0, 605c0dd49bdSEiji Ota sizeof (sin->sin_zero)); 606c0dd49bdSEiji Ota msg->msg_namelen = sizeof (struct 
sockaddr_in); 607c0dd49bdSEiji Ota msg->msg_name = sin; 608c0dd49bdSEiji Ota } 609c0dd49bdSEiji Ota break; 610c0dd49bdSEiji Ota } 611c0dd49bdSEiji Ota 612c0dd49bdSEiji Ota if (inc) 613c0dd49bdSEiji Ota rdsv3_inc_put(inc); 614c0dd49bdSEiji Ota 615c0dd49bdSEiji Ota out: 6169b3d509cSEiji Ota if (msg && msg->msg_control == NULL) 6179b3d509cSEiji Ota msg->msg_controllen = 0; 6189b3d509cSEiji Ota 619c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recvmsg", "Return(rs: %p, ret: %d)", rs, ret); 620c0dd49bdSEiji Ota 621c0dd49bdSEiji Ota return (ret); 622c0dd49bdSEiji Ota } 623c0dd49bdSEiji Ota 624c0dd49bdSEiji Ota /* 625c0dd49bdSEiji Ota * The socket is being shut down and we're asked to drop messages that were 626c0dd49bdSEiji Ota * queued for recvmsg. The caller has unbound the socket so the receive path 627c0dd49bdSEiji Ota * won't queue any more incoming fragments or messages on the socket. 628c0dd49bdSEiji Ota */ 629c0dd49bdSEiji Ota void 630c0dd49bdSEiji Ota rdsv3_clear_recv_queue(struct rdsv3_sock *rs) 631c0dd49bdSEiji Ota { 632c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 633c0dd49bdSEiji Ota struct rdsv3_incoming *inc, *tmp; 634c0dd49bdSEiji Ota 635c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Enter(rs: %p)", rs); 636c0dd49bdSEiji Ota 637c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER); 638c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(inc, tmp, &rs->rs_recv_queue, i_item) { 639c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 640c0dd49bdSEiji Ota -ntohl(inc->i_hdr.h_len), 641c0dd49bdSEiji Ota inc->i_hdr.h_dport); 642c0dd49bdSEiji Ota list_remove_node(&inc->i_item); 643c0dd49bdSEiji Ota rdsv3_inc_put(inc); 644c0dd49bdSEiji Ota } 645c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 646c0dd49bdSEiji Ota 647c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Return(rs: %p)", rs); 648c0dd49bdSEiji Ota } 649c0dd49bdSEiji Ota 650c0dd49bdSEiji Ota /* 651c0dd49bdSEiji Ota * inc->i_saddr isn't used here because it is only 
set in the receive 652c0dd49bdSEiji Ota * path. 653c0dd49bdSEiji Ota */ 654c0dd49bdSEiji Ota void 655c0dd49bdSEiji Ota rdsv3_inc_info_copy(struct rdsv3_incoming *inc, 656c0dd49bdSEiji Ota struct rdsv3_info_iterator *iter, 657c0dd49bdSEiji Ota uint32_be_t saddr, uint32_be_t daddr, int flip) 658c0dd49bdSEiji Ota { 659fe817b60SEiji Ota struct rds_info_message minfo; 660c0dd49bdSEiji Ota 661c0dd49bdSEiji Ota minfo.seq = ntohll(inc->i_hdr.h_sequence); 662c0dd49bdSEiji Ota minfo.len = ntohl(inc->i_hdr.h_len); 663c0dd49bdSEiji Ota 664c0dd49bdSEiji Ota if (flip) { 665c0dd49bdSEiji Ota minfo.laddr = daddr; 666c0dd49bdSEiji Ota minfo.faddr = saddr; 667c0dd49bdSEiji Ota minfo.lport = inc->i_hdr.h_dport; 668c0dd49bdSEiji Ota minfo.fport = inc->i_hdr.h_sport; 669c0dd49bdSEiji Ota } else { 670c0dd49bdSEiji Ota minfo.laddr = saddr; 671c0dd49bdSEiji Ota minfo.faddr = daddr; 672c0dd49bdSEiji Ota minfo.lport = inc->i_hdr.h_sport; 673c0dd49bdSEiji Ota minfo.fport = inc->i_hdr.h_dport; 674c0dd49bdSEiji Ota } 675c0dd49bdSEiji Ota 676c0dd49bdSEiji Ota rdsv3_info_copy(iter, &minfo, sizeof (minfo)); 677c0dd49bdSEiji Ota } 678