1c0dd49bdSEiji Ota /* 216e76cddSagiri * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3c0dd49bdSEiji Ota */ 416e76cddSagiri 5c0dd49bdSEiji Ota /* 616e76cddSagiri * This file contains code imported from the OFED rds source file recv.c 716e76cddSagiri * Oracle elects to have and use the contents of rds_recv.c under and governed 816e76cddSagiri * by the OpenIB.org BSD license (see below for full license text). However, 916e76cddSagiri * the following notice accompanied the original version of this file: 10c0dd49bdSEiji Ota */ 11c0dd49bdSEiji Ota 12c0dd49bdSEiji Ota /* 13c0dd49bdSEiji Ota * Copyright (c) 2006 Oracle. All rights reserved. 14c0dd49bdSEiji Ota * 15c0dd49bdSEiji Ota * This software is available to you under a choice of one of two 16c0dd49bdSEiji Ota * licenses. You may choose to be licensed under the terms of the GNU 17c0dd49bdSEiji Ota * General Public License (GPL) Version 2, available from the file 18c0dd49bdSEiji Ota * COPYING in the main directory of this source tree, or the 19c0dd49bdSEiji Ota * OpenIB.org BSD license below: 20c0dd49bdSEiji Ota * 21c0dd49bdSEiji Ota * Redistribution and use in source and binary forms, with or 22c0dd49bdSEiji Ota * without modification, are permitted provided that the following 23c0dd49bdSEiji Ota * conditions are met: 24c0dd49bdSEiji Ota * 25c0dd49bdSEiji Ota * - Redistributions of source code must retain the above 26c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 27c0dd49bdSEiji Ota * disclaimer. 28c0dd49bdSEiji Ota * 29c0dd49bdSEiji Ota * - Redistributions in binary form must reproduce the above 30c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 31c0dd49bdSEiji Ota * disclaimer in the documentation and/or other materials 32c0dd49bdSEiji Ota * provided with the distribution. 33c0dd49bdSEiji Ota * 34c0dd49bdSEiji Ota * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35c0dd49bdSEiji Ota * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36c0dd49bdSEiji Ota * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37c0dd49bdSEiji Ota * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38c0dd49bdSEiji Ota * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39c0dd49bdSEiji Ota * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40c0dd49bdSEiji Ota * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41c0dd49bdSEiji Ota * SOFTWARE. 42c0dd49bdSEiji Ota * 43c0dd49bdSEiji Ota */ 44c0dd49bdSEiji Ota #include <sys/rds.h> 45c0dd49bdSEiji Ota 46c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h> 47c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdma.h> 48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 49c0dd49bdSEiji Ota 50c0dd49bdSEiji Ota void 51c0dd49bdSEiji Ota rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn, 52c0dd49bdSEiji Ota uint32_be_t saddr) 53c0dd49bdSEiji Ota { 54c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_inc_init", "Enter(inc: %p, conn: %p)", inc, conn); 55c0dd49bdSEiji Ota inc->i_refcount = 1; 56c0dd49bdSEiji Ota list_link_init(&inc->i_item); 57c0dd49bdSEiji Ota inc->i_conn = conn; 58c0dd49bdSEiji Ota inc->i_saddr = saddr; 59c0dd49bdSEiji Ota inc->i_rdma_cookie = 0; 60c0dd49bdSEiji Ota } 61c0dd49bdSEiji Ota 62c0dd49bdSEiji Ota void 63c0dd49bdSEiji Ota rdsv3_inc_addref(struct rdsv3_incoming *inc) 64c0dd49bdSEiji Ota { 65c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_inc_addref", 66c0dd49bdSEiji Ota "addref inc %p ref %d", inc, atomic_get(&inc->i_refcount)); 67c0dd49bdSEiji Ota atomic_add_32(&inc->i_refcount, 1); 68c0dd49bdSEiji Ota } 69c0dd49bdSEiji Ota 70c0dd49bdSEiji Ota void 71c0dd49bdSEiji Ota rdsv3_inc_put(struct rdsv3_incoming *inc) 72c0dd49bdSEiji Ota { 73c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_inc_put", "put inc %p ref %d", 74c0dd49bdSEiji Ota inc, atomic_get(&inc->i_refcount)); 75c0dd49bdSEiji Ota if (atomic_dec_and_test(&inc->i_refcount)) { 76c0dd49bdSEiji Ota ASSERT(!list_link_active(&inc->i_item)); 77c0dd49bdSEiji Ota 78c0dd49bdSEiji Ota inc->i_conn->c_trans->inc_free(inc); 79c0dd49bdSEiji Ota } 80c0dd49bdSEiji Ota } 81c0dd49bdSEiji Ota 82c0dd49bdSEiji Ota /*ARGSUSED*/ 83c0dd49bdSEiji Ota static void 84c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(struct rdsv3_sock *rs, struct rsock *sk, 85c0dd49bdSEiji Ota struct rdsv3_cong_map *map, 86c0dd49bdSEiji Ota int delta, uint16_be_t port) 87c0dd49bdSEiji Ota { 88c0dd49bdSEiji Ota int now_congested; 89c0dd49bdSEiji Ota 90c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", 91c0dd49bdSEiji Ota "Enter(rs: %p, map: %p, delta: %d, port: %d)", 92c0dd49bdSEiji Ota rs, map, delta, port); 93c0dd49bdSEiji Ota 94c0dd49bdSEiji Ota if (delta == 0) 95c0dd49bdSEiji Ota return; 96c0dd49bdSEiji Ota 97c0dd49bdSEiji Ota rs->rs_rcv_bytes += delta; 98c0dd49bdSEiji Ota now_congested = rs->rs_rcv_bytes > rdsv3_sk_rcvbuf(rs); 99c0dd49bdSEiji Ota 100c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_rcvbuf_delta", 101c0dd49bdSEiji Ota "rs %p (%u.%u.%u.%u:%u) recv bytes %d buf %d " 102c0dd49bdSEiji Ota "now_cong %d delta %d", 103c0dd49bdSEiji Ota rs, NIPQUAD(rs->rs_bound_addr), 104c0dd49bdSEiji Ota (int)ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, 105c0dd49bdSEiji Ota rdsv3_sk_rcvbuf(rs), now_congested, delta); 106c0dd49bdSEiji Ota 107c0dd49bdSEiji Ota /* wasn't -> am congested */ 108c0dd49bdSEiji Ota if (!rs->rs_congested && now_congested) { 109c0dd49bdSEiji Ota rs->rs_congested = 1; 110c0dd49bdSEiji Ota rdsv3_cong_set_bit(map, port); 111c0dd49bdSEiji Ota rdsv3_cong_queue_updates(map); 112c0dd49bdSEiji Ota } 113c0dd49bdSEiji Ota /* was -> aren't congested */ 114c0dd49bdSEiji Ota /* 115c0dd49bdSEiji Ota * Require more free space before reporting uncongested to prevent 116c0dd49bdSEiji Ota * bouncing cong/uncong state too often 117c0dd49bdSEiji Ota */ 118c0dd49bdSEiji Ota else if (rs->rs_congested && 119c0dd49bdSEiji Ota (rs->rs_rcv_bytes < (rdsv3_sk_rcvbuf(rs)/2))) { 120c0dd49bdSEiji Ota rs->rs_congested = 0; 121c0dd49bdSEiji Ota rdsv3_cong_clear_bit(map, port); 122c0dd49bdSEiji Ota rdsv3_cong_queue_updates(map); 123c0dd49bdSEiji Ota } 124c0dd49bdSEiji Ota 125c0dd49bdSEiji Ota /* do nothing if no change in cong state */ 126c0dd49bdSEiji Ota 127c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", "Return(rs: %p)", rs); 128c0dd49bdSEiji Ota } 129c0dd49bdSEiji Ota 130c0dd49bdSEiji Ota /* 131c0dd49bdSEiji Ota * Process all extension headers that come with this message. 132c0dd49bdSEiji Ota */ 133c0dd49bdSEiji Ota static void 134c0dd49bdSEiji Ota rdsv3_recv_incoming_exthdrs(struct rdsv3_incoming *inc, struct rdsv3_sock *rs) 135c0dd49bdSEiji Ota { 136c0dd49bdSEiji Ota struct rdsv3_header *hdr = &inc->i_hdr; 137c0dd49bdSEiji Ota unsigned int pos = 0, type, len; 138c0dd49bdSEiji Ota union { 139c0dd49bdSEiji Ota struct rdsv3_ext_header_version version; 140c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma rdma; 141c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma_dest rdma_dest; 142c0dd49bdSEiji Ota } buffer; 143c0dd49bdSEiji Ota 144c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Enter"); 145c0dd49bdSEiji Ota while (1) { 146c0dd49bdSEiji Ota len = sizeof (buffer); 147c0dd49bdSEiji Ota type = rdsv3_message_next_extension(hdr, &pos, &buffer, &len); 148c0dd49bdSEiji Ota if (type == RDSV3_EXTHDR_NONE) 149c0dd49bdSEiji Ota break; 150c0dd49bdSEiji Ota RDSV3_DPRINTF4("recv_incoming_exthdrs", "type %d", type); 151c0dd49bdSEiji Ota /* Process extension header here */ 152c0dd49bdSEiji Ota switch (type) { 153c0dd49bdSEiji Ota case RDSV3_EXTHDR_RDMA: 154c0dd49bdSEiji Ota rdsv3_rdma_unuse(rs, ntohl(buffer.rdma.h_rdma_rkey), 155c0dd49bdSEiji Ota 0); 156c0dd49bdSEiji Ota break; 157c0dd49bdSEiji Ota 158c0dd49bdSEiji Ota case RDSV3_EXTHDR_RDMA_DEST: 159c0dd49bdSEiji Ota /* 160c0dd49bdSEiji Ota * We ignore the size for now. We could stash it 161c0dd49bdSEiji Ota * somewhere and use it for error checking. 162c0dd49bdSEiji Ota */ 163c0dd49bdSEiji Ota inc->i_rdma_cookie = rdsv3_rdma_make_cookie( 164c0dd49bdSEiji Ota ntohl(buffer.rdma_dest.h_rdma_rkey), 165c0dd49bdSEiji Ota ntohl(buffer.rdma_dest.h_rdma_offset)); 166c0dd49bdSEiji Ota 167c0dd49bdSEiji Ota break; 168c0dd49bdSEiji Ota } 169c0dd49bdSEiji Ota } 170c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Return"); 171c0dd49bdSEiji Ota } 172c0dd49bdSEiji Ota 173c0dd49bdSEiji Ota /* 174c0dd49bdSEiji Ota * The transport must make sure that this is serialized against other 175c0dd49bdSEiji Ota * rx and conn reset on this specific conn. 176c0dd49bdSEiji Ota * 177c0dd49bdSEiji Ota * We currently assert that only one fragmented message will be sent 178c0dd49bdSEiji Ota * down a connection at a time. This lets us reassemble in the conn 179c0dd49bdSEiji Ota * instead of per-flow which means that we don't have to go digging through 180c0dd49bdSEiji Ota * flows to tear down partial reassembly progress on conn failure and 181c0dd49bdSEiji Ota * we save flow lookup and locking for each frag arrival. It does mean 182c0dd49bdSEiji Ota * that small messages will wait behind large ones. Fragmenting at all 183c0dd49bdSEiji Ota * is only to reduce the memory consumption of pre-posted buffers. 184c0dd49bdSEiji Ota * 185c0dd49bdSEiji Ota * The caller passes in saddr and daddr instead of us getting it from the 186c0dd49bdSEiji Ota * conn. This lets loopback, who only has one conn for both directions, 187c0dd49bdSEiji Ota * tell us which roles the addrs in the conn are playing for this message. 188c0dd49bdSEiji Ota */ 189c0dd49bdSEiji Ota /* ARGSUSED */ 190c0dd49bdSEiji Ota void 191c0dd49bdSEiji Ota rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr, 192c0dd49bdSEiji Ota uint32_be_t daddr, struct rdsv3_incoming *inc, int gfp) 193c0dd49bdSEiji Ota { 194c0dd49bdSEiji Ota struct rdsv3_sock *rs = NULL; 195c0dd49bdSEiji Ota struct rsock *sk; 196c0dd49bdSEiji Ota 197c0dd49bdSEiji Ota inc->i_conn = conn; 198c0dd49bdSEiji Ota inc->i_rx_jiffies = jiffies; 199c0dd49bdSEiji Ota 200c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_incoming", 201c0dd49bdSEiji Ota "conn %p next %llu inc %p seq %llu len %u sport %u dport %u " 202c0dd49bdSEiji Ota "flags 0x%x rx_jiffies %lu", conn, 203c0dd49bdSEiji Ota (unsigned long long)conn->c_next_rx_seq, 204c0dd49bdSEiji Ota inc, 205c0dd49bdSEiji Ota (unsigned long long)ntohll(inc->i_hdr.h_sequence), 206c0dd49bdSEiji Ota ntohl(inc->i_hdr.h_len), 207c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_sport), 208c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_dport), 209c0dd49bdSEiji Ota inc->i_hdr.h_flags, 210c0dd49bdSEiji Ota inc->i_rx_jiffies); 211c0dd49bdSEiji Ota 212c0dd49bdSEiji Ota /* 213c0dd49bdSEiji Ota * Sequence numbers should only increase. Messages get their 214c0dd49bdSEiji Ota * sequence number as they're queued in a sending conn. They 215c0dd49bdSEiji Ota * can be dropped, though, if the sending socket is closed before 216c0dd49bdSEiji Ota * they hit the wire. So sequence numbers can skip forward 217c0dd49bdSEiji Ota * under normal operation. They can also drop back in the conn 218c0dd49bdSEiji Ota * failover case as previously sent messages are resent down the 219c0dd49bdSEiji Ota * new instance of a conn. We drop those, otherwise we have 220c0dd49bdSEiji Ota * to assume that the next valid seq does not come after a 221c0dd49bdSEiji Ota * hole in the fragment stream. 222c0dd49bdSEiji Ota * 223c0dd49bdSEiji Ota * The headers don't give us a way to realize if fragments of 224c0dd49bdSEiji Ota * a message have been dropped. We assume that frags that arrive 225c0dd49bdSEiji Ota * to a flow are part of the current message on the flow that is 226c0dd49bdSEiji Ota * being reassembled. This means that senders can't drop messages 227c0dd49bdSEiji Ota * from the sending conn until all their frags are sent. 228c0dd49bdSEiji Ota * 229c0dd49bdSEiji Ota * XXX we could spend more on the wire to get more robust failure 230c0dd49bdSEiji Ota * detection, arguably worth it to avoid data corruption. 231c0dd49bdSEiji Ota */ 232c0dd49bdSEiji Ota if (ntohll(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && 233c0dd49bdSEiji Ota (inc->i_hdr.h_flags & RDSV3_FLAG_RETRANSMITTED)) { 234c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_old_seq); 235c0dd49bdSEiji Ota goto out; 236c0dd49bdSEiji Ota } 237c0dd49bdSEiji Ota conn->c_next_rx_seq = ntohll(inc->i_hdr.h_sequence) + 1; 238c0dd49bdSEiji Ota 239c0dd49bdSEiji Ota if (rdsv3_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { 240c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_ping); 241c0dd49bdSEiji Ota (void) rdsv3_send_pong(conn, inc->i_hdr.h_sport); 242c0dd49bdSEiji Ota goto out; 243c0dd49bdSEiji Ota } 244c0dd49bdSEiji Ota 245*80166370Sagiri rs = rdsv3_find_bound(conn, inc->i_hdr.h_dport); 2465d5562f5SEiji Ota if (!rs) { 247c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_no_sock); 248c0dd49bdSEiji Ota goto out; 249c0dd49bdSEiji Ota } 250c0dd49bdSEiji Ota 251c0dd49bdSEiji Ota /* Process extension headers */ 252c0dd49bdSEiji Ota rdsv3_recv_incoming_exthdrs(inc, rs); 253c0dd49bdSEiji Ota 254c0dd49bdSEiji Ota /* We can be racing with rdsv3_release() which marks the socket dead. */ 255c0dd49bdSEiji Ota sk = rdsv3_rs_to_sk(rs); 256c0dd49bdSEiji Ota 257c0dd49bdSEiji Ota /* serialize with rdsv3_release -> sock_orphan */ 258c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER); 259c0dd49bdSEiji Ota if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD)) { 260c0dd49bdSEiji Ota int error, bytes; 261c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_incoming", 262c0dd49bdSEiji Ota "adding inc %p to rs %p's recv queue", inc, rs); 263c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_queued); 264c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 265c0dd49bdSEiji Ota ntohl(inc->i_hdr.h_len), 266c0dd49bdSEiji Ota inc->i_hdr.h_dport); 267c0dd49bdSEiji Ota rdsv3_inc_addref(inc); 268c0dd49bdSEiji Ota list_insert_tail(&rs->rs_recv_queue, inc); 269c0dd49bdSEiji Ota bytes = rs->rs_rcv_bytes; 270c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 271c0dd49bdSEiji Ota 272c0dd49bdSEiji Ota __rdsv3_wake_sk_sleep(sk); 273c0dd49bdSEiji Ota 274c0dd49bdSEiji Ota /* wake up anyone waiting in poll */ 275c0dd49bdSEiji Ota sk->sk_upcalls->su_recv(sk->sk_upper_handle, NULL, 276c0dd49bdSEiji Ota bytes, 0, &error, NULL); 277c0dd49bdSEiji Ota if (error != 0) { 278c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_recv_incoming", 279c0dd49bdSEiji Ota "su_recv returned: %d", error); 280c0dd49bdSEiji Ota } 281c0dd49bdSEiji Ota } else { 282c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_dead_sock); 283c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 284c0dd49bdSEiji Ota } 285c0dd49bdSEiji Ota 286c0dd49bdSEiji Ota out: 287c0dd49bdSEiji Ota if (rs) 288c0dd49bdSEiji Ota rdsv3_sock_put(rs); 289c0dd49bdSEiji Ota } 290c0dd49bdSEiji Ota 291c0dd49bdSEiji Ota /* 292c0dd49bdSEiji Ota * be very careful here. This is being called as the condition in 293c0dd49bdSEiji Ota * wait_event_*() needs to cope with being called many times. 294c0dd49bdSEiji Ota */ 295c0dd49bdSEiji Ota static int 296c0dd49bdSEiji Ota rdsv3_next_incoming(struct rdsv3_sock *rs, struct rdsv3_incoming **inc) 297c0dd49bdSEiji Ota { 2985d5562f5SEiji Ota if (!*inc) { 299c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_READER); 300c0dd49bdSEiji Ota if (!list_is_empty(&rs->rs_recv_queue)) { 301c0dd49bdSEiji Ota *inc = list_head(&rs->rs_recv_queue); 302c0dd49bdSEiji Ota rdsv3_inc_addref(*inc); 303c0dd49bdSEiji Ota } 304c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 305c0dd49bdSEiji Ota } 306c0dd49bdSEiji Ota 307c0dd49bdSEiji Ota return (*inc != NULL); 308c0dd49bdSEiji Ota } 309c0dd49bdSEiji Ota 310c0dd49bdSEiji Ota static int 311c0dd49bdSEiji Ota rdsv3_still_queued(struct rdsv3_sock *rs, struct rdsv3_incoming *inc, 312c0dd49bdSEiji Ota int drop) 313c0dd49bdSEiji Ota { 314c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 315c0dd49bdSEiji Ota int ret = 0; 316c0dd49bdSEiji Ota 317c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_still_queued", "Enter rs: %p inc: %p drop: %d", 318c0dd49bdSEiji Ota rs, inc, drop); 319c0dd49bdSEiji Ota 320c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER); 321c0dd49bdSEiji Ota if (list_link_active(&inc->i_item)) { 322c0dd49bdSEiji Ota ret = 1; 323c0dd49bdSEiji Ota if (drop) { 324c0dd49bdSEiji Ota /* XXX make sure this i_conn is reliable */ 325c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 326c0dd49bdSEiji Ota -ntohl(inc->i_hdr.h_len), 327c0dd49bdSEiji Ota inc->i_hdr.h_dport); 328c0dd49bdSEiji Ota list_remove_node(&inc->i_item); 329c0dd49bdSEiji Ota rdsv3_inc_put(inc); 330c0dd49bdSEiji Ota } 331c0dd49bdSEiji Ota } 332c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 333c0dd49bdSEiji Ota 334c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_still_queued", 335c0dd49bdSEiji Ota "inc %p rs %p still %d dropped %d", inc, rs, ret, drop); 336c0dd49bdSEiji Ota return (ret); 337c0dd49bdSEiji Ota } 338c0dd49bdSEiji Ota 339c0dd49bdSEiji Ota /* 340c0dd49bdSEiji Ota * Pull errors off the error queue. 341c0dd49bdSEiji Ota * If msghdr is NULL, we will just purge the error queue. 342c0dd49bdSEiji Ota */ 343c0dd49bdSEiji Ota int 344c0dd49bdSEiji Ota rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msghdr) 345c0dd49bdSEiji Ota { 346c0dd49bdSEiji Ota struct rdsv3_notifier *notifier; 347fe817b60SEiji Ota struct rds_rdma_notify cmsg; 348c0dd49bdSEiji Ota unsigned int count = 0, max_messages = ~0U; 349c0dd49bdSEiji Ota list_t copy; 350c0dd49bdSEiji Ota int err = 0; 351c0dd49bdSEiji Ota 352c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Enter(rs: %p)", rs); 353c0dd49bdSEiji Ota 354c0dd49bdSEiji Ota list_create(©, sizeof (struct rdsv3_notifier), 355c0dd49bdSEiji Ota offsetof(struct rdsv3_notifier, n_list)); 356c0dd49bdSEiji Ota 357c0dd49bdSEiji Ota 358c0dd49bdSEiji Ota /* 359c0dd49bdSEiji Ota * put_cmsg copies to user space and thus may sleep. We can't do this 360c0dd49bdSEiji Ota * with rs_lock held, so first grab as many notifications as we can 361c0dd49bdSEiji Ota * stuff 362c0dd49bdSEiji Ota * in the user provided cmsg buffer. We don't try to copy more, to avoid 363c0dd49bdSEiji Ota * losing notifications - except when the buffer is so small that 364c0dd49bdSEiji Ota * it wouldn't 365c0dd49bdSEiji Ota * even hold a single notification. Then we give him as much of this 366c0dd49bdSEiji Ota * single 367c0dd49bdSEiji Ota * msg as we can squeeze in, and set MSG_CTRUNC. 368c0dd49bdSEiji Ota */ 369c0dd49bdSEiji Ota if (msghdr) { 370c0dd49bdSEiji Ota max_messages = 371c0dd49bdSEiji Ota msghdr->msg_controllen / CMSG_SPACE(sizeof (cmsg)); 372c0dd49bdSEiji Ota if (!max_messages) 373c0dd49bdSEiji Ota max_messages = 1; 374c0dd49bdSEiji Ota } 375c0dd49bdSEiji Ota 376c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 377c0dd49bdSEiji Ota while (!list_is_empty(&rs->rs_notify_queue) && count < max_messages) { 378c0dd49bdSEiji Ota notifier = list_remove_head(&rs->rs_notify_queue); 379c0dd49bdSEiji Ota list_insert_tail(©, notifier); 380c0dd49bdSEiji Ota count++; 381c0dd49bdSEiji Ota } 382c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 383c0dd49bdSEiji Ota 384c0dd49bdSEiji Ota if (!count) 385c0dd49bdSEiji Ota return (0); 386c0dd49bdSEiji Ota 387c0dd49bdSEiji Ota while (!list_is_empty(©)) { 388c0dd49bdSEiji Ota notifier = list_remove_head(©); 389c0dd49bdSEiji Ota 390c0dd49bdSEiji Ota if (msghdr) { 391c0dd49bdSEiji Ota cmsg.user_token = notifier->n_user_token; 392c0dd49bdSEiji Ota cmsg.status = notifier->n_status; 393c0dd49bdSEiji Ota 394c0dd49bdSEiji Ota err = rdsv3_put_cmsg(msghdr, SOL_RDS, 395fe817b60SEiji Ota RDS_CMSG_RDMA_STATUS, sizeof (cmsg), &cmsg); 396c0dd49bdSEiji Ota if (err) 397c0dd49bdSEiji Ota break; 398c0dd49bdSEiji Ota } 399c0dd49bdSEiji Ota 400c0dd49bdSEiji Ota kmem_free(notifier, sizeof (struct rdsv3_notifier)); 401c0dd49bdSEiji Ota } 402c0dd49bdSEiji Ota 403c0dd49bdSEiji Ota /* 404c0dd49bdSEiji Ota * If we bailed out because of an error in put_cmsg, 405c0dd49bdSEiji Ota * we may be left with one or more notifications that we 406c0dd49bdSEiji Ota * didn't process. Return them to the head of the list. 407c0dd49bdSEiji Ota */ 408c0dd49bdSEiji Ota if (!list_is_empty(©)) { 409c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 410c0dd49bdSEiji Ota list_splice(©, &rs->rs_notify_queue); 411c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 412c0dd49bdSEiji Ota } 413c0dd49bdSEiji Ota 414c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Return(rs: %p)", rs); 415c0dd49bdSEiji Ota 416c0dd49bdSEiji Ota return (err); 417c0dd49bdSEiji Ota } 418c0dd49bdSEiji Ota 419c0dd49bdSEiji Ota /* 420c0dd49bdSEiji Ota * Queue a congestion notification 421c0dd49bdSEiji Ota */ 422c0dd49bdSEiji Ota static int 423c0dd49bdSEiji Ota rdsv3_notify_cong(struct rdsv3_sock *rs, struct msghdr *msghdr) 424c0dd49bdSEiji Ota { 425c0dd49bdSEiji Ota uint64_t notify = rs->rs_cong_notify; 426c0dd49bdSEiji Ota int err; 427c0dd49bdSEiji Ota 428fe817b60SEiji Ota err = rdsv3_put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, 429c0dd49bdSEiji Ota sizeof (notify), ¬ify); 430c0dd49bdSEiji Ota if (err) 431c0dd49bdSEiji Ota return (err); 432c0dd49bdSEiji Ota 433c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 434c0dd49bdSEiji Ota rs->rs_cong_notify &= ~notify; 435c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 436c0dd49bdSEiji Ota 437c0dd49bdSEiji Ota return (0); 438c0dd49bdSEiji Ota } 439c0dd49bdSEiji Ota 440c0dd49bdSEiji Ota /* 441c0dd49bdSEiji Ota * Receive any control messages. 442c0dd49bdSEiji Ota */ 443c0dd49bdSEiji Ota static int 444c0dd49bdSEiji Ota rdsv3_cmsg_recv(struct rdsv3_incoming *inc, struct msghdr *msg) 445c0dd49bdSEiji Ota { 446fe817b60SEiji Ota return (rdsv3_put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, 447c0dd49bdSEiji Ota sizeof (inc->i_rdma_cookie), &inc->i_rdma_cookie)); 448c0dd49bdSEiji Ota } 449c0dd49bdSEiji Ota 450c0dd49bdSEiji Ota int 451c0dd49bdSEiji Ota rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio, 452c0dd49bdSEiji Ota struct nmsghdr *msg, size_t size, int msg_flags) 453c0dd49bdSEiji Ota { 454c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 455c0dd49bdSEiji Ota long timeo; 456b27516f5Sagiri int ret = 0; 457c0dd49bdSEiji Ota struct sockaddr_in *sin = NULL; 458c0dd49bdSEiji Ota struct rdsv3_incoming *inc = NULL; 459b27516f5Sagiri boolean_t nonblock = B_FALSE; 460c0dd49bdSEiji Ota 461c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recvmsg", 462c0dd49bdSEiji Ota "Enter(rs: %p size: %d msg_flags: 0x%x)", rs, size, msg_flags); 463c0dd49bdSEiji Ota 464b27516f5Sagiri if ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) || 465b27516f5Sagiri (msg_flags & MSG_DONTWAIT)) 466b27516f5Sagiri nonblock = B_TRUE; 467b27516f5Sagiri 468c0dd49bdSEiji Ota /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ 469c0dd49bdSEiji Ota timeo = rdsv3_rcvtimeo(sk, nonblock); 470c0dd49bdSEiji Ota 471c0dd49bdSEiji Ota if (msg_flags & MSG_OOB) 472c0dd49bdSEiji Ota goto out; 473c0dd49bdSEiji Ota 474c0dd49bdSEiji Ota /* mark the first cmsg position */ 475c0dd49bdSEiji Ota if (msg) { 476c0dd49bdSEiji Ota msg->msg_control = NULL; 477c0dd49bdSEiji Ota } 478c0dd49bdSEiji Ota 479c0dd49bdSEiji Ota while (1) { 480c0dd49bdSEiji Ota /* 481c0dd49bdSEiji Ota * If there are pending notifications, do those - 482c0dd49bdSEiji Ota * and nothing else 483c0dd49bdSEiji Ota */ 484c0dd49bdSEiji Ota if (!list_is_empty(&rs->rs_notify_queue)) { 485c0dd49bdSEiji Ota ret = rdsv3_notify_queue_get(rs, msg); 486c0dd49bdSEiji Ota 487c0dd49bdSEiji Ota if (msg && msg->msg_namelen) { 488c0dd49bdSEiji Ota sin = kmem_zalloc(sizeof (struct sockaddr_in), 489c0dd49bdSEiji Ota KM_SLEEP); 490c0dd49bdSEiji Ota sin->sin_family = AF_INET_OFFLOAD; 491c0dd49bdSEiji Ota if (inc) { 492c0dd49bdSEiji Ota sin->sin_port = inc->i_hdr.h_sport; 493c0dd49bdSEiji Ota sin->sin_addr.s_addr = inc->i_saddr; 494c0dd49bdSEiji Ota } 495c0dd49bdSEiji Ota msg->msg_namelen = sizeof (struct sockaddr_in); 496c0dd49bdSEiji Ota msg->msg_name = sin; 497c0dd49bdSEiji Ota } 498c0dd49bdSEiji Ota break; 499c0dd49bdSEiji Ota } 500c0dd49bdSEiji Ota 501c0dd49bdSEiji Ota if (rs->rs_cong_notify) { 502c0dd49bdSEiji Ota ret = rdsv3_notify_cong(rs, msg); 503c0dd49bdSEiji Ota goto out; 504c0dd49bdSEiji Ota } 505c0dd49bdSEiji Ota 506c0dd49bdSEiji Ota if (!rdsv3_next_incoming(rs, &inc)) { 507c0dd49bdSEiji Ota if (nonblock) { 508c0dd49bdSEiji Ota ret = -EAGAIN; 509c0dd49bdSEiji Ota break; 510c0dd49bdSEiji Ota } 511c0dd49bdSEiji Ota 512c0dd49bdSEiji Ota RDSV3_DPRINTF3("rdsv3_recvmsg", 513c0dd49bdSEiji Ota "Before wait (rs: %p)", rs); 514c0dd49bdSEiji Ota 5156e18d381Sagiri #if 0 5166e18d381Sagiri ret = rdsv3_wait_sig(sk->sk_sleep, 5176e18d381Sagiri !(list_is_empty(&rs->rs_notify_queue) && 5186e18d381Sagiri !rs->rs_cong_notify && 5196e18d381Sagiri !rdsv3_next_incoming(rs, &inc))); 5206e18d381Sagiri if (ret == 0) { 5216e18d381Sagiri /* signal/timeout pending */ 5226e18d381Sagiri RDSV3_DPRINTF2("rdsv3_recvmsg", 5236e18d381Sagiri "woke due to signal"); 5246e18d381Sagiri ret = -ERESTART; 5256e18d381Sagiri } 5266e18d381Sagiri #else 527c0dd49bdSEiji Ota mutex_enter(&sk->sk_sleep->waitq_mutex); 5286e18d381Sagiri sk->sk_sleep->waitq_waiters++; 529c0dd49bdSEiji Ota while ((list_is_empty(&rs->rs_notify_queue) && 530c0dd49bdSEiji Ota !rs->rs_cong_notify && 531c0dd49bdSEiji Ota !rdsv3_next_incoming(rs, &inc))) { 532c0dd49bdSEiji Ota ret = cv_wait_sig(&sk->sk_sleep->waitq_cv, 533c0dd49bdSEiji Ota &sk->sk_sleep->waitq_mutex); 534c0dd49bdSEiji Ota if (ret == 0) { 535c0dd49bdSEiji Ota /* signal/timeout pending */ 536c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_recvmsg", 537c0dd49bdSEiji Ota "woke due to signal"); 538c0dd49bdSEiji Ota ret = -ERESTART; 539c0dd49bdSEiji Ota break; 540c0dd49bdSEiji Ota } 541c0dd49bdSEiji Ota } 5426e18d381Sagiri sk->sk_sleep->waitq_waiters--; 543c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 5446e18d381Sagiri #endif 545c0dd49bdSEiji Ota 546c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recvmsg", 547c0dd49bdSEiji Ota "recvmsg woke rs: %p inc %p ret %d", 548c0dd49bdSEiji Ota rs, inc, -ret); 549c0dd49bdSEiji Ota 550c0dd49bdSEiji Ota if (ret < 0) 551c0dd49bdSEiji Ota break; 552c0dd49bdSEiji Ota 553c0dd49bdSEiji Ota /* 554c0dd49bdSEiji Ota * if the wakeup was due to rs_notify_queue or 555c0dd49bdSEiji Ota * rs_cong_notify then we need to handle those first. 556c0dd49bdSEiji Ota */ 557c0dd49bdSEiji Ota continue; 558c0dd49bdSEiji Ota } 559c0dd49bdSEiji Ota 560c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recvmsg", 561c0dd49bdSEiji Ota "copying inc %p from %u.%u.%u.%u:%u to user", inc, 562c0dd49bdSEiji Ota NIPQUAD(inc->i_conn->c_faddr), 563c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_sport)); 564cadbfdc3SEiji Ota 565c0dd49bdSEiji Ota ret = inc->i_conn->c_trans->inc_copy_to_user(inc, uio, size); 566c0dd49bdSEiji Ota if (ret < 0) 567c0dd49bdSEiji Ota break; 568c0dd49bdSEiji Ota 569c0dd49bdSEiji Ota /* 570c0dd49bdSEiji Ota * if the message we just copied isn't at the head of the 571c0dd49bdSEiji Ota * recv queue then someone else raced us to return it, try 572c0dd49bdSEiji Ota * to get the next message. 573c0dd49bdSEiji Ota */ 574c0dd49bdSEiji Ota if (!rdsv3_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { 575c0dd49bdSEiji Ota rdsv3_inc_put(inc); 576c0dd49bdSEiji Ota inc = NULL; 577c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_deliver_raced); 578c0dd49bdSEiji Ota continue; 579c0dd49bdSEiji Ota } 580c0dd49bdSEiji Ota 581c0dd49bdSEiji Ota if (ret < ntohl(inc->i_hdr.h_len)) { 582c0dd49bdSEiji Ota if (msg_flags & MSG_TRUNC) 583c0dd49bdSEiji Ota ret = ntohl(inc->i_hdr.h_len); 584c0dd49bdSEiji Ota msg->msg_flags |= MSG_TRUNC; 585c0dd49bdSEiji Ota } 586c0dd49bdSEiji Ota 587c0dd49bdSEiji Ota if (rdsv3_cmsg_recv(inc, msg)) { 588c0dd49bdSEiji Ota ret = -EFAULT; 589c0dd49bdSEiji Ota goto out; 590c0dd49bdSEiji Ota } 591c0dd49bdSEiji Ota 592c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_delivered); 593c0dd49bdSEiji Ota 594c0dd49bdSEiji Ota if (msg->msg_namelen) { 595c0dd49bdSEiji Ota sin = kmem_alloc(sizeof (struct sockaddr_in), KM_SLEEP); 596c0dd49bdSEiji Ota sin->sin_family = AF_INET_OFFLOAD; 597c0dd49bdSEiji Ota sin->sin_port = inc->i_hdr.h_sport; 598c0dd49bdSEiji Ota sin->sin_addr.s_addr = inc->i_saddr; 599c0dd49bdSEiji Ota (void) memset(sin->sin_zero, 0, 600c0dd49bdSEiji Ota sizeof (sin->sin_zero)); 601c0dd49bdSEiji Ota msg->msg_namelen = sizeof (struct sockaddr_in); 602c0dd49bdSEiji Ota msg->msg_name = sin; 603c0dd49bdSEiji Ota } 604c0dd49bdSEiji Ota break; 605c0dd49bdSEiji Ota } 606c0dd49bdSEiji Ota 607c0dd49bdSEiji Ota if (inc) 608c0dd49bdSEiji Ota rdsv3_inc_put(inc); 609c0dd49bdSEiji Ota 610c0dd49bdSEiji Ota out: 611c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recvmsg", "Return(rs: %p, ret: %d)", rs, ret); 612c0dd49bdSEiji Ota 613c0dd49bdSEiji Ota return (ret); 614c0dd49bdSEiji Ota } 615c0dd49bdSEiji Ota 616c0dd49bdSEiji Ota /* 617c0dd49bdSEiji Ota * The socket is being shut down and we're asked to drop messages that were 618c0dd49bdSEiji Ota * queued for recvmsg. The caller has unbound the socket so the receive path 619c0dd49bdSEiji Ota * won't queue any more incoming fragments or messages on the socket. 620c0dd49bdSEiji Ota */ 621c0dd49bdSEiji Ota void 622c0dd49bdSEiji Ota rdsv3_clear_recv_queue(struct rdsv3_sock *rs) 623c0dd49bdSEiji Ota { 624c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 625c0dd49bdSEiji Ota struct rdsv3_incoming *inc, *tmp; 626c0dd49bdSEiji Ota 627c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Enter(rs: %p)", rs); 628c0dd49bdSEiji Ota 629c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER); 630c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(inc, tmp, &rs->rs_recv_queue, i_item) { 631c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, 632c0dd49bdSEiji Ota -ntohl(inc->i_hdr.h_len), 633c0dd49bdSEiji Ota inc->i_hdr.h_dport); 634c0dd49bdSEiji Ota list_remove_node(&inc->i_item); 635c0dd49bdSEiji Ota rdsv3_inc_put(inc); 636c0dd49bdSEiji Ota } 637c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock); 638c0dd49bdSEiji Ota 639c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Return(rs: %p)", rs); 640c0dd49bdSEiji Ota } 641c0dd49bdSEiji Ota 642c0dd49bdSEiji Ota /* 643c0dd49bdSEiji Ota * inc->i_saddr isn't used here because it is only set in the receive 644c0dd49bdSEiji Ota * path. 645c0dd49bdSEiji Ota */ 646c0dd49bdSEiji Ota void 647c0dd49bdSEiji Ota rdsv3_inc_info_copy(struct rdsv3_incoming *inc, 648c0dd49bdSEiji Ota struct rdsv3_info_iterator *iter, 649c0dd49bdSEiji Ota uint32_be_t saddr, uint32_be_t daddr, int flip) 650c0dd49bdSEiji Ota { 651fe817b60SEiji Ota struct rds_info_message minfo; 652c0dd49bdSEiji Ota 653c0dd49bdSEiji Ota minfo.seq = ntohll(inc->i_hdr.h_sequence); 654c0dd49bdSEiji Ota minfo.len = ntohl(inc->i_hdr.h_len); 655c0dd49bdSEiji Ota 656c0dd49bdSEiji Ota if (flip) { 657c0dd49bdSEiji Ota minfo.laddr = daddr; 658c0dd49bdSEiji Ota minfo.faddr = saddr; 659c0dd49bdSEiji Ota minfo.lport = inc->i_hdr.h_dport; 660c0dd49bdSEiji Ota minfo.fport = inc->i_hdr.h_sport; 661c0dd49bdSEiji Ota } else { 662c0dd49bdSEiji Ota minfo.laddr = saddr; 663c0dd49bdSEiji Ota minfo.faddr = daddr; 664c0dd49bdSEiji Ota minfo.lport = inc->i_hdr.h_sport; 665c0dd49bdSEiji Ota minfo.fport = inc->i_hdr.h_dport; 666c0dd49bdSEiji Ota } 667c0dd49bdSEiji Ota 668c0dd49bdSEiji Ota rdsv3_info_copy(iter, &minfo, sizeof (minfo)); 669c0dd49bdSEiji Ota } 670