/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */
516e76cddSagiri
/*
 * This file contains code imported from the OFED rds source file recv.c
 * Oracle elects to have and use the contents of rds_recv.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */
12c0dd49bdSEiji Ota
/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
45c0dd49bdSEiji Ota #include <sys/rds.h>
46c0dd49bdSEiji Ota
47c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h>
48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdma.h>
49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
50c0dd49bdSEiji Ota
51c0dd49bdSEiji Ota void
rdsv3_inc_init(struct rdsv3_incoming * inc,struct rdsv3_connection * conn,uint32_be_t saddr)52c0dd49bdSEiji Ota rdsv3_inc_init(struct rdsv3_incoming *inc, struct rdsv3_connection *conn,
53c0dd49bdSEiji Ota uint32_be_t saddr)
54c0dd49bdSEiji Ota {
55c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_inc_init", "Enter(inc: %p, conn: %p)", inc, conn);
56c0dd49bdSEiji Ota inc->i_refcount = 1;
57c0dd49bdSEiji Ota list_link_init(&inc->i_item);
58c0dd49bdSEiji Ota inc->i_conn = conn;
59c0dd49bdSEiji Ota inc->i_saddr = saddr;
60c0dd49bdSEiji Ota inc->i_rdma_cookie = 0;
61c0dd49bdSEiji Ota }
62c0dd49bdSEiji Ota
63c0dd49bdSEiji Ota void
rdsv3_inc_addref(struct rdsv3_incoming * inc)64c0dd49bdSEiji Ota rdsv3_inc_addref(struct rdsv3_incoming *inc)
65c0dd49bdSEiji Ota {
66c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_inc_addref",
67c0dd49bdSEiji Ota "addref inc %p ref %d", inc, atomic_get(&inc->i_refcount));
681a5e258fSJosef 'Jeff' Sipek atomic_inc_32(&inc->i_refcount);
69c0dd49bdSEiji Ota }
70c0dd49bdSEiji Ota
71c0dd49bdSEiji Ota void
rdsv3_inc_put(struct rdsv3_incoming * inc)72c0dd49bdSEiji Ota rdsv3_inc_put(struct rdsv3_incoming *inc)
73c0dd49bdSEiji Ota {
74c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_inc_put", "put inc %p ref %d",
75c0dd49bdSEiji Ota inc, atomic_get(&inc->i_refcount));
76c0dd49bdSEiji Ota if (atomic_dec_and_test(&inc->i_refcount)) {
77c0dd49bdSEiji Ota ASSERT(!list_link_active(&inc->i_item));
78c0dd49bdSEiji Ota
79c0dd49bdSEiji Ota inc->i_conn->c_trans->inc_free(inc);
80c0dd49bdSEiji Ota }
81c0dd49bdSEiji Ota }
82c0dd49bdSEiji Ota
83c0dd49bdSEiji Ota /*ARGSUSED*/
84c0dd49bdSEiji Ota static void
rdsv3_recv_rcvbuf_delta(struct rdsv3_sock * rs,struct rsock * sk,struct rdsv3_cong_map * map,int delta,uint16_be_t port)85c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(struct rdsv3_sock *rs, struct rsock *sk,
86c0dd49bdSEiji Ota struct rdsv3_cong_map *map,
87c0dd49bdSEiji Ota int delta, uint16_be_t port)
88c0dd49bdSEiji Ota {
89c0dd49bdSEiji Ota int now_congested;
90c0dd49bdSEiji Ota
91c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta",
92c0dd49bdSEiji Ota "Enter(rs: %p, map: %p, delta: %d, port: %d)",
93c0dd49bdSEiji Ota rs, map, delta, port);
94c0dd49bdSEiji Ota
95c0dd49bdSEiji Ota if (delta == 0)
96c0dd49bdSEiji Ota return;
97c0dd49bdSEiji Ota
98c0dd49bdSEiji Ota rs->rs_rcv_bytes += delta;
99c0dd49bdSEiji Ota now_congested = rs->rs_rcv_bytes > rdsv3_sk_rcvbuf(rs);
100c0dd49bdSEiji Ota
101c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_rcvbuf_delta",
102c0dd49bdSEiji Ota "rs %p (%u.%u.%u.%u:%u) recv bytes %d buf %d "
103c0dd49bdSEiji Ota "now_cong %d delta %d",
104c0dd49bdSEiji Ota rs, NIPQUAD(rs->rs_bound_addr),
105c0dd49bdSEiji Ota (int)ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
106c0dd49bdSEiji Ota rdsv3_sk_rcvbuf(rs), now_congested, delta);
107c0dd49bdSEiji Ota
108c0dd49bdSEiji Ota /* wasn't -> am congested */
109c0dd49bdSEiji Ota if (!rs->rs_congested && now_congested) {
110c0dd49bdSEiji Ota rs->rs_congested = 1;
111c0dd49bdSEiji Ota rdsv3_cong_set_bit(map, port);
112c0dd49bdSEiji Ota rdsv3_cong_queue_updates(map);
113c0dd49bdSEiji Ota }
114c0dd49bdSEiji Ota /* was -> aren't congested */
115c0dd49bdSEiji Ota /*
116c0dd49bdSEiji Ota * Require more free space before reporting uncongested to prevent
117c0dd49bdSEiji Ota * bouncing cong/uncong state too often
118c0dd49bdSEiji Ota */
119c0dd49bdSEiji Ota else if (rs->rs_congested &&
120c0dd49bdSEiji Ota (rs->rs_rcv_bytes < (rdsv3_sk_rcvbuf(rs)/2))) {
121c0dd49bdSEiji Ota rs->rs_congested = 0;
122c0dd49bdSEiji Ota rdsv3_cong_clear_bit(map, port);
123c0dd49bdSEiji Ota rdsv3_cong_queue_updates(map);
124c0dd49bdSEiji Ota }
125c0dd49bdSEiji Ota
126c0dd49bdSEiji Ota /* do nothing if no change in cong state */
127c0dd49bdSEiji Ota
128c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_rcvbuf_delta", "Return(rs: %p)", rs);
129c0dd49bdSEiji Ota }
130c0dd49bdSEiji Ota
131c0dd49bdSEiji Ota /*
132c0dd49bdSEiji Ota * Process all extension headers that come with this message.
133c0dd49bdSEiji Ota */
134c0dd49bdSEiji Ota static void
rdsv3_recv_incoming_exthdrs(struct rdsv3_incoming * inc,struct rdsv3_sock * rs)135c0dd49bdSEiji Ota rdsv3_recv_incoming_exthdrs(struct rdsv3_incoming *inc, struct rdsv3_sock *rs)
136c0dd49bdSEiji Ota {
137c0dd49bdSEiji Ota struct rdsv3_header *hdr = &inc->i_hdr;
138c0dd49bdSEiji Ota unsigned int pos = 0, type, len;
139c0dd49bdSEiji Ota union {
140c0dd49bdSEiji Ota struct rdsv3_ext_header_version version;
141c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma rdma;
142c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma_dest rdma_dest;
143c0dd49bdSEiji Ota } buffer;
144c0dd49bdSEiji Ota
145c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Enter");
146c0dd49bdSEiji Ota while (1) {
147c0dd49bdSEiji Ota len = sizeof (buffer);
148c0dd49bdSEiji Ota type = rdsv3_message_next_extension(hdr, &pos, &buffer, &len);
149c0dd49bdSEiji Ota if (type == RDSV3_EXTHDR_NONE)
150c0dd49bdSEiji Ota break;
151c0dd49bdSEiji Ota RDSV3_DPRINTF4("recv_incoming_exthdrs", "type %d", type);
152c0dd49bdSEiji Ota /* Process extension header here */
153c0dd49bdSEiji Ota switch (type) {
154c0dd49bdSEiji Ota case RDSV3_EXTHDR_RDMA:
155c0dd49bdSEiji Ota rdsv3_rdma_unuse(rs, ntohl(buffer.rdma.h_rdma_rkey),
156c0dd49bdSEiji Ota 0);
157c0dd49bdSEiji Ota break;
158c0dd49bdSEiji Ota
159c0dd49bdSEiji Ota case RDSV3_EXTHDR_RDMA_DEST:
160c0dd49bdSEiji Ota /*
161c0dd49bdSEiji Ota * We ignore the size for now. We could stash it
162c0dd49bdSEiji Ota * somewhere and use it for error checking.
163c0dd49bdSEiji Ota */
164c0dd49bdSEiji Ota inc->i_rdma_cookie = rdsv3_rdma_make_cookie(
165c0dd49bdSEiji Ota ntohl(buffer.rdma_dest.h_rdma_rkey),
166c0dd49bdSEiji Ota ntohl(buffer.rdma_dest.h_rdma_offset));
167c0dd49bdSEiji Ota
168c0dd49bdSEiji Ota break;
169c0dd49bdSEiji Ota }
170c0dd49bdSEiji Ota }
171c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recv_incoming_exthdrs", "Return");
172c0dd49bdSEiji Ota }
173c0dd49bdSEiji Ota
174c0dd49bdSEiji Ota /*
175c0dd49bdSEiji Ota * The transport must make sure that this is serialized against other
176c0dd49bdSEiji Ota * rx and conn reset on this specific conn.
177c0dd49bdSEiji Ota *
178c0dd49bdSEiji Ota * We currently assert that only one fragmented message will be sent
179c0dd49bdSEiji Ota * down a connection at a time. This lets us reassemble in the conn
180c0dd49bdSEiji Ota * instead of per-flow which means that we don't have to go digging through
181c0dd49bdSEiji Ota * flows to tear down partial reassembly progress on conn failure and
182c0dd49bdSEiji Ota * we save flow lookup and locking for each frag arrival. It does mean
183c0dd49bdSEiji Ota * that small messages will wait behind large ones. Fragmenting at all
184c0dd49bdSEiji Ota * is only to reduce the memory consumption of pre-posted buffers.
185c0dd49bdSEiji Ota *
186c0dd49bdSEiji Ota * The caller passes in saddr and daddr instead of us getting it from the
187c0dd49bdSEiji Ota * conn. This lets loopback, who only has one conn for both directions,
188c0dd49bdSEiji Ota * tell us which roles the addrs in the conn are playing for this message.
189c0dd49bdSEiji Ota */
190c0dd49bdSEiji Ota /* ARGSUSED */
191c0dd49bdSEiji Ota void
rdsv3_recv_incoming(struct rdsv3_connection * conn,uint32_be_t saddr,uint32_be_t daddr,struct rdsv3_incoming * inc,int gfp)192c0dd49bdSEiji Ota rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr,
193c0dd49bdSEiji Ota uint32_be_t daddr, struct rdsv3_incoming *inc, int gfp)
194c0dd49bdSEiji Ota {
195c0dd49bdSEiji Ota struct rdsv3_sock *rs = NULL;
196c0dd49bdSEiji Ota struct rsock *sk;
197c0dd49bdSEiji Ota
198c0dd49bdSEiji Ota inc->i_conn = conn;
199c0dd49bdSEiji Ota inc->i_rx_jiffies = jiffies;
200c0dd49bdSEiji Ota
201c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_incoming",
202c0dd49bdSEiji Ota "conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
203c0dd49bdSEiji Ota "flags 0x%x rx_jiffies %lu", conn,
204c0dd49bdSEiji Ota (unsigned long long)conn->c_next_rx_seq,
205c0dd49bdSEiji Ota inc,
206c0dd49bdSEiji Ota (unsigned long long)ntohll(inc->i_hdr.h_sequence),
207c0dd49bdSEiji Ota ntohl(inc->i_hdr.h_len),
208c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_sport),
209c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_dport),
210c0dd49bdSEiji Ota inc->i_hdr.h_flags,
211c0dd49bdSEiji Ota inc->i_rx_jiffies);
212c0dd49bdSEiji Ota
213c0dd49bdSEiji Ota /*
214c0dd49bdSEiji Ota * Sequence numbers should only increase. Messages get their
215c0dd49bdSEiji Ota * sequence number as they're queued in a sending conn. They
216c0dd49bdSEiji Ota * can be dropped, though, if the sending socket is closed before
217c0dd49bdSEiji Ota * they hit the wire. So sequence numbers can skip forward
218c0dd49bdSEiji Ota * under normal operation. They can also drop back in the conn
219c0dd49bdSEiji Ota * failover case as previously sent messages are resent down the
220c0dd49bdSEiji Ota * new instance of a conn. We drop those, otherwise we have
221c0dd49bdSEiji Ota * to assume that the next valid seq does not come after a
222c0dd49bdSEiji Ota * hole in the fragment stream.
223c0dd49bdSEiji Ota *
224c0dd49bdSEiji Ota * The headers don't give us a way to realize if fragments of
225c0dd49bdSEiji Ota * a message have been dropped. We assume that frags that arrive
226c0dd49bdSEiji Ota * to a flow are part of the current message on the flow that is
227c0dd49bdSEiji Ota * being reassembled. This means that senders can't drop messages
228c0dd49bdSEiji Ota * from the sending conn until all their frags are sent.
229c0dd49bdSEiji Ota *
230c0dd49bdSEiji Ota * XXX we could spend more on the wire to get more robust failure
231c0dd49bdSEiji Ota * detection, arguably worth it to avoid data corruption.
232c0dd49bdSEiji Ota */
233c0dd49bdSEiji Ota if (ntohll(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
234c0dd49bdSEiji Ota (inc->i_hdr.h_flags & RDSV3_FLAG_RETRANSMITTED)) {
235c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_old_seq);
236c0dd49bdSEiji Ota goto out;
237c0dd49bdSEiji Ota }
238c0dd49bdSEiji Ota conn->c_next_rx_seq = ntohll(inc->i_hdr.h_sequence) + 1;
239c0dd49bdSEiji Ota
240c0dd49bdSEiji Ota if (rdsv3_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
241c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_ping);
242c0dd49bdSEiji Ota (void) rdsv3_send_pong(conn, inc->i_hdr.h_sport);
243c0dd49bdSEiji Ota goto out;
244c0dd49bdSEiji Ota }
245c0dd49bdSEiji Ota
24680166370Sagiri rs = rdsv3_find_bound(conn, inc->i_hdr.h_dport);
2475d5562f5SEiji Ota if (!rs) {
248c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_no_sock);
249c0dd49bdSEiji Ota goto out;
250c0dd49bdSEiji Ota }
251c0dd49bdSEiji Ota
252c0dd49bdSEiji Ota /* Process extension headers */
253c0dd49bdSEiji Ota rdsv3_recv_incoming_exthdrs(inc, rs);
254c0dd49bdSEiji Ota
255c0dd49bdSEiji Ota /* We can be racing with rdsv3_release() which marks the socket dead. */
256c0dd49bdSEiji Ota sk = rdsv3_rs_to_sk(rs);
257c0dd49bdSEiji Ota
258c0dd49bdSEiji Ota /* serialize with rdsv3_release -> sock_orphan */
259c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER);
260c0dd49bdSEiji Ota if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD)) {
261c0dd49bdSEiji Ota int error, bytes;
262c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recv_incoming",
263c0dd49bdSEiji Ota "adding inc %p to rs %p's recv queue", inc, rs);
264c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_queued);
265c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
266c0dd49bdSEiji Ota ntohl(inc->i_hdr.h_len),
267c0dd49bdSEiji Ota inc->i_hdr.h_dport);
268c0dd49bdSEiji Ota rdsv3_inc_addref(inc);
269c0dd49bdSEiji Ota list_insert_tail(&rs->rs_recv_queue, inc);
270c0dd49bdSEiji Ota bytes = rs->rs_rcv_bytes;
271c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock);
272c0dd49bdSEiji Ota
273c0dd49bdSEiji Ota __rdsv3_wake_sk_sleep(sk);
274c0dd49bdSEiji Ota
275c0dd49bdSEiji Ota /* wake up anyone waiting in poll */
276c0dd49bdSEiji Ota sk->sk_upcalls->su_recv(sk->sk_upper_handle, NULL,
277c0dd49bdSEiji Ota bytes, 0, &error, NULL);
278c0dd49bdSEiji Ota if (error != 0) {
279c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_recv_incoming",
280c0dd49bdSEiji Ota "su_recv returned: %d", error);
281c0dd49bdSEiji Ota }
282c0dd49bdSEiji Ota } else {
283c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_drop_dead_sock);
284c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock);
285c0dd49bdSEiji Ota }
286c0dd49bdSEiji Ota
287c0dd49bdSEiji Ota out:
288c0dd49bdSEiji Ota if (rs)
289c0dd49bdSEiji Ota rdsv3_sock_put(rs);
290c0dd49bdSEiji Ota }
291c0dd49bdSEiji Ota
292c0dd49bdSEiji Ota /*
293c0dd49bdSEiji Ota * be very careful here. This is being called as the condition in
294c0dd49bdSEiji Ota * wait_event_*() needs to cope with being called many times.
295c0dd49bdSEiji Ota */
296c0dd49bdSEiji Ota static int
rdsv3_next_incoming(struct rdsv3_sock * rs,struct rdsv3_incoming ** inc)297c0dd49bdSEiji Ota rdsv3_next_incoming(struct rdsv3_sock *rs, struct rdsv3_incoming **inc)
298c0dd49bdSEiji Ota {
2995d5562f5SEiji Ota if (!*inc) {
300c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_READER);
301c0dd49bdSEiji Ota if (!list_is_empty(&rs->rs_recv_queue)) {
302c0dd49bdSEiji Ota *inc = list_head(&rs->rs_recv_queue);
303c0dd49bdSEiji Ota rdsv3_inc_addref(*inc);
304c0dd49bdSEiji Ota }
305c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock);
306c0dd49bdSEiji Ota }
307c0dd49bdSEiji Ota
308c0dd49bdSEiji Ota return (*inc != NULL);
309c0dd49bdSEiji Ota }
310c0dd49bdSEiji Ota
311c0dd49bdSEiji Ota static int
rdsv3_still_queued(struct rdsv3_sock * rs,struct rdsv3_incoming * inc,int drop)312c0dd49bdSEiji Ota rdsv3_still_queued(struct rdsv3_sock *rs, struct rdsv3_incoming *inc,
313c0dd49bdSEiji Ota int drop)
314c0dd49bdSEiji Ota {
315c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs);
316c0dd49bdSEiji Ota int ret = 0;
317c0dd49bdSEiji Ota
318c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_still_queued", "Enter rs: %p inc: %p drop: %d",
319c0dd49bdSEiji Ota rs, inc, drop);
320c0dd49bdSEiji Ota
321c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER);
322c0dd49bdSEiji Ota if (list_link_active(&inc->i_item)) {
323c0dd49bdSEiji Ota ret = 1;
324c0dd49bdSEiji Ota if (drop) {
325c0dd49bdSEiji Ota /* XXX make sure this i_conn is reliable */
326c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
327c0dd49bdSEiji Ota -ntohl(inc->i_hdr.h_len),
328c0dd49bdSEiji Ota inc->i_hdr.h_dport);
329c0dd49bdSEiji Ota list_remove_node(&inc->i_item);
330c0dd49bdSEiji Ota rdsv3_inc_put(inc);
331c0dd49bdSEiji Ota }
332c0dd49bdSEiji Ota }
333c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock);
334c0dd49bdSEiji Ota
335c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_still_queued",
336c0dd49bdSEiji Ota "inc %p rs %p still %d dropped %d", inc, rs, ret, drop);
337c0dd49bdSEiji Ota return (ret);
338c0dd49bdSEiji Ota }
339c0dd49bdSEiji Ota
340c0dd49bdSEiji Ota /*
341c0dd49bdSEiji Ota * Pull errors off the error queue.
342c0dd49bdSEiji Ota * If msghdr is NULL, we will just purge the error queue.
343c0dd49bdSEiji Ota */
344c0dd49bdSEiji Ota int
rdsv3_notify_queue_get(struct rdsv3_sock * rs,struct msghdr * msghdr)345c0dd49bdSEiji Ota rdsv3_notify_queue_get(struct rdsv3_sock *rs, struct msghdr *msghdr)
346c0dd49bdSEiji Ota {
347c0dd49bdSEiji Ota struct rdsv3_notifier *notifier;
348fe817b60SEiji Ota struct rds_rdma_notify cmsg;
349c0dd49bdSEiji Ota unsigned int count = 0, max_messages = ~0U;
350c0dd49bdSEiji Ota list_t copy;
351c0dd49bdSEiji Ota int err = 0;
352c0dd49bdSEiji Ota
353c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Enter(rs: %p)", rs);
354c0dd49bdSEiji Ota
355c0dd49bdSEiji Ota list_create(©, sizeof (struct rdsv3_notifier),
356c0dd49bdSEiji Ota offsetof(struct rdsv3_notifier, n_list));
357c0dd49bdSEiji Ota
358c0dd49bdSEiji Ota
359c0dd49bdSEiji Ota /*
360c0dd49bdSEiji Ota * put_cmsg copies to user space and thus may sleep. We can't do this
361c0dd49bdSEiji Ota * with rs_lock held, so first grab as many notifications as we can
362c0dd49bdSEiji Ota * stuff
363c0dd49bdSEiji Ota * in the user provided cmsg buffer. We don't try to copy more, to avoid
364c0dd49bdSEiji Ota * losing notifications - except when the buffer is so small that
365c0dd49bdSEiji Ota * it wouldn't
366*48bbca81SDaniel Hoffman * even hold a single notification. Then we give as much of this
367c0dd49bdSEiji Ota * single
368c0dd49bdSEiji Ota * msg as we can squeeze in, and set MSG_CTRUNC.
369c0dd49bdSEiji Ota */
370c0dd49bdSEiji Ota if (msghdr) {
371c0dd49bdSEiji Ota max_messages =
372c0dd49bdSEiji Ota msghdr->msg_controllen / CMSG_SPACE(sizeof (cmsg));
373c0dd49bdSEiji Ota if (!max_messages)
374c0dd49bdSEiji Ota max_messages = 1;
375c0dd49bdSEiji Ota }
376c0dd49bdSEiji Ota
377c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock);
378c0dd49bdSEiji Ota while (!list_is_empty(&rs->rs_notify_queue) && count < max_messages) {
379c0dd49bdSEiji Ota notifier = list_remove_head(&rs->rs_notify_queue);
380c0dd49bdSEiji Ota list_insert_tail(©, notifier);
381c0dd49bdSEiji Ota count++;
382c0dd49bdSEiji Ota }
383c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock);
384c0dd49bdSEiji Ota
385c0dd49bdSEiji Ota if (!count)
386c0dd49bdSEiji Ota return (0);
387c0dd49bdSEiji Ota
388c0dd49bdSEiji Ota while (!list_is_empty(©)) {
389c0dd49bdSEiji Ota notifier = list_remove_head(©);
390c0dd49bdSEiji Ota
391c0dd49bdSEiji Ota if (msghdr) {
392c0dd49bdSEiji Ota cmsg.user_token = notifier->n_user_token;
393c0dd49bdSEiji Ota cmsg.status = notifier->n_status;
394c0dd49bdSEiji Ota
395c0dd49bdSEiji Ota err = rdsv3_put_cmsg(msghdr, SOL_RDS,
396fe817b60SEiji Ota RDS_CMSG_RDMA_STATUS, sizeof (cmsg), &cmsg);
397c0dd49bdSEiji Ota if (err)
398c0dd49bdSEiji Ota break;
399c0dd49bdSEiji Ota }
400c0dd49bdSEiji Ota
401c0dd49bdSEiji Ota kmem_free(notifier, sizeof (struct rdsv3_notifier));
402c0dd49bdSEiji Ota }
403c0dd49bdSEiji Ota
404c0dd49bdSEiji Ota /*
405c0dd49bdSEiji Ota * If we bailed out because of an error in put_cmsg,
406c0dd49bdSEiji Ota * we may be left with one or more notifications that we
407c0dd49bdSEiji Ota * didn't process. Return them to the head of the list.
408c0dd49bdSEiji Ota */
409c0dd49bdSEiji Ota if (!list_is_empty(©)) {
410c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock);
411c0dd49bdSEiji Ota list_splice(©, &rs->rs_notify_queue);
412c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock);
413c0dd49bdSEiji Ota }
414c0dd49bdSEiji Ota
415c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_notify_queue_get", "Return(rs: %p)", rs);
416c0dd49bdSEiji Ota
417c0dd49bdSEiji Ota return (err);
418c0dd49bdSEiji Ota }
419c0dd49bdSEiji Ota
420c0dd49bdSEiji Ota /*
421c0dd49bdSEiji Ota * Queue a congestion notification
422c0dd49bdSEiji Ota */
423c0dd49bdSEiji Ota static int
rdsv3_notify_cong(struct rdsv3_sock * rs,struct msghdr * msghdr)424c0dd49bdSEiji Ota rdsv3_notify_cong(struct rdsv3_sock *rs, struct msghdr *msghdr)
425c0dd49bdSEiji Ota {
426c0dd49bdSEiji Ota uint64_t notify = rs->rs_cong_notify;
427c0dd49bdSEiji Ota int err;
428c0dd49bdSEiji Ota
429fe817b60SEiji Ota err = rdsv3_put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
430c0dd49bdSEiji Ota sizeof (notify), ¬ify);
431c0dd49bdSEiji Ota if (err)
432c0dd49bdSEiji Ota return (err);
433c0dd49bdSEiji Ota
434c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock);
435c0dd49bdSEiji Ota rs->rs_cong_notify &= ~notify;
436c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock);
437c0dd49bdSEiji Ota
438c0dd49bdSEiji Ota return (0);
439c0dd49bdSEiji Ota }
440c0dd49bdSEiji Ota
441c0dd49bdSEiji Ota /*
442c0dd49bdSEiji Ota * Receive any control messages.
443c0dd49bdSEiji Ota */
444c0dd49bdSEiji Ota static int
rdsv3_cmsg_recv(struct rdsv3_incoming * inc,struct msghdr * msg)445c0dd49bdSEiji Ota rdsv3_cmsg_recv(struct rdsv3_incoming *inc, struct msghdr *msg)
446c0dd49bdSEiji Ota {
4479b3d509cSEiji Ota int ret = 0;
4489b3d509cSEiji Ota if (inc->i_rdma_cookie) {
4499b3d509cSEiji Ota ret = rdsv3_put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
4509b3d509cSEiji Ota sizeof (inc->i_rdma_cookie), &inc->i_rdma_cookie);
4519b3d509cSEiji Ota }
4529b3d509cSEiji Ota return (ret);
453c0dd49bdSEiji Ota }
454c0dd49bdSEiji Ota
455c0dd49bdSEiji Ota int
rdsv3_recvmsg(struct rdsv3_sock * rs,uio_t * uio,struct nmsghdr * msg,size_t size,int msg_flags)456c0dd49bdSEiji Ota rdsv3_recvmsg(struct rdsv3_sock *rs, uio_t *uio,
457c0dd49bdSEiji Ota struct nmsghdr *msg, size_t size, int msg_flags)
458c0dd49bdSEiji Ota {
459c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs);
460b27516f5Sagiri int ret = 0;
461c0dd49bdSEiji Ota struct sockaddr_in *sin = NULL;
462c0dd49bdSEiji Ota struct rdsv3_incoming *inc = NULL;
463b27516f5Sagiri boolean_t nonblock = B_FALSE;
464c0dd49bdSEiji Ota
465c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recvmsg",
466c0dd49bdSEiji Ota "Enter(rs: %p size: %d msg_flags: 0x%x)", rs, size, msg_flags);
467c0dd49bdSEiji Ota
468b27516f5Sagiri if ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) ||
469b27516f5Sagiri (msg_flags & MSG_DONTWAIT))
470b27516f5Sagiri nonblock = B_TRUE;
471b27516f5Sagiri
472c0dd49bdSEiji Ota if (msg_flags & MSG_OOB)
473c0dd49bdSEiji Ota goto out;
474c0dd49bdSEiji Ota
475c0dd49bdSEiji Ota /* mark the first cmsg position */
476c0dd49bdSEiji Ota if (msg) {
477c0dd49bdSEiji Ota msg->msg_control = NULL;
478c0dd49bdSEiji Ota }
479c0dd49bdSEiji Ota
480c0dd49bdSEiji Ota while (1) {
481c0dd49bdSEiji Ota /*
482c0dd49bdSEiji Ota * If there are pending notifications, do those -
483c0dd49bdSEiji Ota * and nothing else
484c0dd49bdSEiji Ota */
485c0dd49bdSEiji Ota if (!list_is_empty(&rs->rs_notify_queue)) {
486c0dd49bdSEiji Ota ret = rdsv3_notify_queue_get(rs, msg);
487c0dd49bdSEiji Ota
488c0dd49bdSEiji Ota if (msg && msg->msg_namelen) {
489c0dd49bdSEiji Ota sin = kmem_zalloc(sizeof (struct sockaddr_in),
490c0dd49bdSEiji Ota KM_SLEEP);
491c0dd49bdSEiji Ota sin->sin_family = AF_INET_OFFLOAD;
492c0dd49bdSEiji Ota if (inc) {
493c0dd49bdSEiji Ota sin->sin_port = inc->i_hdr.h_sport;
494c0dd49bdSEiji Ota sin->sin_addr.s_addr = inc->i_saddr;
495c0dd49bdSEiji Ota }
496c0dd49bdSEiji Ota msg->msg_namelen = sizeof (struct sockaddr_in);
497c0dd49bdSEiji Ota msg->msg_name = sin;
498c0dd49bdSEiji Ota }
499c0dd49bdSEiji Ota break;
500c0dd49bdSEiji Ota }
501c0dd49bdSEiji Ota
502c0dd49bdSEiji Ota if (rs->rs_cong_notify) {
503c0dd49bdSEiji Ota ret = rdsv3_notify_cong(rs, msg);
504c0dd49bdSEiji Ota goto out;
505c0dd49bdSEiji Ota }
506c0dd49bdSEiji Ota
507c0dd49bdSEiji Ota if (!rdsv3_next_incoming(rs, &inc)) {
508c0dd49bdSEiji Ota if (nonblock) {
509c0dd49bdSEiji Ota ret = -EAGAIN;
510c0dd49bdSEiji Ota break;
511c0dd49bdSEiji Ota }
512c0dd49bdSEiji Ota
513c0dd49bdSEiji Ota RDSV3_DPRINTF3("rdsv3_recvmsg",
514c0dd49bdSEiji Ota "Before wait (rs: %p)", rs);
515c0dd49bdSEiji Ota
5166e18d381Sagiri #if 0
5176e18d381Sagiri ret = rdsv3_wait_sig(sk->sk_sleep,
5186e18d381Sagiri !(list_is_empty(&rs->rs_notify_queue) &&
5196e18d381Sagiri !rs->rs_cong_notify &&
5206e18d381Sagiri !rdsv3_next_incoming(rs, &inc)));
5216e18d381Sagiri if (ret == 0) {
5226e18d381Sagiri /* signal/timeout pending */
5236e18d381Sagiri RDSV3_DPRINTF2("rdsv3_recvmsg",
5246e18d381Sagiri "woke due to signal");
5256e18d381Sagiri ret = -ERESTART;
5266e18d381Sagiri }
5276e18d381Sagiri #else
528c0dd49bdSEiji Ota mutex_enter(&sk->sk_sleep->waitq_mutex);
5296e18d381Sagiri sk->sk_sleep->waitq_waiters++;
530c0dd49bdSEiji Ota while ((list_is_empty(&rs->rs_notify_queue) &&
531c0dd49bdSEiji Ota !rs->rs_cong_notify &&
532c0dd49bdSEiji Ota !rdsv3_next_incoming(rs, &inc))) {
533c0dd49bdSEiji Ota ret = cv_wait_sig(&sk->sk_sleep->waitq_cv,
534c0dd49bdSEiji Ota &sk->sk_sleep->waitq_mutex);
535c0dd49bdSEiji Ota if (ret == 0) {
536c0dd49bdSEiji Ota /* signal/timeout pending */
537c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_recvmsg",
538c0dd49bdSEiji Ota "woke due to signal");
5395e12ddadSEiji Ota ret = -EINTR;
540c0dd49bdSEiji Ota break;
541c0dd49bdSEiji Ota }
542c0dd49bdSEiji Ota }
5436e18d381Sagiri sk->sk_sleep->waitq_waiters--;
544c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex);
5456e18d381Sagiri #endif
546c0dd49bdSEiji Ota
547c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recvmsg",
548c0dd49bdSEiji Ota "recvmsg woke rs: %p inc %p ret %d",
549c0dd49bdSEiji Ota rs, inc, -ret);
550c0dd49bdSEiji Ota
551c0dd49bdSEiji Ota if (ret < 0)
552c0dd49bdSEiji Ota break;
553c0dd49bdSEiji Ota
554c0dd49bdSEiji Ota /*
555c0dd49bdSEiji Ota * if the wakeup was due to rs_notify_queue or
556c0dd49bdSEiji Ota * rs_cong_notify then we need to handle those first.
557c0dd49bdSEiji Ota */
558c0dd49bdSEiji Ota continue;
559c0dd49bdSEiji Ota }
560c0dd49bdSEiji Ota
561c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_recvmsg",
562c0dd49bdSEiji Ota "copying inc %p from %u.%u.%u.%u:%u to user", inc,
563c0dd49bdSEiji Ota NIPQUAD(inc->i_conn->c_faddr),
564c0dd49bdSEiji Ota ntohs(inc->i_hdr.h_sport));
565cadbfdc3SEiji Ota
566c0dd49bdSEiji Ota ret = inc->i_conn->c_trans->inc_copy_to_user(inc, uio, size);
567c0dd49bdSEiji Ota if (ret < 0)
568c0dd49bdSEiji Ota break;
569c0dd49bdSEiji Ota
570c0dd49bdSEiji Ota /*
571c0dd49bdSEiji Ota * if the message we just copied isn't at the head of the
572c0dd49bdSEiji Ota * recv queue then someone else raced us to return it, try
573c0dd49bdSEiji Ota * to get the next message.
574c0dd49bdSEiji Ota */
575c0dd49bdSEiji Ota if (!rdsv3_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
576c0dd49bdSEiji Ota rdsv3_inc_put(inc);
577c0dd49bdSEiji Ota inc = NULL;
578c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_deliver_raced);
579c0dd49bdSEiji Ota continue;
580c0dd49bdSEiji Ota }
581c0dd49bdSEiji Ota
582c0dd49bdSEiji Ota if (ret < ntohl(inc->i_hdr.h_len)) {
583c0dd49bdSEiji Ota if (msg_flags & MSG_TRUNC)
584c0dd49bdSEiji Ota ret = ntohl(inc->i_hdr.h_len);
585c0dd49bdSEiji Ota msg->msg_flags |= MSG_TRUNC;
586c0dd49bdSEiji Ota }
587c0dd49bdSEiji Ota
588c0dd49bdSEiji Ota if (rdsv3_cmsg_recv(inc, msg)) {
589c0dd49bdSEiji Ota ret = -EFAULT;
590c0dd49bdSEiji Ota goto out;
591c0dd49bdSEiji Ota }
592c0dd49bdSEiji Ota
593c0dd49bdSEiji Ota rdsv3_stats_inc(s_recv_delivered);
594c0dd49bdSEiji Ota
595c0dd49bdSEiji Ota if (msg->msg_namelen) {
596c0dd49bdSEiji Ota sin = kmem_alloc(sizeof (struct sockaddr_in), KM_SLEEP);
597c0dd49bdSEiji Ota sin->sin_family = AF_INET_OFFLOAD;
598c0dd49bdSEiji Ota sin->sin_port = inc->i_hdr.h_sport;
599c0dd49bdSEiji Ota sin->sin_addr.s_addr = inc->i_saddr;
600c0dd49bdSEiji Ota (void) memset(sin->sin_zero, 0,
601c0dd49bdSEiji Ota sizeof (sin->sin_zero));
602c0dd49bdSEiji Ota msg->msg_namelen = sizeof (struct sockaddr_in);
603c0dd49bdSEiji Ota msg->msg_name = sin;
604c0dd49bdSEiji Ota }
605c0dd49bdSEiji Ota break;
606c0dd49bdSEiji Ota }
607c0dd49bdSEiji Ota
608c0dd49bdSEiji Ota if (inc)
609c0dd49bdSEiji Ota rdsv3_inc_put(inc);
610c0dd49bdSEiji Ota
611c0dd49bdSEiji Ota out:
6129b3d509cSEiji Ota if (msg && msg->msg_control == NULL)
6139b3d509cSEiji Ota msg->msg_controllen = 0;
6149b3d509cSEiji Ota
615c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_recvmsg", "Return(rs: %p, ret: %d)", rs, ret);
616c0dd49bdSEiji Ota
617c0dd49bdSEiji Ota return (ret);
618c0dd49bdSEiji Ota }
619c0dd49bdSEiji Ota
620c0dd49bdSEiji Ota /*
621c0dd49bdSEiji Ota * The socket is being shut down and we're asked to drop messages that were
622c0dd49bdSEiji Ota * queued for recvmsg. The caller has unbound the socket so the receive path
623c0dd49bdSEiji Ota * won't queue any more incoming fragments or messages on the socket.
624c0dd49bdSEiji Ota */
625c0dd49bdSEiji Ota void
rdsv3_clear_recv_queue(struct rdsv3_sock * rs)626c0dd49bdSEiji Ota rdsv3_clear_recv_queue(struct rdsv3_sock *rs)
627c0dd49bdSEiji Ota {
628c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs);
629c0dd49bdSEiji Ota struct rdsv3_incoming *inc, *tmp;
630c0dd49bdSEiji Ota
631c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Enter(rs: %p)", rs);
632c0dd49bdSEiji Ota
633c0dd49bdSEiji Ota rw_enter(&rs->rs_recv_lock, RW_WRITER);
634c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(inc, tmp, &rs->rs_recv_queue, i_item) {
635c0dd49bdSEiji Ota rdsv3_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
636c0dd49bdSEiji Ota -ntohl(inc->i_hdr.h_len),
637c0dd49bdSEiji Ota inc->i_hdr.h_dport);
638c0dd49bdSEiji Ota list_remove_node(&inc->i_item);
639c0dd49bdSEiji Ota rdsv3_inc_put(inc);
640c0dd49bdSEiji Ota }
641c0dd49bdSEiji Ota rw_exit(&rs->rs_recv_lock);
642c0dd49bdSEiji Ota
643c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_clear_recv_queue", "Return(rs: %p)", rs);
644c0dd49bdSEiji Ota }
645c0dd49bdSEiji Ota
646c0dd49bdSEiji Ota /*
647c0dd49bdSEiji Ota * inc->i_saddr isn't used here because it is only set in the receive
648c0dd49bdSEiji Ota * path.
649c0dd49bdSEiji Ota */
650c0dd49bdSEiji Ota void
rdsv3_inc_info_copy(struct rdsv3_incoming * inc,struct rdsv3_info_iterator * iter,uint32_be_t saddr,uint32_be_t daddr,int flip)651c0dd49bdSEiji Ota rdsv3_inc_info_copy(struct rdsv3_incoming *inc,
652c0dd49bdSEiji Ota struct rdsv3_info_iterator *iter,
653c0dd49bdSEiji Ota uint32_be_t saddr, uint32_be_t daddr, int flip)
654c0dd49bdSEiji Ota {
655fe817b60SEiji Ota struct rds_info_message minfo;
656c0dd49bdSEiji Ota
657c0dd49bdSEiji Ota minfo.seq = ntohll(inc->i_hdr.h_sequence);
658c0dd49bdSEiji Ota minfo.len = ntohl(inc->i_hdr.h_len);
659c0dd49bdSEiji Ota
660c0dd49bdSEiji Ota if (flip) {
661c0dd49bdSEiji Ota minfo.laddr = daddr;
662c0dd49bdSEiji Ota minfo.faddr = saddr;
663c0dd49bdSEiji Ota minfo.lport = inc->i_hdr.h_dport;
664c0dd49bdSEiji Ota minfo.fport = inc->i_hdr.h_sport;
665c0dd49bdSEiji Ota } else {
666c0dd49bdSEiji Ota minfo.laddr = saddr;
667c0dd49bdSEiji Ota minfo.faddr = daddr;
668c0dd49bdSEiji Ota minfo.lport = inc->i_hdr.h_sport;
669c0dd49bdSEiji Ota minfo.fport = inc->i_hdr.h_dport;
670c0dd49bdSEiji Ota }
671c0dd49bdSEiji Ota
672c0dd49bdSEiji Ota rdsv3_info_copy(iter, &minfo, sizeof (minfo));
673c0dd49bdSEiji Ota }
674