/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file connection.c
 * Oracle elects to have and use the contents of connection.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/loop.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

52c0dd49bdSEiji Ota /* converting this to RCU is a chore for another day.. */
53c0dd49bdSEiji Ota static krwlock_t rdsv3_conn_lock;
54c0dd49bdSEiji Ota struct avl_tree rdsv3_conn_hash;
55c0dd49bdSEiji Ota static struct kmem_cache *rdsv3_conn_slab = NULL;
56c0dd49bdSEiji Ota
/*
 * OR RDS_INFO_CONNECTION_FLAG_<suffix> into 'var' when 'test' is non-zero;
 * 'var' is left untouched otherwise.  The do/while (0) wrapper lets the
 * macro be used as a single statement (e.g. in an unbraced if/else).
 */
#define	rdsv3_conn_info_set(var, test, suffix) do {		\
	if (test)						\
		(var) |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
} while (0)


63c0dd49bdSEiji Ota static struct rdsv3_connection *
rdsv3_conn_lookup(uint32_be_t laddr,uint32_be_t faddr,avl_index_t * pos)64c0dd49bdSEiji Ota rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos)
65c0dd49bdSEiji Ota {
66c0dd49bdSEiji Ota struct rdsv3_connection *conn;
67c0dd49bdSEiji Ota struct rdsv3_conn_info_s conn_info;
68c0dd49bdSEiji Ota avl_index_t place = 0;
69c0dd49bdSEiji Ota
70c0dd49bdSEiji Ota conn_info.c_laddr = laddr;
71c0dd49bdSEiji Ota conn_info.c_faddr = faddr;
72c0dd49bdSEiji Ota
73c0dd49bdSEiji Ota conn = avl_find(&rdsv3_conn_hash, &conn_info, &place);
74c0dd49bdSEiji Ota
75c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_conn_lookup",
76c0dd49bdSEiji Ota "returning conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
77c0dd49bdSEiji Ota conn, NIPQUAD(laddr), NIPQUAD(faddr));
78c0dd49bdSEiji Ota
79c0dd49bdSEiji Ota if (pos != NULL)
80c0dd49bdSEiji Ota *pos = place;
81c0dd49bdSEiji Ota
82c0dd49bdSEiji Ota return (conn);
83c0dd49bdSEiji Ota }
84c0dd49bdSEiji Ota
85c0dd49bdSEiji Ota /*
86c0dd49bdSEiji Ota * This is called by transports as they're bringing down a connection.
87c0dd49bdSEiji Ota * It clears partial message state so that the transport can start sending
88c0dd49bdSEiji Ota * and receiving over this connection again in the future. It is up to
89c0dd49bdSEiji Ota * the transport to have serialized this call with its send and recv.
90c0dd49bdSEiji Ota */
91c0dd49bdSEiji Ota void
rdsv3_conn_reset(struct rdsv3_connection * conn)92c0dd49bdSEiji Ota rdsv3_conn_reset(struct rdsv3_connection *conn)
93c0dd49bdSEiji Ota {
94c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_conn_reset",
95c0dd49bdSEiji Ota "connection %u.%u.%u.%u to %u.%u.%u.%u reset",
96c0dd49bdSEiji Ota NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
97c0dd49bdSEiji Ota
98c0dd49bdSEiji Ota rdsv3_stats_inc(s_conn_reset);
99c0dd49bdSEiji Ota rdsv3_send_reset(conn);
100c0dd49bdSEiji Ota conn->c_flags = 0;
101c0dd49bdSEiji Ota
102c0dd49bdSEiji Ota /*
103c0dd49bdSEiji Ota * Do not clear next_rx_seq here, else we cannot distinguish
104c0dd49bdSEiji Ota * retransmitted packets from new packets, and will hand all
105c0dd49bdSEiji Ota * of them to the application. That is not consistent with the
106c0dd49bdSEiji Ota * reliability guarantees of RDS.
107c0dd49bdSEiji Ota */
108c0dd49bdSEiji Ota }
109c0dd49bdSEiji Ota
110c0dd49bdSEiji Ota /*
111c0dd49bdSEiji Ota * There is only every one 'conn' for a given pair of addresses in the
112c0dd49bdSEiji Ota * system at a time. They contain messages to be retransmitted and so
113c0dd49bdSEiji Ota * span the lifetime of the actual underlying transport connections.
114c0dd49bdSEiji Ota *
115c0dd49bdSEiji Ota * For now they are not garbage collected once they're created. They
116c0dd49bdSEiji Ota * are torn down as the module is removed, if ever.
117c0dd49bdSEiji Ota */
118c0dd49bdSEiji Ota static struct rdsv3_connection *
__rdsv3_conn_create(uint32_be_t laddr,uint32_be_t faddr,struct rdsv3_transport * trans,int gfp,int is_outgoing)119c0dd49bdSEiji Ota __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
1205d5562f5SEiji Ota struct rdsv3_transport *trans, int gfp, int is_outgoing)
121c0dd49bdSEiji Ota {
122c0dd49bdSEiji Ota struct rdsv3_connection *conn, *parent = NULL;
123c0dd49bdSEiji Ota avl_index_t pos;
124c0dd49bdSEiji Ota int ret;
125c0dd49bdSEiji Ota
126c0dd49bdSEiji Ota rw_enter(&rdsv3_conn_lock, RW_READER);
127c0dd49bdSEiji Ota conn = rdsv3_conn_lookup(laddr, faddr, &pos);
128c0dd49bdSEiji Ota if (conn &&
129c0dd49bdSEiji Ota conn->c_loopback &&
130c0dd49bdSEiji Ota conn->c_trans != &rdsv3_loop_transport &&
131c0dd49bdSEiji Ota !is_outgoing) {
132c0dd49bdSEiji Ota /*
133c0dd49bdSEiji Ota * This is a looped back IB connection, and we're
134c0dd49bdSEiji Ota * called by the code handling the incoming connect.
135c0dd49bdSEiji Ota * We need a second connection object into which we
136c0dd49bdSEiji Ota * can stick the other QP.
137c0dd49bdSEiji Ota */
138c0dd49bdSEiji Ota parent = conn;
139c0dd49bdSEiji Ota conn = parent->c_passive;
140c0dd49bdSEiji Ota }
141c0dd49bdSEiji Ota rw_exit(&rdsv3_conn_lock);
142c0dd49bdSEiji Ota if (conn)
143c0dd49bdSEiji Ota goto out;
144c0dd49bdSEiji Ota
145c0dd49bdSEiji Ota RDSV3_DPRINTF2("__rdsv3_conn_create", "Enter(%x -> %x)",
146c0dd49bdSEiji Ota ntohl(laddr), ntohl(faddr));
147c0dd49bdSEiji Ota
148c0dd49bdSEiji Ota conn = kmem_cache_alloc(rdsv3_conn_slab, gfp);
1495d5562f5SEiji Ota if (!conn) {
150c0dd49bdSEiji Ota conn = ERR_PTR(-ENOMEM);
151c0dd49bdSEiji Ota goto out;
152c0dd49bdSEiji Ota }
153c0dd49bdSEiji Ota
154c0dd49bdSEiji Ota /* see rdsv3_conn_constructor */
155c0dd49bdSEiji Ota conn->c_laddr = laddr;
156c0dd49bdSEiji Ota conn->c_faddr = faddr;
157c0dd49bdSEiji Ota
15880166370Sagiri /*
15980166370Sagiri * We don't allow sockets to send messages without binding.
16080166370Sagiri * So, the IP address will already be there in the bind array.
16180166370Sagiri * Mostly, this is a readonly operation.
16280166370Sagiri * For now, passing GLOBAL_ZONEID.
16380166370Sagiri */
16480166370Sagiri conn->c_bucketp = rdsv3_find_ip_bucket(ntohl(laddr), GLOBAL_ZONEID);
16580166370Sagiri
166c0dd49bdSEiji Ota ret = rdsv3_cong_get_maps(conn);
167c0dd49bdSEiji Ota if (ret) {
168c0dd49bdSEiji Ota kmem_cache_free(rdsv3_conn_slab, conn);
169c0dd49bdSEiji Ota conn = ERR_PTR(ret);
170c0dd49bdSEiji Ota goto out;
171c0dd49bdSEiji Ota }
172c0dd49bdSEiji Ota
173c0dd49bdSEiji Ota /*
174c0dd49bdSEiji Ota * This is where a connection becomes loopback. If *any* RDS sockets
175c0dd49bdSEiji Ota * can bind to the destination address then we'd rather the messages
176c0dd49bdSEiji Ota * flow through loopback rather than either transport.
177c0dd49bdSEiji Ota */
178c0dd49bdSEiji Ota if (rdsv3_trans_get_preferred(faddr)) {
179c0dd49bdSEiji Ota conn->c_loopback = 1;
180c0dd49bdSEiji Ota if (is_outgoing && trans->t_prefer_loopback) {
181c0dd49bdSEiji Ota /*
182c0dd49bdSEiji Ota * "outgoing" connection - and the transport
183c0dd49bdSEiji Ota * says it wants the connection handled by the
184c0dd49bdSEiji Ota * loopback transport. This is what TCP does.
185c0dd49bdSEiji Ota */
186c0dd49bdSEiji Ota trans = &rdsv3_loop_transport;
187c0dd49bdSEiji Ota }
188c0dd49bdSEiji Ota }
189c0dd49bdSEiji Ota
190c0dd49bdSEiji Ota conn->c_trans = trans;
191c0dd49bdSEiji Ota
192c0dd49bdSEiji Ota ret = trans->conn_alloc(conn, gfp);
193c0dd49bdSEiji Ota if (ret) {
194c0dd49bdSEiji Ota kmem_cache_free(rdsv3_conn_slab, conn);
195c0dd49bdSEiji Ota conn = ERR_PTR(ret);
196c0dd49bdSEiji Ota goto out;
197c0dd49bdSEiji Ota }
198c0dd49bdSEiji Ota
199c0dd49bdSEiji Ota conn->c_state = RDSV3_CONN_DOWN;
200c0dd49bdSEiji Ota conn->c_reconnect_jiffies = 0;
201c0dd49bdSEiji Ota RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker);
202c0dd49bdSEiji Ota RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker);
203c0dd49bdSEiji Ota RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker);
2045d5562f5SEiji Ota RDSV3_INIT_DELAYED_WORK(&conn->c_reap_w, rdsv3_reaper_worker);
205c0dd49bdSEiji Ota RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker);
206c0dd49bdSEiji Ota mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL);
207c0dd49bdSEiji Ota conn->c_flags = 0;
208c0dd49bdSEiji Ota
209c0dd49bdSEiji Ota RDSV3_DPRINTF2("__rdsv3_conn_create",
210c0dd49bdSEiji Ota "allocated conn %p for %u.%u.%u.%u -> %u.%u.%u.%u over %s %s",
211c0dd49bdSEiji Ota conn, NIPQUAD(laddr), NIPQUAD(faddr),
212*f802f1c0SToomas Soome *trans->t_name != '\0' ? trans->t_name : "[unknown]",
213c0dd49bdSEiji Ota is_outgoing ? "(outgoing)" : "");
214c0dd49bdSEiji Ota
215c0dd49bdSEiji Ota /*
216c0dd49bdSEiji Ota * Since we ran without holding the conn lock, someone could
217c0dd49bdSEiji Ota * have created the same conn (either normal or passive) in the
218c0dd49bdSEiji Ota * interim. We check while holding the lock. If we won, we complete
219c0dd49bdSEiji Ota * init and return our conn. If we lost, we rollback and return the
220c0dd49bdSEiji Ota * other one.
221c0dd49bdSEiji Ota */
222c0dd49bdSEiji Ota rw_enter(&rdsv3_conn_lock, RW_WRITER);
223c0dd49bdSEiji Ota if (parent) {
224c0dd49bdSEiji Ota /* Creating passive conn */
225c0dd49bdSEiji Ota if (parent->c_passive) {
226c0dd49bdSEiji Ota trans->conn_free(conn->c_transport_data);
227c0dd49bdSEiji Ota kmem_cache_free(rdsv3_conn_slab, conn);
228c0dd49bdSEiji Ota conn = parent->c_passive;
229c0dd49bdSEiji Ota } else {
230c0dd49bdSEiji Ota parent->c_passive = conn;
231c0dd49bdSEiji Ota rdsv3_cong_add_conn(conn);
232c0dd49bdSEiji Ota }
233c0dd49bdSEiji Ota } else {
234c0dd49bdSEiji Ota /* Creating normal conn */
235c0dd49bdSEiji Ota struct rdsv3_connection *found;
236c0dd49bdSEiji Ota
237c0dd49bdSEiji Ota found = rdsv3_conn_lookup(laddr, faddr, &pos);
238c0dd49bdSEiji Ota if (found) {
239c0dd49bdSEiji Ota trans->conn_free(conn->c_transport_data);
240c0dd49bdSEiji Ota kmem_cache_free(rdsv3_conn_slab, conn);
241c0dd49bdSEiji Ota conn = found;
242c0dd49bdSEiji Ota } else {
243c0dd49bdSEiji Ota avl_insert(&rdsv3_conn_hash, conn, pos);
244c0dd49bdSEiji Ota rdsv3_cong_add_conn(conn);
2455d5562f5SEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w,
2465d5562f5SEiji Ota RDSV3_REAPER_WAIT_JIFFIES);
247c0dd49bdSEiji Ota }
248c0dd49bdSEiji Ota }
249c0dd49bdSEiji Ota
250c0dd49bdSEiji Ota rw_exit(&rdsv3_conn_lock);
251c0dd49bdSEiji Ota
252c0dd49bdSEiji Ota RDSV3_DPRINTF2("__rdsv3_conn_create", "Return(conn: %p)", conn);
253c0dd49bdSEiji Ota
254c0dd49bdSEiji Ota out:
255c0dd49bdSEiji Ota return (conn);
256c0dd49bdSEiji Ota }
257c0dd49bdSEiji Ota
258c0dd49bdSEiji Ota struct rdsv3_connection *
rdsv3_conn_create(uint32_be_t laddr,uint32_be_t faddr,struct rdsv3_transport * trans,int gfp)259c0dd49bdSEiji Ota rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
260c0dd49bdSEiji Ota struct rdsv3_transport *trans, int gfp)
261c0dd49bdSEiji Ota {
262c0dd49bdSEiji Ota return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 0));
263c0dd49bdSEiji Ota }
264c0dd49bdSEiji Ota
265c0dd49bdSEiji Ota struct rdsv3_connection *
rdsv3_conn_create_outgoing(uint32_be_t laddr,uint32_be_t faddr,struct rdsv3_transport * trans,int gfp)266c0dd49bdSEiji Ota rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr,
267c0dd49bdSEiji Ota struct rdsv3_transport *trans, int gfp)
268c0dd49bdSEiji Ota {
269c0dd49bdSEiji Ota return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1));
270c0dd49bdSEiji Ota }
271c0dd49bdSEiji Ota
2725d5562f5SEiji Ota extern struct avl_tree rdsv3_conn_hash;
2735d5562f5SEiji Ota
2745d5562f5SEiji Ota void
rdsv3_conn_shutdown(struct rdsv3_connection * conn)2755d5562f5SEiji Ota rdsv3_conn_shutdown(struct rdsv3_connection *conn)
2765d5562f5SEiji Ota {
2775d5562f5SEiji Ota RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Enter(conn: %p)", conn);
2785d5562f5SEiji Ota
2795d5562f5SEiji Ota /* shut it down unless it's down already */
2805d5562f5SEiji Ota if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) {
2815d5562f5SEiji Ota /*
2825d5562f5SEiji Ota * Quiesce the connection mgmt handlers before we start tearing
2835d5562f5SEiji Ota * things down. We don't hold the mutex for the entire
2845d5562f5SEiji Ota * duration of the shutdown operation, else we may be
2855d5562f5SEiji Ota * deadlocking with the CM handler. Instead, the CM event
2865d5562f5SEiji Ota * handler is supposed to check for state DISCONNECTING
2875d5562f5SEiji Ota */
2885d5562f5SEiji Ota mutex_enter(&conn->c_cm_lock);
2895d5562f5SEiji Ota if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP,
2905d5562f5SEiji Ota RDSV3_CONN_DISCONNECTING) &&
2915d5562f5SEiji Ota !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR,
2925d5562f5SEiji Ota RDSV3_CONN_DISCONNECTING)) {
2935d5562f5SEiji Ota RDSV3_DPRINTF2("rdsv3_conn_shutdown",
2945d5562f5SEiji Ota "shutdown called in state %d",
2955d5562f5SEiji Ota atomic_get(&conn->c_state));
2965d5562f5SEiji Ota rdsv3_conn_drop(conn);
2975d5562f5SEiji Ota mutex_exit(&conn->c_cm_lock);
2985d5562f5SEiji Ota return;
2995d5562f5SEiji Ota }
3005d5562f5SEiji Ota mutex_exit(&conn->c_cm_lock);
3015d5562f5SEiji Ota
3025d5562f5SEiji Ota /* verify everybody's out of rds_send_xmit() */
3035d5562f5SEiji Ota mutex_enter(&conn->c_send_lock);
3045d5562f5SEiji Ota while (atomic_get(&conn->c_senders)) {
3055d5562f5SEiji Ota mutex_exit(&conn->c_send_lock);
3065d5562f5SEiji Ota delay(1);
3075d5562f5SEiji Ota mutex_enter(&conn->c_send_lock);
3085d5562f5SEiji Ota }
3095d5562f5SEiji Ota
3105d5562f5SEiji Ota conn->c_trans->conn_shutdown(conn);
3115d5562f5SEiji Ota rdsv3_conn_reset(conn);
3125d5562f5SEiji Ota mutex_exit(&conn->c_send_lock);
3135d5562f5SEiji Ota
3145d5562f5SEiji Ota if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING,
3155d5562f5SEiji Ota RDSV3_CONN_DOWN)) {
3165d5562f5SEiji Ota /*
3175d5562f5SEiji Ota * This can happen - eg when we're in the middle of
3185d5562f5SEiji Ota * tearing down the connection, and someone unloads
3195d5562f5SEiji Ota * the rds module.
3205d5562f5SEiji Ota * Quite reproduceable with loopback connections.
3215d5562f5SEiji Ota * Mostly harmless.
3225d5562f5SEiji Ota */
3235d5562f5SEiji Ota #ifndef __lock_lint
3245d5562f5SEiji Ota RDSV3_DPRINTF2("rdsv3_conn_shutdown",
3255d5562f5SEiji Ota "failed to transition to state DOWN, "
3265d5562f5SEiji Ota "current statis is: %d",
3275d5562f5SEiji Ota atomic_get(&conn->c_state));
3285d5562f5SEiji Ota rdsv3_conn_drop(conn);
3295d5562f5SEiji Ota #endif
3305d5562f5SEiji Ota return;
3315d5562f5SEiji Ota }
3325d5562f5SEiji Ota }
3335d5562f5SEiji Ota
3345d5562f5SEiji Ota /*
3355d5562f5SEiji Ota * Then reconnect if it's still live.
3365d5562f5SEiji Ota * The passive side of an IB loopback connection is never added
3375d5562f5SEiji Ota * to the conn hash, so we never trigger a reconnect on this
3385d5562f5SEiji Ota * conn - the reconnect is always triggered by the active peer.
3395d5562f5SEiji Ota */
3405d5562f5SEiji Ota rdsv3_cancel_delayed_work(&conn->c_conn_w);
3415d5562f5SEiji Ota
3425d5562f5SEiji Ota {
3435d5562f5SEiji Ota struct rdsv3_conn_info_s conn_info;
3445d5562f5SEiji Ota
3455d5562f5SEiji Ota conn_info.c_laddr = conn->c_laddr;
3465d5562f5SEiji Ota conn_info.c_faddr = conn->c_faddr;
3475d5562f5SEiji Ota if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn)
3485d5562f5SEiji Ota rdsv3_queue_reconnect(conn);
3495d5562f5SEiji Ota }
3505d5562f5SEiji Ota RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Exit");
3515d5562f5SEiji Ota }
3525d5562f5SEiji Ota
3535d5562f5SEiji Ota /*
3545d5562f5SEiji Ota * Stop and free a connection.
3555d5562f5SEiji Ota */
356c0dd49bdSEiji Ota void
rdsv3_conn_destroy(struct rdsv3_connection * conn)357c0dd49bdSEiji Ota rdsv3_conn_destroy(struct rdsv3_connection *conn)
358c0dd49bdSEiji Ota {
359c0dd49bdSEiji Ota struct rdsv3_message *rm, *rtmp;
3605d5562f5SEiji Ota list_t to_be_dropped;
361c0dd49bdSEiji Ota
362c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_destroy",
363c0dd49bdSEiji Ota "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
364c0dd49bdSEiji Ota conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
365c0dd49bdSEiji Ota
366c0dd49bdSEiji Ota avl_remove(&rdsv3_conn_hash, conn);
367c0dd49bdSEiji Ota
3685d5562f5SEiji Ota rdsv3_cancel_delayed_work(&conn->c_reap_w);
369c0dd49bdSEiji Ota rdsv3_cancel_delayed_work(&conn->c_send_w);
370c0dd49bdSEiji Ota rdsv3_cancel_delayed_work(&conn->c_recv_w);
3715d5562f5SEiji Ota
3725d5562f5SEiji Ota rdsv3_conn_shutdown(conn);
373c0dd49bdSEiji Ota
374c0dd49bdSEiji Ota /* tear down queued messages */
3755d5562f5SEiji Ota
3765d5562f5SEiji Ota list_create(&to_be_dropped, sizeof (struct rdsv3_message),
3775d5562f5SEiji Ota offsetof(struct rdsv3_message, m_conn_item));
3785d5562f5SEiji Ota
3795d5562f5SEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_retrans, m_conn_item) {
3805d5562f5SEiji Ota list_remove_node(&rm->m_conn_item);
3815d5562f5SEiji Ota list_insert_tail(&to_be_dropped, rm);
3825d5562f5SEiji Ota }
3835d5562f5SEiji Ota
3845d5562f5SEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_send_queue,
385c0dd49bdSEiji Ota m_conn_item) {
386c0dd49bdSEiji Ota list_remove_node(&rm->m_conn_item);
3875d5562f5SEiji Ota list_insert_tail(&to_be_dropped, rm);
3885d5562f5SEiji Ota }
3895d5562f5SEiji Ota
3905d5562f5SEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &to_be_dropped, m_conn_item) {
3915d5562f5SEiji Ota clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags);
3925d5562f5SEiji Ota list_remove_node(&rm->m_conn_item);
393c0dd49bdSEiji Ota rdsv3_message_put(rm);
394c0dd49bdSEiji Ota }
3955d5562f5SEiji Ota
396c0dd49bdSEiji Ota if (conn->c_xmit_rm)
397c0dd49bdSEiji Ota rdsv3_message_put(conn->c_xmit_rm);
398c0dd49bdSEiji Ota
399c0dd49bdSEiji Ota conn->c_trans->conn_free(conn->c_transport_data);
400c0dd49bdSEiji Ota
401c0dd49bdSEiji Ota /*
402c0dd49bdSEiji Ota * The congestion maps aren't freed up here. They're
403c0dd49bdSEiji Ota * freed by rdsv3_cong_exit() after all the connections
404c0dd49bdSEiji Ota * have been freed.
405c0dd49bdSEiji Ota */
406c0dd49bdSEiji Ota rdsv3_cong_remove_conn(conn);
407c0dd49bdSEiji Ota
408c0dd49bdSEiji Ota ASSERT(list_is_empty(&conn->c_retrans));
409c0dd49bdSEiji Ota kmem_cache_free(rdsv3_conn_slab, conn);
410c0dd49bdSEiji Ota
411c0dd49bdSEiji Ota }
412c0dd49bdSEiji Ota
413c0dd49bdSEiji Ota /* ARGSUSED */
414c0dd49bdSEiji Ota static void
rdsv3_conn_message_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens,int want_send)415c0dd49bdSEiji Ota rdsv3_conn_message_info(struct rsock *sock, unsigned int len,
416c0dd49bdSEiji Ota struct rdsv3_info_iterator *iter,
417c0dd49bdSEiji Ota struct rdsv3_info_lengths *lens,
418c0dd49bdSEiji Ota int want_send)
419c0dd49bdSEiji Ota {
420c0dd49bdSEiji Ota struct list *list;
421c0dd49bdSEiji Ota struct rdsv3_connection *conn;
422c0dd49bdSEiji Ota struct rdsv3_message *rm;
423c0dd49bdSEiji Ota unsigned int total = 0;
424c0dd49bdSEiji Ota
425c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_message_info", "Enter");
426c0dd49bdSEiji Ota
427fe817b60SEiji Ota len /= sizeof (struct rds_info_message);
428c0dd49bdSEiji Ota
429c0dd49bdSEiji Ota rw_enter(&rdsv3_conn_lock, RW_READER);
430c0dd49bdSEiji Ota
431c0dd49bdSEiji Ota if (avl_is_empty(&rdsv3_conn_hash)) {
432c0dd49bdSEiji Ota /* no connections */
433c0dd49bdSEiji Ota rw_exit(&rdsv3_conn_lock);
434c0dd49bdSEiji Ota return;
435c0dd49bdSEiji Ota }
436c0dd49bdSEiji Ota
437c0dd49bdSEiji Ota conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash);
438c0dd49bdSEiji Ota
439c0dd49bdSEiji Ota do {
440c0dd49bdSEiji Ota if (want_send)
441c0dd49bdSEiji Ota list = &conn->c_send_queue;
442c0dd49bdSEiji Ota else
443c0dd49bdSEiji Ota list = &conn->c_retrans;
444c0dd49bdSEiji Ota
445c0dd49bdSEiji Ota mutex_enter(&conn->c_lock);
446c0dd49bdSEiji Ota
447c0dd49bdSEiji Ota /* XXX too lazy to maintain counts.. */
448c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE(rm, list, m_conn_item) {
449c0dd49bdSEiji Ota total++;
450c0dd49bdSEiji Ota if (total <= len)
451c0dd49bdSEiji Ota rdsv3_inc_info_copy(&rm->m_inc, iter,
452c0dd49bdSEiji Ota conn->c_laddr, conn->c_faddr, 0);
453c0dd49bdSEiji Ota }
454c0dd49bdSEiji Ota
455c0dd49bdSEiji Ota mutex_exit(&conn->c_lock);
456c0dd49bdSEiji Ota
457c0dd49bdSEiji Ota conn = AVL_NEXT(&rdsv3_conn_hash, conn);
458c0dd49bdSEiji Ota } while (conn != NULL);
459c0dd49bdSEiji Ota rw_exit(&rdsv3_conn_lock);
460c0dd49bdSEiji Ota
461c0dd49bdSEiji Ota lens->nr = total;
462fe817b60SEiji Ota lens->each = sizeof (struct rds_info_message);
463c0dd49bdSEiji Ota
464c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_message_info", "Return");
465c0dd49bdSEiji Ota }
466c0dd49bdSEiji Ota
/*
 * Info callback registered for RDS_INFO_SEND_MESSAGES in rdsv3_conn_init():
 * reports the messages on each connection's c_send_queue by calling
 * rdsv3_conn_message_info() with want_send == 1.
 */
static void
rdsv3_conn_message_info_send(struct rsock *sock, unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
	rdsv3_conn_message_info(sock, len, iter, lens, 1);
}

/*
 * Info callback registered for RDS_INFO_RETRANS_MESSAGES in
 * rdsv3_conn_init(): reports the messages on each connection's c_retrans
 * list by calling rdsv3_conn_message_info() with want_send == 0.
 */
static void
rdsv3_conn_message_info_retrans(struct rsock *sock,
    unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
	rdsv3_conn_message_info(sock, len, iter, lens, 0);
}

484c0dd49bdSEiji Ota /* ARGSUSED */
485c0dd49bdSEiji Ota void
rdsv3_for_each_conn_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens,int (* visitor)(struct rdsv3_connection *,void *),size_t item_len)486c0dd49bdSEiji Ota rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len,
487c0dd49bdSEiji Ota struct rdsv3_info_iterator *iter,
488c0dd49bdSEiji Ota struct rdsv3_info_lengths *lens,
489c0dd49bdSEiji Ota int (*visitor)(struct rdsv3_connection *, void *),
490c0dd49bdSEiji Ota size_t item_len)
491c0dd49bdSEiji Ota {
492b27516f5Sagiri uint8_t *buffer;
493c0dd49bdSEiji Ota struct rdsv3_connection *conn;
494c0dd49bdSEiji Ota
495c0dd49bdSEiji Ota rw_enter(&rdsv3_conn_lock, RW_READER);
496c0dd49bdSEiji Ota
497c0dd49bdSEiji Ota lens->nr = 0;
498c0dd49bdSEiji Ota lens->each = item_len;
499c0dd49bdSEiji Ota
500c0dd49bdSEiji Ota if (avl_is_empty(&rdsv3_conn_hash)) {
501c0dd49bdSEiji Ota /* no connections */
502c0dd49bdSEiji Ota rw_exit(&rdsv3_conn_lock);
503c0dd49bdSEiji Ota return;
504c0dd49bdSEiji Ota }
505c0dd49bdSEiji Ota
506b27516f5Sagiri /* allocate a little extra as this can get cast to a uint64_t */
507b27516f5Sagiri buffer = kmem_zalloc(item_len + 8, KM_SLEEP);
508b27516f5Sagiri
509c0dd49bdSEiji Ota conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash);
510c0dd49bdSEiji Ota
511c0dd49bdSEiji Ota do {
512c0dd49bdSEiji Ota /* XXX no c_lock usage.. */
513b27516f5Sagiri if (visitor(conn, buffer)) {
514b27516f5Sagiri /*
515b27516f5Sagiri * We copy as much as we can fit in the buffer,
516b27516f5Sagiri * but we count all items so that the caller
517b27516f5Sagiri * can resize the buffer.
518b27516f5Sagiri */
519b27516f5Sagiri if (len >= item_len) {
520b27516f5Sagiri RDSV3_DPRINTF4("rdsv3_for_each_conn_info",
521b27516f5Sagiri "buffer: %p iter: %p bytes: %d", buffer,
522b27516f5Sagiri iter->addr + iter->offset, item_len);
523b27516f5Sagiri rdsv3_info_copy(iter, buffer, item_len);
524b27516f5Sagiri len -= item_len;
525b27516f5Sagiri }
526b27516f5Sagiri lens->nr++;
527c0dd49bdSEiji Ota }
528c0dd49bdSEiji Ota conn = AVL_NEXT(&rdsv3_conn_hash, conn);
529c0dd49bdSEiji Ota } while (conn != NULL);
530c0dd49bdSEiji Ota rw_exit(&rdsv3_conn_lock);
531b27516f5Sagiri
532b27516f5Sagiri kmem_free(buffer, item_len + 8);
533c0dd49bdSEiji Ota }
534c0dd49bdSEiji Ota
535c0dd49bdSEiji Ota static int
rdsv3_conn_info_visitor(struct rdsv3_connection * conn,void * buffer)536c0dd49bdSEiji Ota rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer)
537c0dd49bdSEiji Ota {
538fe817b60SEiji Ota struct rds_info_connection *cinfo = buffer;
539c0dd49bdSEiji Ota
540c0dd49bdSEiji Ota cinfo->next_tx_seq = conn->c_next_tx_seq;
541c0dd49bdSEiji Ota cinfo->next_rx_seq = conn->c_next_rx_seq;
542c0dd49bdSEiji Ota cinfo->laddr = conn->c_laddr;
543c0dd49bdSEiji Ota cinfo->faddr = conn->c_faddr;
544c0dd49bdSEiji Ota (void) strncpy((char *)cinfo->transport, conn->c_trans->t_name,
545c0dd49bdSEiji Ota sizeof (cinfo->transport));
546c0dd49bdSEiji Ota cinfo->flags = 0;
547c0dd49bdSEiji Ota
548c0dd49bdSEiji Ota rdsv3_conn_info_set(cinfo->flags,
5495d5562f5SEiji Ota MUTEX_HELD(&conn->c_send_lock), SENDING);
5505d5562f5SEiji Ota
551c0dd49bdSEiji Ota /* XXX Future: return the state rather than these funky bits */
552c0dd49bdSEiji Ota rdsv3_conn_info_set(cinfo->flags,
553c0dd49bdSEiji Ota atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING,
554c0dd49bdSEiji Ota CONNECTING);
555c0dd49bdSEiji Ota rdsv3_conn_info_set(cinfo->flags,
556c0dd49bdSEiji Ota atomic_get(&conn->c_state) == RDSV3_CONN_UP,
557c0dd49bdSEiji Ota CONNECTED);
558c0dd49bdSEiji Ota return (1);
559c0dd49bdSEiji Ota }
560c0dd49bdSEiji Ota
561c0dd49bdSEiji Ota static void
rdsv3_conn_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)562c0dd49bdSEiji Ota rdsv3_conn_info(struct rsock *sock, unsigned int len,
563c0dd49bdSEiji Ota struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
564c0dd49bdSEiji Ota {
565c0dd49bdSEiji Ota rdsv3_for_each_conn_info(sock, len, iter, lens,
566fe817b60SEiji Ota rdsv3_conn_info_visitor, sizeof (struct rds_info_connection));
567c0dd49bdSEiji Ota }
568c0dd49bdSEiji Ota
569c0dd49bdSEiji Ota int
rdsv3_conn_init()570c0dd49bdSEiji Ota rdsv3_conn_init()
571c0dd49bdSEiji Ota {
572c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_init", "Enter");
573c0dd49bdSEiji Ota
574c0dd49bdSEiji Ota rdsv3_conn_slab = kmem_cache_create("rdsv3_connection",
575c0dd49bdSEiji Ota sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor,
576c0dd49bdSEiji Ota rdsv3_conn_destructor, NULL, NULL, NULL, 0);
5775d5562f5SEiji Ota if (!rdsv3_conn_slab) {
5786e18d381Sagiri RDSV3_DPRINTF2("rdsv3_conn_init",
579c0dd49bdSEiji Ota "kmem_cache_create(rdsv3_conn_slab) failed");
5805d5562f5SEiji Ota return (-ENOMEM);
581c0dd49bdSEiji Ota }
582c0dd49bdSEiji Ota
583c0dd49bdSEiji Ota avl_create(&rdsv3_conn_hash, rdsv3_conn_compare,
584c0dd49bdSEiji Ota sizeof (struct rdsv3_connection), offsetof(struct rdsv3_connection,
585c0dd49bdSEiji Ota c_hash_node));
586c0dd49bdSEiji Ota
587c0dd49bdSEiji Ota rw_init(&rdsv3_conn_lock, NULL, RW_DRIVER, NULL);
588c0dd49bdSEiji Ota
589c0dd49bdSEiji Ota rdsv3_loop_init();
590c0dd49bdSEiji Ota
591fe817b60SEiji Ota rdsv3_info_register_func(RDS_INFO_CONNECTIONS, rdsv3_conn_info);
592fe817b60SEiji Ota rdsv3_info_register_func(RDS_INFO_SEND_MESSAGES,
593c0dd49bdSEiji Ota rdsv3_conn_message_info_send);
594fe817b60SEiji Ota rdsv3_info_register_func(RDS_INFO_RETRANS_MESSAGES,
595c0dd49bdSEiji Ota rdsv3_conn_message_info_retrans);
596c0dd49bdSEiji Ota
597c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_init", "Return");
598c0dd49bdSEiji Ota
599c0dd49bdSEiji Ota return (0);
600c0dd49bdSEiji Ota }
601c0dd49bdSEiji Ota
602c0dd49bdSEiji Ota void
rdsv3_conn_exit()603c0dd49bdSEiji Ota rdsv3_conn_exit()
604c0dd49bdSEiji Ota {
605c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_exit", "Enter");
606c0dd49bdSEiji Ota
607c0dd49bdSEiji Ota rdsv3_loop_exit();
608c0dd49bdSEiji Ota
609c0dd49bdSEiji Ota rw_destroy(&rdsv3_conn_lock);
610c0dd49bdSEiji Ota avl_destroy(&rdsv3_conn_hash);
611c0dd49bdSEiji Ota
612c0dd49bdSEiji Ota ASSERT(rdsv3_conn_slab);
613c0dd49bdSEiji Ota kmem_cache_destroy(rdsv3_conn_slab);
614c0dd49bdSEiji Ota
615c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_conn_exit", "Return");
616c0dd49bdSEiji Ota }
617c0dd49bdSEiji Ota
618c0dd49bdSEiji Ota /*
619c0dd49bdSEiji Ota * Force a disconnect
620c0dd49bdSEiji Ota */
621c0dd49bdSEiji Ota void
rdsv3_conn_drop(struct rdsv3_connection * conn)622c0dd49bdSEiji Ota rdsv3_conn_drop(struct rdsv3_connection *conn)
623c0dd49bdSEiji Ota {
624c0dd49bdSEiji Ota conn->c_state = RDSV3_CONN_ERROR;
625c0dd49bdSEiji Ota rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
626c0dd49bdSEiji Ota }
627