1c0dd49bdSEiji Ota /*
216e76cddSagiri  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3c0dd49bdSEiji Ota  */
416e76cddSagiri 
5c0dd49bdSEiji Ota /*
616e76cddSagiri  * This file contains code imported from the OFED rds source file connection.c
716e76cddSagiri  * Oracle elects to have and use the contents of connection.c under and governed
816e76cddSagiri  * by the OpenIB.org BSD license (see below for full license text). However,
916e76cddSagiri  * the following notice accompanied the original version of this file:
10c0dd49bdSEiji Ota  */
11c0dd49bdSEiji Ota 
12c0dd49bdSEiji Ota /*
13c0dd49bdSEiji Ota  * Copyright (c) 2006 Oracle.  All rights reserved.
14c0dd49bdSEiji Ota  *
15c0dd49bdSEiji Ota  * This software is available to you under a choice of one of two
16c0dd49bdSEiji Ota  * licenses.  You may choose to be licensed under the terms of the GNU
17c0dd49bdSEiji Ota  * General Public License (GPL) Version 2, available from the file
18c0dd49bdSEiji Ota  * COPYING in the main directory of this source tree, or the
19c0dd49bdSEiji Ota  * OpenIB.org BSD license below:
20c0dd49bdSEiji Ota  *
21c0dd49bdSEiji Ota  *     Redistribution and use in source and binary forms, with or
22c0dd49bdSEiji Ota  *     without modification, are permitted provided that the following
23c0dd49bdSEiji Ota  *     conditions are met:
24c0dd49bdSEiji Ota  *
25c0dd49bdSEiji Ota  *      - Redistributions of source code must retain the above
26c0dd49bdSEiji Ota  *        copyright notice, this list of conditions and the following
27c0dd49bdSEiji Ota  *        disclaimer.
28c0dd49bdSEiji Ota  *
29c0dd49bdSEiji Ota  *      - Redistributions in binary form must reproduce the above
30c0dd49bdSEiji Ota  *        copyright notice, this list of conditions and the following
31c0dd49bdSEiji Ota  *        disclaimer in the documentation and/or other materials
32c0dd49bdSEiji Ota  *        provided with the distribution.
33c0dd49bdSEiji Ota  *
34c0dd49bdSEiji Ota  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35c0dd49bdSEiji Ota  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36c0dd49bdSEiji Ota  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37c0dd49bdSEiji Ota  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38c0dd49bdSEiji Ota  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39c0dd49bdSEiji Ota  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40c0dd49bdSEiji Ota  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41c0dd49bdSEiji Ota  * SOFTWARE.
42c0dd49bdSEiji Ota  *
43c0dd49bdSEiji Ota  */
44c0dd49bdSEiji Ota #include <sys/types.h>
45c0dd49bdSEiji Ota #include <sys/kmem.h>
46c0dd49bdSEiji Ota #include <sys/rds.h>
47c0dd49bdSEiji Ota 
48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h>
49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/loop.h>
50c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
51c0dd49bdSEiji Ota 
52c0dd49bdSEiji Ota /* converting this to RCU is a chore for another day.. */
53c0dd49bdSEiji Ota static krwlock_t rdsv3_conn_lock;
54c0dd49bdSEiji Ota struct avl_tree rdsv3_conn_hash;
55c0dd49bdSEiji Ota static struct kmem_cache *rdsv3_conn_slab = NULL;
56c0dd49bdSEiji Ota 
/*
 * OR the RDS_INFO_CONNECTION_FLAG_<suffix> bit into `var` when `test` is
 * true; leave `var` untouched otherwise.
 *
 * `var` is parenthesized so that any lvalue expression can be passed safely,
 * and the statement is wrapped in do/while (0) so the macro behaves like a
 * single statement after an un-braced if/else.
 */
#define	rdsv3_conn_info_set(var, test, suffix) do {			\
	if (test) {							\
		(var) |= RDS_INFO_CONNECTION_FLAG_##suffix;		\
	}								\
} while (0)
61c0dd49bdSEiji Ota 
62c0dd49bdSEiji Ota 
63c0dd49bdSEiji Ota static struct rdsv3_connection *
rdsv3_conn_lookup(uint32_be_t laddr,uint32_be_t faddr,avl_index_t * pos)64c0dd49bdSEiji Ota rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos)
65c0dd49bdSEiji Ota {
66c0dd49bdSEiji Ota 	struct rdsv3_connection *conn;
67c0dd49bdSEiji Ota 	struct rdsv3_conn_info_s conn_info;
68c0dd49bdSEiji Ota 	avl_index_t place = 0;
69c0dd49bdSEiji Ota 
70c0dd49bdSEiji Ota 	conn_info.c_laddr = laddr;
71c0dd49bdSEiji Ota 	conn_info.c_faddr = faddr;
72c0dd49bdSEiji Ota 
73c0dd49bdSEiji Ota 	conn = avl_find(&rdsv3_conn_hash, &conn_info, &place);
74c0dd49bdSEiji Ota 
75c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rdsv3_conn_lookup",
76c0dd49bdSEiji Ota 	    "returning conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
77c0dd49bdSEiji Ota 	    conn, NIPQUAD(laddr), NIPQUAD(faddr));
78c0dd49bdSEiji Ota 
79c0dd49bdSEiji Ota 	if (pos != NULL)
80c0dd49bdSEiji Ota 		*pos = place;
81c0dd49bdSEiji Ota 
82c0dd49bdSEiji Ota 	return (conn);
83c0dd49bdSEiji Ota }
84c0dd49bdSEiji Ota 
85c0dd49bdSEiji Ota /*
86c0dd49bdSEiji Ota  * This is called by transports as they're bringing down a connection.
87c0dd49bdSEiji Ota  * It clears partial message state so that the transport can start sending
88c0dd49bdSEiji Ota  * and receiving over this connection again in the future.  It is up to
89c0dd49bdSEiji Ota  * the transport to have serialized this call with its send and recv.
90c0dd49bdSEiji Ota  */
91c0dd49bdSEiji Ota void
rdsv3_conn_reset(struct rdsv3_connection * conn)92c0dd49bdSEiji Ota rdsv3_conn_reset(struct rdsv3_connection *conn)
93c0dd49bdSEiji Ota {
94c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("rdsv3_conn_reset",
95c0dd49bdSEiji Ota 	    "connection %u.%u.%u.%u to %u.%u.%u.%u reset",
96c0dd49bdSEiji Ota 	    NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
97c0dd49bdSEiji Ota 
98c0dd49bdSEiji Ota 	rdsv3_stats_inc(s_conn_reset);
99c0dd49bdSEiji Ota 	rdsv3_send_reset(conn);
100c0dd49bdSEiji Ota 	conn->c_flags = 0;
101c0dd49bdSEiji Ota 
102c0dd49bdSEiji Ota 	/*
103c0dd49bdSEiji Ota 	 * Do not clear next_rx_seq here, else we cannot distinguish
104c0dd49bdSEiji Ota 	 * retransmitted packets from new packets, and will hand all
105c0dd49bdSEiji Ota 	 * of them to the application. That is not consistent with the
106c0dd49bdSEiji Ota 	 * reliability guarantees of RDS.
107c0dd49bdSEiji Ota 	 */
108c0dd49bdSEiji Ota }
109c0dd49bdSEiji Ota 
110c0dd49bdSEiji Ota /*
111c0dd49bdSEiji Ota  * There is only every one 'conn' for a given pair of addresses in the
112c0dd49bdSEiji Ota  * system at a time.  They contain messages to be retransmitted and so
113c0dd49bdSEiji Ota  * span the lifetime of the actual underlying transport connections.
114c0dd49bdSEiji Ota  *
115c0dd49bdSEiji Ota  * For now they are not garbage collected once they're created.  They
116c0dd49bdSEiji Ota  * are torn down as the module is removed, if ever.
117c0dd49bdSEiji Ota  */
118c0dd49bdSEiji Ota static struct rdsv3_connection *
__rdsv3_conn_create(uint32_be_t laddr,uint32_be_t faddr,struct rdsv3_transport * trans,int gfp,int is_outgoing)119c0dd49bdSEiji Ota __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
1205d5562f5SEiji Ota     struct rdsv3_transport *trans, int gfp, int is_outgoing)
121c0dd49bdSEiji Ota {
122c0dd49bdSEiji Ota 	struct rdsv3_connection *conn, *parent = NULL;
123c0dd49bdSEiji Ota 	avl_index_t pos;
124c0dd49bdSEiji Ota 	int ret;
125c0dd49bdSEiji Ota 
126c0dd49bdSEiji Ota 	rw_enter(&rdsv3_conn_lock, RW_READER);
127c0dd49bdSEiji Ota 	conn = rdsv3_conn_lookup(laddr, faddr, &pos);
128c0dd49bdSEiji Ota 	if (conn &&
129c0dd49bdSEiji Ota 	    conn->c_loopback &&
130c0dd49bdSEiji Ota 	    conn->c_trans != &rdsv3_loop_transport &&
131c0dd49bdSEiji Ota 	    !is_outgoing) {
132c0dd49bdSEiji Ota 		/*
133c0dd49bdSEiji Ota 		 * This is a looped back IB connection, and we're
134c0dd49bdSEiji Ota 		 * called by the code handling the incoming connect.
135c0dd49bdSEiji Ota 		 * We need a second connection object into which we
136c0dd49bdSEiji Ota 		 * can stick the other QP.
137c0dd49bdSEiji Ota 		 */
138c0dd49bdSEiji Ota 		parent = conn;
139c0dd49bdSEiji Ota 		conn = parent->c_passive;
140c0dd49bdSEiji Ota 	}
141c0dd49bdSEiji Ota 	rw_exit(&rdsv3_conn_lock);
142c0dd49bdSEiji Ota 	if (conn)
143c0dd49bdSEiji Ota 		goto out;
144c0dd49bdSEiji Ota 
145c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("__rdsv3_conn_create", "Enter(%x -> %x)",
146c0dd49bdSEiji Ota 	    ntohl(laddr), ntohl(faddr));
147c0dd49bdSEiji Ota 
148c0dd49bdSEiji Ota 	conn = kmem_cache_alloc(rdsv3_conn_slab, gfp);
1495d5562f5SEiji Ota 	if (!conn) {
150c0dd49bdSEiji Ota 		conn = ERR_PTR(-ENOMEM);
151c0dd49bdSEiji Ota 		goto out;
152c0dd49bdSEiji Ota 	}
153c0dd49bdSEiji Ota 
154c0dd49bdSEiji Ota 	/* see rdsv3_conn_constructor */
155c0dd49bdSEiji Ota 	conn->c_laddr = laddr;
156c0dd49bdSEiji Ota 	conn->c_faddr = faddr;
157c0dd49bdSEiji Ota 
15880166370Sagiri 	/*
15980166370Sagiri 	 * We don't allow sockets to send messages without binding.
16080166370Sagiri 	 * So, the IP address will already be there in the bind array.
16180166370Sagiri 	 * Mostly, this is a readonly operation.
16280166370Sagiri 	 * For now, passing GLOBAL_ZONEID.
16380166370Sagiri 	 */
16480166370Sagiri 	conn->c_bucketp = rdsv3_find_ip_bucket(ntohl(laddr), GLOBAL_ZONEID);
16580166370Sagiri 
166c0dd49bdSEiji Ota 	ret = rdsv3_cong_get_maps(conn);
167c0dd49bdSEiji Ota 	if (ret) {
168c0dd49bdSEiji Ota 		kmem_cache_free(rdsv3_conn_slab, conn);
169c0dd49bdSEiji Ota 		conn = ERR_PTR(ret);
170c0dd49bdSEiji Ota 		goto out;
171c0dd49bdSEiji Ota 	}
172c0dd49bdSEiji Ota 
173c0dd49bdSEiji Ota 	/*
174c0dd49bdSEiji Ota 	 * This is where a connection becomes loopback.  If *any* RDS sockets
175c0dd49bdSEiji Ota 	 * can bind to the destination address then we'd rather the messages
176c0dd49bdSEiji Ota 	 * flow through loopback rather than either transport.
177c0dd49bdSEiji Ota 	 */
178c0dd49bdSEiji Ota 	if (rdsv3_trans_get_preferred(faddr)) {
179c0dd49bdSEiji Ota 		conn->c_loopback = 1;
180c0dd49bdSEiji Ota 		if (is_outgoing && trans->t_prefer_loopback) {
181c0dd49bdSEiji Ota 			/*
182c0dd49bdSEiji Ota 			 * "outgoing" connection - and the transport
183c0dd49bdSEiji Ota 			 * says it wants the connection handled by the
184c0dd49bdSEiji Ota 			 * loopback transport. This is what TCP does.
185c0dd49bdSEiji Ota 			 */
186c0dd49bdSEiji Ota 			trans = &rdsv3_loop_transport;
187c0dd49bdSEiji Ota 		}
188c0dd49bdSEiji Ota 	}
189c0dd49bdSEiji Ota 
190c0dd49bdSEiji Ota 	conn->c_trans = trans;
191c0dd49bdSEiji Ota 
192c0dd49bdSEiji Ota 	ret = trans->conn_alloc(conn, gfp);
193c0dd49bdSEiji Ota 	if (ret) {
194c0dd49bdSEiji Ota 		kmem_cache_free(rdsv3_conn_slab, conn);
195c0dd49bdSEiji Ota 		conn = ERR_PTR(ret);
196c0dd49bdSEiji Ota 		goto out;
197c0dd49bdSEiji Ota 	}
198c0dd49bdSEiji Ota 
199c0dd49bdSEiji Ota 	conn->c_state = RDSV3_CONN_DOWN;
200c0dd49bdSEiji Ota 	conn->c_reconnect_jiffies = 0;
201c0dd49bdSEiji Ota 	RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker);
202c0dd49bdSEiji Ota 	RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker);
203c0dd49bdSEiji Ota 	RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker);
2045d5562f5SEiji Ota 	RDSV3_INIT_DELAYED_WORK(&conn->c_reap_w, rdsv3_reaper_worker);
205c0dd49bdSEiji Ota 	RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker);
206c0dd49bdSEiji Ota 	mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL);
207c0dd49bdSEiji Ota 	conn->c_flags = 0;
208c0dd49bdSEiji Ota 
209c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("__rdsv3_conn_create",
210c0dd49bdSEiji Ota 	    "allocated conn %p for %u.%u.%u.%u -> %u.%u.%u.%u over %s %s",
211c0dd49bdSEiji Ota 	    conn, NIPQUAD(laddr), NIPQUAD(faddr),
212*f802f1c0SToomas Soome 	    *trans->t_name != '\0' ? trans->t_name : "[unknown]",
213c0dd49bdSEiji Ota 	    is_outgoing ? "(outgoing)" : "");
214c0dd49bdSEiji Ota 
215c0dd49bdSEiji Ota 	/*
216c0dd49bdSEiji Ota 	 * Since we ran without holding the conn lock, someone could
217c0dd49bdSEiji Ota 	 * have created the same conn (either normal or passive) in the
218c0dd49bdSEiji Ota 	 * interim. We check while holding the lock. If we won, we complete
219c0dd49bdSEiji Ota 	 * init and return our conn. If we lost, we rollback and return the
220c0dd49bdSEiji Ota 	 * other one.
221c0dd49bdSEiji Ota 	 */
222c0dd49bdSEiji Ota 	rw_enter(&rdsv3_conn_lock, RW_WRITER);
223c0dd49bdSEiji Ota 	if (parent) {
224c0dd49bdSEiji Ota 		/* Creating passive conn */
225c0dd49bdSEiji Ota 		if (parent->c_passive) {
226c0dd49bdSEiji Ota 			trans->conn_free(conn->c_transport_data);
227c0dd49bdSEiji Ota 			kmem_cache_free(rdsv3_conn_slab, conn);
228c0dd49bdSEiji Ota 			conn = parent->c_passive;
229c0dd49bdSEiji Ota 		} else {
230c0dd49bdSEiji Ota 			parent->c_passive = conn;
231c0dd49bdSEiji Ota 			rdsv3_cong_add_conn(conn);
232c0dd49bdSEiji Ota 		}
233c0dd49bdSEiji Ota 	} else {
234c0dd49bdSEiji Ota 		/* Creating normal conn */
235c0dd49bdSEiji Ota 		struct rdsv3_connection *found;
236c0dd49bdSEiji Ota 
237c0dd49bdSEiji Ota 		found = rdsv3_conn_lookup(laddr, faddr, &pos);
238c0dd49bdSEiji Ota 		if (found) {
239c0dd49bdSEiji Ota 			trans->conn_free(conn->c_transport_data);
240c0dd49bdSEiji Ota 			kmem_cache_free(rdsv3_conn_slab, conn);
241c0dd49bdSEiji Ota 			conn = found;
242c0dd49bdSEiji Ota 		} else {
243c0dd49bdSEiji Ota 			avl_insert(&rdsv3_conn_hash, conn, pos);
244c0dd49bdSEiji Ota 			rdsv3_cong_add_conn(conn);
2455d5562f5SEiji Ota 			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w,
2465d5562f5SEiji Ota 			    RDSV3_REAPER_WAIT_JIFFIES);
247c0dd49bdSEiji Ota 		}
248c0dd49bdSEiji Ota 	}
249c0dd49bdSEiji Ota 
250c0dd49bdSEiji Ota 	rw_exit(&rdsv3_conn_lock);
251c0dd49bdSEiji Ota 
252c0dd49bdSEiji Ota 	RDSV3_DPRINTF2("__rdsv3_conn_create", "Return(conn: %p)", conn);
253c0dd49bdSEiji Ota 
254c0dd49bdSEiji Ota out:
255c0dd49bdSEiji Ota 	return (conn);
256c0dd49bdSEiji Ota }
257c0dd49bdSEiji Ota 
258c0dd49bdSEiji Ota struct rdsv3_connection *
rdsv3_conn_create(uint32_be_t laddr,uint32_be_t faddr,struct rdsv3_transport * trans,int gfp)259c0dd49bdSEiji Ota rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr,
260c0dd49bdSEiji Ota     struct rdsv3_transport *trans, int gfp)
261c0dd49bdSEiji Ota {
262c0dd49bdSEiji Ota 	return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 0));
263c0dd49bdSEiji Ota }
264c0dd49bdSEiji Ota 
265c0dd49bdSEiji Ota struct rdsv3_connection *
rdsv3_conn_create_outgoing(uint32_be_t laddr,uint32_be_t faddr,struct rdsv3_transport * trans,int gfp)266c0dd49bdSEiji Ota rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr,
267c0dd49bdSEiji Ota     struct rdsv3_transport *trans, int gfp)
268c0dd49bdSEiji Ota {
269c0dd49bdSEiji Ota 	return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1));
270c0dd49bdSEiji Ota }
271c0dd49bdSEiji Ota 
2725d5562f5SEiji Ota extern struct avl_tree	rdsv3_conn_hash;
2735d5562f5SEiji Ota 
2745d5562f5SEiji Ota void
rdsv3_conn_shutdown(struct rdsv3_connection * conn)2755d5562f5SEiji Ota rdsv3_conn_shutdown(struct rdsv3_connection *conn)
2765d5562f5SEiji Ota {
2775d5562f5SEiji Ota 	RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Enter(conn: %p)", conn);
2785d5562f5SEiji Ota 
2795d5562f5SEiji Ota 	/* shut it down unless it's down already */
2805d5562f5SEiji Ota 	if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) {
2815d5562f5SEiji Ota 		/*
2825d5562f5SEiji Ota 		 * Quiesce the connection mgmt handlers before we start tearing
2835d5562f5SEiji Ota 		 * things down. We don't hold the mutex for the entire
2845d5562f5SEiji Ota 		 * duration of the shutdown operation, else we may be
2855d5562f5SEiji Ota 		 * deadlocking with the CM handler. Instead, the CM event
2865d5562f5SEiji Ota 		 * handler is supposed to check for state DISCONNECTING
2875d5562f5SEiji Ota 		 */
2885d5562f5SEiji Ota 		mutex_enter(&conn->c_cm_lock);
2895d5562f5SEiji Ota 		if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP,
2905d5562f5SEiji Ota 		    RDSV3_CONN_DISCONNECTING) &&
2915d5562f5SEiji Ota 		    !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR,
2925d5562f5SEiji Ota 		    RDSV3_CONN_DISCONNECTING)) {
2935d5562f5SEiji Ota 			RDSV3_DPRINTF2("rdsv3_conn_shutdown",
2945d5562f5SEiji Ota 			    "shutdown called in state %d",
2955d5562f5SEiji Ota 			    atomic_get(&conn->c_state));
2965d5562f5SEiji Ota 			rdsv3_conn_drop(conn);
2975d5562f5SEiji Ota 			mutex_exit(&conn->c_cm_lock);
2985d5562f5SEiji Ota 			return;
2995d5562f5SEiji Ota 		}
3005d5562f5SEiji Ota 		mutex_exit(&conn->c_cm_lock);
3015d5562f5SEiji Ota 
3025d5562f5SEiji Ota 		/* verify everybody's out of rds_send_xmit() */
3035d5562f5SEiji Ota 		mutex_enter(&conn->c_send_lock);
3045d5562f5SEiji Ota 		while (atomic_get(&conn->c_senders)) {
3055d5562f5SEiji Ota 			mutex_exit(&conn->c_send_lock);
3065d5562f5SEiji Ota 			delay(1);
3075d5562f5SEiji Ota 			mutex_enter(&conn->c_send_lock);
3085d5562f5SEiji Ota 		}
3095d5562f5SEiji Ota 
3105d5562f5SEiji Ota 		conn->c_trans->conn_shutdown(conn);
3115d5562f5SEiji Ota 		rdsv3_conn_reset(conn);
3125d5562f5SEiji Ota 		mutex_exit(&conn->c_send_lock);
3135d5562f5SEiji Ota 
3145d5562f5SEiji Ota 		if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING,
3155d5562f5SEiji Ota 		    RDSV3_CONN_DOWN)) {
3165d5562f5SEiji Ota 			/*
3175d5562f5SEiji Ota 			 * This can happen - eg when we're in the middle of
3185d5562f5SEiji Ota 			 * tearing down the connection, and someone unloads
3195d5562f5SEiji Ota 			 * the rds module.
3205d5562f5SEiji Ota 			 * Quite reproduceable with loopback connections.
3215d5562f5SEiji Ota 			 * Mostly harmless.
3225d5562f5SEiji Ota 			 */
3235d5562f5SEiji Ota #ifndef __lock_lint
3245d5562f5SEiji Ota 			RDSV3_DPRINTF2("rdsv3_conn_shutdown",
3255d5562f5SEiji Ota 			    "failed to transition to state DOWN, "
3265d5562f5SEiji Ota 			    "current statis is: %d",
3275d5562f5SEiji Ota 			    atomic_get(&conn->c_state));
3285d5562f5SEiji Ota 			rdsv3_conn_drop(conn);
3295d5562f5SEiji Ota #endif
3305d5562f5SEiji Ota 			return;
3315d5562f5SEiji Ota 		}
3325d5562f5SEiji Ota 	}
3335d5562f5SEiji Ota 
3345d5562f5SEiji Ota 	/*
3355d5562f5SEiji Ota 	 * Then reconnect if it's still live.
3365d5562f5SEiji Ota 	 * The passive side of an IB loopback connection is never added
3375d5562f5SEiji Ota 	 * to the conn hash, so we never trigger a reconnect on this
3385d5562f5SEiji Ota 	 * conn - the reconnect is always triggered by the active peer.
3395d5562f5SEiji Ota 	 */
3405d5562f5SEiji Ota 	rdsv3_cancel_delayed_work(&conn->c_conn_w);
3415d5562f5SEiji Ota 
3425d5562f5SEiji Ota 	{
3435d5562f5SEiji Ota 		struct rdsv3_conn_info_s conn_info;
3445d5562f5SEiji Ota 
3455d5562f5SEiji Ota 		conn_info.c_laddr = conn->c_laddr;
3465d5562f5SEiji Ota 		conn_info.c_faddr = conn->c_faddr;
3475d5562f5SEiji Ota 		if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn)
3485d5562f5SEiji Ota 			rdsv3_queue_reconnect(conn);
3495d5562f5SEiji Ota 	}
3505d5562f5SEiji Ota 	RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Exit");
3515d5562f5SEiji Ota }
3525d5562f5SEiji Ota 
3535d5562f5SEiji Ota /*
3545d5562f5SEiji Ota  * Stop and free a connection.
3555d5562f5SEiji Ota  */
356c0dd49bdSEiji Ota void
rdsv3_conn_destroy(struct rdsv3_connection * conn)357c0dd49bdSEiji Ota rdsv3_conn_destroy(struct rdsv3_connection *conn)
358c0dd49bdSEiji Ota {
359c0dd49bdSEiji Ota 	struct rdsv3_message *rm, *rtmp;
3605d5562f5SEiji Ota 	list_t to_be_dropped;
361c0dd49bdSEiji Ota 
362c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_destroy",
363c0dd49bdSEiji Ota 	    "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u",
364c0dd49bdSEiji Ota 	    conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
365c0dd49bdSEiji Ota 
366c0dd49bdSEiji Ota 	avl_remove(&rdsv3_conn_hash, conn);
367c0dd49bdSEiji Ota 
3685d5562f5SEiji Ota 	rdsv3_cancel_delayed_work(&conn->c_reap_w);
369c0dd49bdSEiji Ota 	rdsv3_cancel_delayed_work(&conn->c_send_w);
370c0dd49bdSEiji Ota 	rdsv3_cancel_delayed_work(&conn->c_recv_w);
3715d5562f5SEiji Ota 
3725d5562f5SEiji Ota 	rdsv3_conn_shutdown(conn);
373c0dd49bdSEiji Ota 
374c0dd49bdSEiji Ota 	/* tear down queued messages */
3755d5562f5SEiji Ota 
3765d5562f5SEiji Ota 	list_create(&to_be_dropped, sizeof (struct rdsv3_message),
3775d5562f5SEiji Ota 	    offsetof(struct rdsv3_message, m_conn_item));
3785d5562f5SEiji Ota 
3795d5562f5SEiji Ota 	RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_retrans, m_conn_item) {
3805d5562f5SEiji Ota 		list_remove_node(&rm->m_conn_item);
3815d5562f5SEiji Ota 		list_insert_tail(&to_be_dropped, rm);
3825d5562f5SEiji Ota 	}
3835d5562f5SEiji Ota 
3845d5562f5SEiji Ota 	RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_send_queue,
385c0dd49bdSEiji Ota 	    m_conn_item) {
386c0dd49bdSEiji Ota 		list_remove_node(&rm->m_conn_item);
3875d5562f5SEiji Ota 		list_insert_tail(&to_be_dropped, rm);
3885d5562f5SEiji Ota 	}
3895d5562f5SEiji Ota 
3905d5562f5SEiji Ota 	RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &to_be_dropped, m_conn_item) {
3915d5562f5SEiji Ota 		clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags);
3925d5562f5SEiji Ota 		list_remove_node(&rm->m_conn_item);
393c0dd49bdSEiji Ota 		rdsv3_message_put(rm);
394c0dd49bdSEiji Ota 	}
3955d5562f5SEiji Ota 
396c0dd49bdSEiji Ota 	if (conn->c_xmit_rm)
397c0dd49bdSEiji Ota 		rdsv3_message_put(conn->c_xmit_rm);
398c0dd49bdSEiji Ota 
399c0dd49bdSEiji Ota 	conn->c_trans->conn_free(conn->c_transport_data);
400c0dd49bdSEiji Ota 
401c0dd49bdSEiji Ota 	/*
402c0dd49bdSEiji Ota 	 * The congestion maps aren't freed up here.  They're
403c0dd49bdSEiji Ota 	 * freed by rdsv3_cong_exit() after all the connections
404c0dd49bdSEiji Ota 	 * have been freed.
405c0dd49bdSEiji Ota 	 */
406c0dd49bdSEiji Ota 	rdsv3_cong_remove_conn(conn);
407c0dd49bdSEiji Ota 
408c0dd49bdSEiji Ota 	ASSERT(list_is_empty(&conn->c_retrans));
409c0dd49bdSEiji Ota 	kmem_cache_free(rdsv3_conn_slab, conn);
410c0dd49bdSEiji Ota 
411c0dd49bdSEiji Ota }
412c0dd49bdSEiji Ota 
413c0dd49bdSEiji Ota /* ARGSUSED */
414c0dd49bdSEiji Ota static void
rdsv3_conn_message_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens,int want_send)415c0dd49bdSEiji Ota rdsv3_conn_message_info(struct rsock *sock, unsigned int len,
416c0dd49bdSEiji Ota     struct rdsv3_info_iterator *iter,
417c0dd49bdSEiji Ota     struct rdsv3_info_lengths *lens,
418c0dd49bdSEiji Ota     int want_send)
419c0dd49bdSEiji Ota {
420c0dd49bdSEiji Ota 	struct list *list;
421c0dd49bdSEiji Ota 	struct rdsv3_connection *conn;
422c0dd49bdSEiji Ota 	struct rdsv3_message *rm;
423c0dd49bdSEiji Ota 	unsigned int total = 0;
424c0dd49bdSEiji Ota 
425c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_message_info", "Enter");
426c0dd49bdSEiji Ota 
427fe817b60SEiji Ota 	len /= sizeof (struct rds_info_message);
428c0dd49bdSEiji Ota 
429c0dd49bdSEiji Ota 	rw_enter(&rdsv3_conn_lock, RW_READER);
430c0dd49bdSEiji Ota 
431c0dd49bdSEiji Ota 	if (avl_is_empty(&rdsv3_conn_hash)) {
432c0dd49bdSEiji Ota 		/* no connections */
433c0dd49bdSEiji Ota 		rw_exit(&rdsv3_conn_lock);
434c0dd49bdSEiji Ota 		return;
435c0dd49bdSEiji Ota 	}
436c0dd49bdSEiji Ota 
437c0dd49bdSEiji Ota 	conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash);
438c0dd49bdSEiji Ota 
439c0dd49bdSEiji Ota 	do {
440c0dd49bdSEiji Ota 		if (want_send)
441c0dd49bdSEiji Ota 			list = &conn->c_send_queue;
442c0dd49bdSEiji Ota 		else
443c0dd49bdSEiji Ota 			list = &conn->c_retrans;
444c0dd49bdSEiji Ota 
445c0dd49bdSEiji Ota 		mutex_enter(&conn->c_lock);
446c0dd49bdSEiji Ota 
447c0dd49bdSEiji Ota 		/* XXX too lazy to maintain counts.. */
448c0dd49bdSEiji Ota 		RDSV3_FOR_EACH_LIST_NODE(rm, list, m_conn_item) {
449c0dd49bdSEiji Ota 			total++;
450c0dd49bdSEiji Ota 			if (total <= len)
451c0dd49bdSEiji Ota 				rdsv3_inc_info_copy(&rm->m_inc, iter,
452c0dd49bdSEiji Ota 				    conn->c_laddr, conn->c_faddr, 0);
453c0dd49bdSEiji Ota 		}
454c0dd49bdSEiji Ota 
455c0dd49bdSEiji Ota 		mutex_exit(&conn->c_lock);
456c0dd49bdSEiji Ota 
457c0dd49bdSEiji Ota 		conn = AVL_NEXT(&rdsv3_conn_hash, conn);
458c0dd49bdSEiji Ota 	} while (conn != NULL);
459c0dd49bdSEiji Ota 	rw_exit(&rdsv3_conn_lock);
460c0dd49bdSEiji Ota 
461c0dd49bdSEiji Ota 	lens->nr = total;
462fe817b60SEiji Ota 	lens->each = sizeof (struct rds_info_message);
463c0dd49bdSEiji Ota 
464c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_message_info", "Return");
465c0dd49bdSEiji Ota }
466c0dd49bdSEiji Ota 
/*
 * Info callback registered for RDS_INFO_SEND_MESSAGES: reports the
 * messages on each connection's send queue.
 */
static void
rdsv3_conn_message_info_send(struct rsock *sock, unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
	const int want_send = 1;

	rdsv3_conn_message_info(sock, len, iter, lens, want_send);
}
474c0dd49bdSEiji Ota 
/*
 * Info callback registered for RDS_INFO_RETRANS_MESSAGES: reports the
 * messages on each connection's retransmit queue.
 */
static void
rdsv3_conn_message_info_retrans(struct rsock *sock,
    unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
	const int want_send = 0;

	rdsv3_conn_message_info(sock, len, iter, lens, want_send);
}
483c0dd49bdSEiji Ota 
484c0dd49bdSEiji Ota /* ARGSUSED */
485c0dd49bdSEiji Ota void
rdsv3_for_each_conn_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens,int (* visitor)(struct rdsv3_connection *,void *),size_t item_len)486c0dd49bdSEiji Ota rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len,
487c0dd49bdSEiji Ota     struct rdsv3_info_iterator *iter,
488c0dd49bdSEiji Ota     struct rdsv3_info_lengths *lens,
489c0dd49bdSEiji Ota     int (*visitor)(struct rdsv3_connection *, void *),
490c0dd49bdSEiji Ota     size_t item_len)
491c0dd49bdSEiji Ota {
492b27516f5Sagiri 	uint8_t *buffer;
493c0dd49bdSEiji Ota 	struct rdsv3_connection *conn;
494c0dd49bdSEiji Ota 
495c0dd49bdSEiji Ota 	rw_enter(&rdsv3_conn_lock, RW_READER);
496c0dd49bdSEiji Ota 
497c0dd49bdSEiji Ota 	lens->nr = 0;
498c0dd49bdSEiji Ota 	lens->each = item_len;
499c0dd49bdSEiji Ota 
500c0dd49bdSEiji Ota 	if (avl_is_empty(&rdsv3_conn_hash)) {
501c0dd49bdSEiji Ota 		/* no connections */
502c0dd49bdSEiji Ota 		rw_exit(&rdsv3_conn_lock);
503c0dd49bdSEiji Ota 		return;
504c0dd49bdSEiji Ota 	}
505c0dd49bdSEiji Ota 
506b27516f5Sagiri 	/* allocate a little extra as this can get cast to a uint64_t */
507b27516f5Sagiri 	buffer = kmem_zalloc(item_len + 8, KM_SLEEP);
508b27516f5Sagiri 
509c0dd49bdSEiji Ota 	conn = (struct rdsv3_connection *)avl_first(&rdsv3_conn_hash);
510c0dd49bdSEiji Ota 
511c0dd49bdSEiji Ota 	do {
512c0dd49bdSEiji Ota 		/* XXX no c_lock usage.. */
513b27516f5Sagiri 		if (visitor(conn, buffer)) {
514b27516f5Sagiri 			/*
515b27516f5Sagiri 			 * We copy as much as we can fit in the buffer,
516b27516f5Sagiri 			 * but we count all items so that the caller
517b27516f5Sagiri 			 * can resize the buffer.
518b27516f5Sagiri 			 */
519b27516f5Sagiri 			if (len >= item_len) {
520b27516f5Sagiri 				RDSV3_DPRINTF4("rdsv3_for_each_conn_info",
521b27516f5Sagiri 				    "buffer: %p iter: %p bytes: %d", buffer,
522b27516f5Sagiri 				    iter->addr + iter->offset, item_len);
523b27516f5Sagiri 				rdsv3_info_copy(iter, buffer, item_len);
524b27516f5Sagiri 				len -= item_len;
525b27516f5Sagiri 			}
526b27516f5Sagiri 			lens->nr++;
527c0dd49bdSEiji Ota 		}
528c0dd49bdSEiji Ota 		conn = AVL_NEXT(&rdsv3_conn_hash, conn);
529c0dd49bdSEiji Ota 	} while (conn != NULL);
530c0dd49bdSEiji Ota 	rw_exit(&rdsv3_conn_lock);
531b27516f5Sagiri 
532b27516f5Sagiri 	kmem_free(buffer, item_len + 8);
533c0dd49bdSEiji Ota }
534c0dd49bdSEiji Ota 
535c0dd49bdSEiji Ota static int
rdsv3_conn_info_visitor(struct rdsv3_connection * conn,void * buffer)536c0dd49bdSEiji Ota rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer)
537c0dd49bdSEiji Ota {
538fe817b60SEiji Ota 	struct rds_info_connection *cinfo = buffer;
539c0dd49bdSEiji Ota 
540c0dd49bdSEiji Ota 	cinfo->next_tx_seq = conn->c_next_tx_seq;
541c0dd49bdSEiji Ota 	cinfo->next_rx_seq = conn->c_next_rx_seq;
542c0dd49bdSEiji Ota 	cinfo->laddr = conn->c_laddr;
543c0dd49bdSEiji Ota 	cinfo->faddr = conn->c_faddr;
544c0dd49bdSEiji Ota 	(void) strncpy((char *)cinfo->transport, conn->c_trans->t_name,
545c0dd49bdSEiji Ota 	    sizeof (cinfo->transport));
546c0dd49bdSEiji Ota 	cinfo->flags = 0;
547c0dd49bdSEiji Ota 
548c0dd49bdSEiji Ota 	rdsv3_conn_info_set(cinfo->flags,
5495d5562f5SEiji Ota 	    MUTEX_HELD(&conn->c_send_lock), SENDING);
5505d5562f5SEiji Ota 
551c0dd49bdSEiji Ota 	/* XXX Future: return the state rather than these funky bits */
552c0dd49bdSEiji Ota 	rdsv3_conn_info_set(cinfo->flags,
553c0dd49bdSEiji Ota 	    atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING,
554c0dd49bdSEiji Ota 	    CONNECTING);
555c0dd49bdSEiji Ota 	rdsv3_conn_info_set(cinfo->flags,
556c0dd49bdSEiji Ota 	    atomic_get(&conn->c_state) == RDSV3_CONN_UP,
557c0dd49bdSEiji Ota 	    CONNECTED);
558c0dd49bdSEiji Ota 	return (1);
559c0dd49bdSEiji Ota }
560c0dd49bdSEiji Ota 
561c0dd49bdSEiji Ota static void
rdsv3_conn_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)562c0dd49bdSEiji Ota rdsv3_conn_info(struct rsock *sock, unsigned int len,
563c0dd49bdSEiji Ota     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
564c0dd49bdSEiji Ota {
565c0dd49bdSEiji Ota 	rdsv3_for_each_conn_info(sock, len, iter, lens,
566fe817b60SEiji Ota 	    rdsv3_conn_info_visitor, sizeof (struct rds_info_connection));
567c0dd49bdSEiji Ota }
568c0dd49bdSEiji Ota 
569c0dd49bdSEiji Ota int
rdsv3_conn_init()570c0dd49bdSEiji Ota rdsv3_conn_init()
571c0dd49bdSEiji Ota {
572c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_init", "Enter");
573c0dd49bdSEiji Ota 
574c0dd49bdSEiji Ota 	rdsv3_conn_slab = kmem_cache_create("rdsv3_connection",
575c0dd49bdSEiji Ota 	    sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor,
576c0dd49bdSEiji Ota 	    rdsv3_conn_destructor, NULL, NULL, NULL, 0);
5775d5562f5SEiji Ota 	if (!rdsv3_conn_slab) {
5786e18d381Sagiri 		RDSV3_DPRINTF2("rdsv3_conn_init",
579c0dd49bdSEiji Ota 		    "kmem_cache_create(rdsv3_conn_slab) failed");
5805d5562f5SEiji Ota 		return (-ENOMEM);
581c0dd49bdSEiji Ota 	}
582c0dd49bdSEiji Ota 
583c0dd49bdSEiji Ota 	avl_create(&rdsv3_conn_hash, rdsv3_conn_compare,
584c0dd49bdSEiji Ota 	    sizeof (struct rdsv3_connection), offsetof(struct rdsv3_connection,
585c0dd49bdSEiji Ota 	    c_hash_node));
586c0dd49bdSEiji Ota 
587c0dd49bdSEiji Ota 	rw_init(&rdsv3_conn_lock, NULL, RW_DRIVER, NULL);
588c0dd49bdSEiji Ota 
589c0dd49bdSEiji Ota 	rdsv3_loop_init();
590c0dd49bdSEiji Ota 
591fe817b60SEiji Ota 	rdsv3_info_register_func(RDS_INFO_CONNECTIONS, rdsv3_conn_info);
592fe817b60SEiji Ota 	rdsv3_info_register_func(RDS_INFO_SEND_MESSAGES,
593c0dd49bdSEiji Ota 	    rdsv3_conn_message_info_send);
594fe817b60SEiji Ota 	rdsv3_info_register_func(RDS_INFO_RETRANS_MESSAGES,
595c0dd49bdSEiji Ota 	    rdsv3_conn_message_info_retrans);
596c0dd49bdSEiji Ota 
597c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_init", "Return");
598c0dd49bdSEiji Ota 
599c0dd49bdSEiji Ota 	return (0);
600c0dd49bdSEiji Ota }
601c0dd49bdSEiji Ota 
602c0dd49bdSEiji Ota void
rdsv3_conn_exit()603c0dd49bdSEiji Ota rdsv3_conn_exit()
604c0dd49bdSEiji Ota {
605c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_exit", "Enter");
606c0dd49bdSEiji Ota 
607c0dd49bdSEiji Ota 	rdsv3_loop_exit();
608c0dd49bdSEiji Ota 
609c0dd49bdSEiji Ota 	rw_destroy(&rdsv3_conn_lock);
610c0dd49bdSEiji Ota 	avl_destroy(&rdsv3_conn_hash);
611c0dd49bdSEiji Ota 
612c0dd49bdSEiji Ota 	ASSERT(rdsv3_conn_slab);
613c0dd49bdSEiji Ota 	kmem_cache_destroy(rdsv3_conn_slab);
614c0dd49bdSEiji Ota 
615c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_conn_exit", "Return");
616c0dd49bdSEiji Ota }
617c0dd49bdSEiji Ota 
618c0dd49bdSEiji Ota /*
619c0dd49bdSEiji Ota  * Force a disconnect
620c0dd49bdSEiji Ota  */
621c0dd49bdSEiji Ota void
rdsv3_conn_drop(struct rdsv3_connection * conn)622c0dd49bdSEiji Ota rdsv3_conn_drop(struct rdsv3_connection *conn)
623c0dd49bdSEiji Ota {
624c0dd49bdSEiji Ota 	conn->c_state = RDSV3_CONN_ERROR;
625c0dd49bdSEiji Ota 	rdsv3_queue_work(rdsv3_wq, &conn->c_down_w);
626c0dd49bdSEiji Ota }
627