1c0dd49bdSEiji Ota /*
216e76cddSagiri  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3c0dd49bdSEiji Ota  */
416e76cddSagiri 
5c0dd49bdSEiji Ota /*
616e76cddSagiri  * This file contains code imported from the OFED rds source file cong.c
716e76cddSagiri  * Oracle elects to have and use the contents of cong.c under and governed
816e76cddSagiri  * by the OpenIB.org BSD license (see below for full license text). However,
916e76cddSagiri  * the following notice accompanied the original version of this file:
10c0dd49bdSEiji Ota  */
11c0dd49bdSEiji Ota 
1216e76cddSagiri 
13c0dd49bdSEiji Ota /*
14c0dd49bdSEiji Ota  * Copyright (c) 2007 Oracle.  All rights reserved.
15c0dd49bdSEiji Ota  *
16c0dd49bdSEiji Ota  * This software is available to you under a choice of one of two
17c0dd49bdSEiji Ota  * licenses.  You may choose to be licensed under the terms of the GNU
18c0dd49bdSEiji Ota  * General Public License (GPL) Version 2, available from the file
19c0dd49bdSEiji Ota  * COPYING in the main directory of this source tree, or the
20c0dd49bdSEiji Ota  * OpenIB.org BSD license below:
21c0dd49bdSEiji Ota  *
22c0dd49bdSEiji Ota  *     Redistribution and use in source and binary forms, with or
23c0dd49bdSEiji Ota  *     without modification, are permitted provided that the following
24c0dd49bdSEiji Ota  *     conditions are met:
25c0dd49bdSEiji Ota  *
26c0dd49bdSEiji Ota  *      - Redistributions of source code must retain the above
27c0dd49bdSEiji Ota  *        copyright notice, this list of conditions and the following
28c0dd49bdSEiji Ota  *        disclaimer.
29c0dd49bdSEiji Ota  *
30c0dd49bdSEiji Ota  *      - Redistributions in binary form must reproduce the above
31c0dd49bdSEiji Ota  *        copyright notice, this list of conditions and the following
32c0dd49bdSEiji Ota  *        disclaimer in the documentation and/or other materials
33c0dd49bdSEiji Ota  *        provided with the distribution.
34c0dd49bdSEiji Ota  *
35c0dd49bdSEiji Ota  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36c0dd49bdSEiji Ota  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37c0dd49bdSEiji Ota  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38c0dd49bdSEiji Ota  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39c0dd49bdSEiji Ota  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40c0dd49bdSEiji Ota  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41c0dd49bdSEiji Ota  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42c0dd49bdSEiji Ota  * SOFTWARE.
43c0dd49bdSEiji Ota  *
44c0dd49bdSEiji Ota  */
45c0dd49bdSEiji Ota #include <sys/rds.h>
46c0dd49bdSEiji Ota 
47c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h>
48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
50c0dd49bdSEiji Ota 
/*
 * This file implements the receive side of the unconventional congestion
 * management in RDS.
 *
 * Messages waiting in the receive queue on the receiving socket are accounted
 * against the socket's SO_RCVBUF option value.  Only the payload bytes in the
 * message are accounted for.  If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested.  All sends attempted to this socket's
 * address should either block or return -EWOULDBLOCK.
 *
 * Applications are expected to be reasonably tuned such that this situation
 * very rarely occurs.  An application encountering this "back-pressure" is
 * considered a bug.
 *
 * This is implemented by having each node maintain bitmaps which indicate
 * which ports on bound addresses are congested.  As the bitmap changes it is
 * sent through all the connections which terminate in the local address of the
 * bitmap which changed.
 *
 * The bitmaps are allocated as connections are brought up.  This avoids
 * allocation in the interrupt handling path which queues messages on sockets.
 * The dense bitmaps let transports send the entire bitmap on any bitmap change
 * reasonably efficiently.  This is much easier to implement than some
 * finer-grained communication of per-port congestion.  The sender does a very
 * inexpensive bit test to test if the port it's about to send to is congested
 * or not.
 */
78c0dd49bdSEiji Ota 
79c0dd49bdSEiji Ota /*
80c0dd49bdSEiji Ota  * Interaction with poll is a tad tricky. We want all processes stuck in
81c0dd49bdSEiji Ota  * poll to wake up and check whether a congested destination became uncongested.
82c0dd49bdSEiji Ota  * The really sad thing is we have no idea which destinations the application
83c0dd49bdSEiji Ota  * wants to send to - we don't even know which rdsv3_connections are involved.
84c0dd49bdSEiji Ota  * So until we implement a more flexible rds poll interface, we have to make
85c0dd49bdSEiji Ota  * do with this:
86c0dd49bdSEiji Ota  * We maintain a global counter that is incremented each time a congestion map
87c0dd49bdSEiji Ota  * update is received. Each rds socket tracks this value, and if rdsv3_poll
88c0dd49bdSEiji Ota  * finds that the saved generation number is smaller than the global generation
89c0dd49bdSEiji Ota  * number, it wakes up the process.
90c0dd49bdSEiji Ota  */
91c0dd49bdSEiji Ota static atomic_t		rdsv3_cong_generation = ATOMIC_INIT(0);
92c0dd49bdSEiji Ota 
93c0dd49bdSEiji Ota /*
94c0dd49bdSEiji Ota  * Congestion monitoring
95c0dd49bdSEiji Ota  */
96c0dd49bdSEiji Ota static struct list rdsv3_cong_monitor;
97c0dd49bdSEiji Ota static krwlock_t rdsv3_cong_monitor_lock;
98c0dd49bdSEiji Ota 
99c0dd49bdSEiji Ota /*
100c0dd49bdSEiji Ota  * Yes, a global lock.  It's used so infrequently that it's worth keeping it
101c0dd49bdSEiji Ota  * global to simplify the locking.  It's only used in the following
102c0dd49bdSEiji Ota  * circumstances:
103c0dd49bdSEiji Ota  *
104c0dd49bdSEiji Ota  *  - on connection buildup to associate a conn with its maps
105c0dd49bdSEiji Ota  *  - on map changes to inform conns of a new map to send
106c0dd49bdSEiji Ota  *
107c0dd49bdSEiji Ota  *  It's sadly ordered under the socket callback lock and the connection lock.
108c0dd49bdSEiji Ota  *  Receive paths can mark ports congested from interrupt context so the
109c0dd49bdSEiji Ota  *  lock masks interrupts.
110c0dd49bdSEiji Ota  */
111c0dd49bdSEiji Ota static kmutex_t rdsv3_cong_lock;
112c0dd49bdSEiji Ota static struct avl_tree rdsv3_cong_tree;
113c0dd49bdSEiji Ota 
114c0dd49bdSEiji Ota static struct rdsv3_cong_map *
rdsv3_cong_tree_walk(uint32_be_t addr,struct rdsv3_cong_map * insert)115c0dd49bdSEiji Ota rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
116c0dd49bdSEiji Ota {
117c0dd49bdSEiji Ota 	struct rdsv3_cong_map *map;
118c0dd49bdSEiji Ota 	avl_index_t where;
119c0dd49bdSEiji Ota 
120c0dd49bdSEiji Ota 	if (insert) {
121c0dd49bdSEiji Ota 		map = avl_find(&rdsv3_cong_tree, insert, &where);
122c0dd49bdSEiji Ota 		if (map == NULL) {
123c0dd49bdSEiji Ota 			avl_insert(&rdsv3_cong_tree, insert, where);
124c0dd49bdSEiji Ota 			return (NULL);
125c0dd49bdSEiji Ota 		}
126c0dd49bdSEiji Ota 	} else {
127c0dd49bdSEiji Ota 		struct rdsv3_cong_map map1;
128c0dd49bdSEiji Ota 		map1.m_addr = addr;
129c0dd49bdSEiji Ota 		map = avl_find(&rdsv3_cong_tree, &map1, &where);
130c0dd49bdSEiji Ota 	}
131c0dd49bdSEiji Ota 
132c0dd49bdSEiji Ota 	return (map);
133c0dd49bdSEiji Ota }
134c0dd49bdSEiji Ota 
135c0dd49bdSEiji Ota /*
136c0dd49bdSEiji Ota  * There is only ever one bitmap for any address.  Connections try and allocate
137c0dd49bdSEiji Ota  * these bitmaps in the process getting pointers to them.  The bitmaps are only
138c0dd49bdSEiji Ota  * ever freed as the module is removed after all connections have been freed.
139c0dd49bdSEiji Ota  */
140c0dd49bdSEiji Ota static struct rdsv3_cong_map *
rdsv3_cong_from_addr(uint32_be_t addr)141c0dd49bdSEiji Ota rdsv3_cong_from_addr(uint32_be_t addr)
142c0dd49bdSEiji Ota {
143c0dd49bdSEiji Ota 	struct rdsv3_cong_map *map;
144c0dd49bdSEiji Ota 	struct rdsv3_cong_map *ret = NULL;
145c0dd49bdSEiji Ota 	unsigned long zp;
146c0dd49bdSEiji Ota 	unsigned long i;
147c0dd49bdSEiji Ota 
148c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
149c0dd49bdSEiji Ota 
150c0dd49bdSEiji Ota 	map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
1515d5562f5SEiji Ota 	if (!map)
152c0dd49bdSEiji Ota 		return (NULL);
153c0dd49bdSEiji Ota 
154c0dd49bdSEiji Ota 	map->m_addr = addr;
155c0dd49bdSEiji Ota 	rdsv3_init_waitqueue(&map->m_waitq);
156c0dd49bdSEiji Ota 	list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
157c0dd49bdSEiji Ota 	    offsetof(struct rdsv3_connection, c_map_item));
158c0dd49bdSEiji Ota 
159c0dd49bdSEiji Ota 	for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
160c0dd49bdSEiji Ota 		zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
161c0dd49bdSEiji Ota 		if (zp == 0)
162c0dd49bdSEiji Ota 			goto out;
163c0dd49bdSEiji Ota 		map->m_page_addrs[i] = zp;
164c0dd49bdSEiji Ota 	}
165c0dd49bdSEiji Ota 
166c0dd49bdSEiji Ota 	mutex_enter(&rdsv3_cong_lock);
167c0dd49bdSEiji Ota 	ret = rdsv3_cong_tree_walk(addr, map);
168c0dd49bdSEiji Ota 	mutex_exit(&rdsv3_cong_lock);
169c0dd49bdSEiji Ota 
1705d5562f5SEiji Ota 	if (!ret) {
171c0dd49bdSEiji Ota 		ret = map;
172c0dd49bdSEiji Ota 		map = NULL;
173c0dd49bdSEiji Ota 	}
174c0dd49bdSEiji Ota 
175c0dd49bdSEiji Ota out:
176c0dd49bdSEiji Ota 	if (map) {
177c0dd49bdSEiji Ota 		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
178c0dd49bdSEiji Ota 		    i++)
179c0dd49bdSEiji Ota 			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
180c0dd49bdSEiji Ota 		kmem_free(map, sizeof (*map));
181c0dd49bdSEiji Ota 	}
182c0dd49bdSEiji Ota 
183c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
184c0dd49bdSEiji Ota 	    ret, ntohl(addr));
185c0dd49bdSEiji Ota 
186c0dd49bdSEiji Ota 	return (ret);
187c0dd49bdSEiji Ota }
188c0dd49bdSEiji Ota 
189c0dd49bdSEiji Ota /*
190c0dd49bdSEiji Ota  * Put the conn on its local map's list.  This is called when the conn is
191c0dd49bdSEiji Ota  * really added to the hash.  It's nested under the rdsv3_conn_lock, sadly.
192c0dd49bdSEiji Ota  */
193c0dd49bdSEiji Ota void
rdsv3_cong_add_conn(struct rdsv3_connection * conn)194c0dd49bdSEiji Ota rdsv3_cong_add_conn(struct rdsv3_connection *conn)
195c0dd49bdSEiji Ota {
196c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);
197c0dd49bdSEiji Ota 
198c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
199c0dd49bdSEiji Ota 	    conn, conn->c_lcong);
200c0dd49bdSEiji Ota 	mutex_enter(&rdsv3_cong_lock);
201c0dd49bdSEiji Ota 	list_insert_tail(&conn->c_lcong->m_conn_list, conn);
202c0dd49bdSEiji Ota 	mutex_exit(&rdsv3_cong_lock);
203c0dd49bdSEiji Ota 
204c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
205c0dd49bdSEiji Ota }
206c0dd49bdSEiji Ota 
207c0dd49bdSEiji Ota void
rdsv3_cong_remove_conn(struct rdsv3_connection * conn)208c0dd49bdSEiji Ota rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
209c0dd49bdSEiji Ota {
210c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);
211c0dd49bdSEiji Ota 
212c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
213c0dd49bdSEiji Ota 	    conn, conn->c_lcong);
214c0dd49bdSEiji Ota 	mutex_enter(&rdsv3_cong_lock);
215c0dd49bdSEiji Ota 	list_remove_node(&conn->c_map_item);
216c0dd49bdSEiji Ota 	mutex_exit(&rdsv3_cong_lock);
217c0dd49bdSEiji Ota 
218c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
219c0dd49bdSEiji Ota }
220c0dd49bdSEiji Ota 
221c0dd49bdSEiji Ota int
rdsv3_cong_get_maps(struct rdsv3_connection * conn)222c0dd49bdSEiji Ota rdsv3_cong_get_maps(struct rdsv3_connection *conn)
223c0dd49bdSEiji Ota {
224c0dd49bdSEiji Ota 	conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
225c0dd49bdSEiji Ota 	conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
226c0dd49bdSEiji Ota 
2275d5562f5SEiji Ota 	if (!(conn->c_lcong && conn->c_fcong))
228c0dd49bdSEiji Ota 		return (-ENOMEM);
229c0dd49bdSEiji Ota 
230c0dd49bdSEiji Ota 	return (0);
231c0dd49bdSEiji Ota }
232c0dd49bdSEiji Ota 
233c0dd49bdSEiji Ota void
rdsv3_cong_queue_updates(struct rdsv3_cong_map * map)234c0dd49bdSEiji Ota rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
235c0dd49bdSEiji Ota {
236c0dd49bdSEiji Ota 	struct rdsv3_connection *conn;
237c0dd49bdSEiji Ota 
238c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
239c0dd49bdSEiji Ota 
240c0dd49bdSEiji Ota 	mutex_enter(&rdsv3_cong_lock);
241c0dd49bdSEiji Ota 
242c0dd49bdSEiji Ota 	RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
243c0dd49bdSEiji Ota 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
244c0dd49bdSEiji Ota 			rdsv3_stats_inc(s_cong_update_queued);
2455d5562f5SEiji Ota 			(void) rdsv3_send_xmit(conn);
246c0dd49bdSEiji Ota 		}
247c0dd49bdSEiji Ota 	}
248c0dd49bdSEiji Ota 
249c0dd49bdSEiji Ota 	mutex_exit(&rdsv3_cong_lock);
250c0dd49bdSEiji Ota 
251c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
252c0dd49bdSEiji Ota }
253c0dd49bdSEiji Ota 
254c0dd49bdSEiji Ota void
rdsv3_cong_map_updated(struct rdsv3_cong_map * map,uint64_t portmask)255c0dd49bdSEiji Ota rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
256c0dd49bdSEiji Ota {
257c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_map_updated",
258c0dd49bdSEiji Ota 	    "waking map %p for %u.%u.%u.%u",
259c0dd49bdSEiji Ota 	    map, NIPQUAD(map->m_addr));
260cadbfdc3SEiji Ota 
261c0dd49bdSEiji Ota 	rdsv3_stats_inc(s_cong_update_received);
262*1a5e258fSJosef 'Jeff' Sipek 	atomic_inc_32(&rdsv3_cong_generation);
263c0dd49bdSEiji Ota #if 0
264c0dd49bdSEiji Ota XXX
265c0dd49bdSEiji Ota 	if (waitqueue_active(&map->m_waitq))
266c0dd49bdSEiji Ota #endif
267c0dd49bdSEiji Ota 		rdsv3_wake_up(&map->m_waitq);
268c0dd49bdSEiji Ota 
269c0dd49bdSEiji Ota 	if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
270c0dd49bdSEiji Ota 		struct rdsv3_sock *rs;
271c0dd49bdSEiji Ota 
272c0dd49bdSEiji Ota 		rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
273c0dd49bdSEiji Ota 		RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
274c0dd49bdSEiji Ota 		    rs_cong_list) {
275c0dd49bdSEiji Ota 			mutex_enter(&rs->rs_lock);
276c0dd49bdSEiji Ota 			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
277c0dd49bdSEiji Ota 			rs->rs_cong_mask &= ~portmask;
278c0dd49bdSEiji Ota 			mutex_exit(&rs->rs_lock);
279c0dd49bdSEiji Ota 			if (rs->rs_cong_notify)
280c0dd49bdSEiji Ota 				rdsv3_wake_sk_sleep(rs);
281c0dd49bdSEiji Ota 		}
282c0dd49bdSEiji Ota 		rw_exit(&rdsv3_cong_monitor_lock);
283c0dd49bdSEiji Ota 	}
284c0dd49bdSEiji Ota 
285c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
286c0dd49bdSEiji Ota }
287c0dd49bdSEiji Ota 
288c0dd49bdSEiji Ota int
rdsv3_cong_updated_since(unsigned long * recent)289c0dd49bdSEiji Ota rdsv3_cong_updated_since(unsigned long *recent)
290c0dd49bdSEiji Ota {
291c0dd49bdSEiji Ota 	unsigned long gen = atomic_get(&rdsv3_cong_generation);
292c0dd49bdSEiji Ota 
293c0dd49bdSEiji Ota 	if (*recent == gen)
294c0dd49bdSEiji Ota 		return (0);
295c0dd49bdSEiji Ota 	*recent = gen;
296c0dd49bdSEiji Ota 	return (1);
297c0dd49bdSEiji Ota }
298c0dd49bdSEiji Ota 
299c0dd49bdSEiji Ota /*
300c0dd49bdSEiji Ota  * We're called under the locking that protects the sockets receive buffer
301c0dd49bdSEiji Ota  * consumption.  This makes it a lot easier for the caller to only call us
302c0dd49bdSEiji Ota  * when it knows that an existing set bit needs to be cleared, and vice versa.
303c0dd49bdSEiji Ota  * We can't block and we need to deal with concurrent sockets working against
304c0dd49bdSEiji Ota  * the same per-address map.
305c0dd49bdSEiji Ota  */
306c0dd49bdSEiji Ota void
rdsv3_cong_set_bit(struct rdsv3_cong_map * map,uint16_be_t port)307c0dd49bdSEiji Ota rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
308c0dd49bdSEiji Ota {
309c0dd49bdSEiji Ota 	unsigned long i;
310c0dd49bdSEiji Ota 	unsigned long off;
311c0dd49bdSEiji Ota 
312c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_set_bit",
313c0dd49bdSEiji Ota 	    "setting congestion for %u.%u.%u.%u:%u in map %p",
314c0dd49bdSEiji Ota 	    NIPQUAD(map->m_addr), ntohs(port), map);
315c0dd49bdSEiji Ota 
316c0dd49bdSEiji Ota 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
317c0dd49bdSEiji Ota 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
318cadbfdc3SEiji Ota 	set_le_bit(off, (void *)map->m_page_addrs[i]);
319c0dd49bdSEiji Ota }
320c0dd49bdSEiji Ota 
321c0dd49bdSEiji Ota void
rdsv3_cong_clear_bit(struct rdsv3_cong_map * map,uint16_be_t port)322c0dd49bdSEiji Ota rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
323c0dd49bdSEiji Ota {
324c0dd49bdSEiji Ota 	unsigned long i;
325c0dd49bdSEiji Ota 	unsigned long off;
326c0dd49bdSEiji Ota 
327c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
328c0dd49bdSEiji Ota 	    "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
329c0dd49bdSEiji Ota 	    NIPQUAD(map->m_addr), ntohs(port), map);
330c0dd49bdSEiji Ota 
331c0dd49bdSEiji Ota 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
332c0dd49bdSEiji Ota 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
333cadbfdc3SEiji Ota 	clear_le_bit(off, (void *)map->m_page_addrs[i]);
334c0dd49bdSEiji Ota }
335c0dd49bdSEiji Ota 
336c0dd49bdSEiji Ota static int
rdsv3_cong_test_bit(struct rdsv3_cong_map * map,uint16_be_t port)337c0dd49bdSEiji Ota rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
338c0dd49bdSEiji Ota {
339c0dd49bdSEiji Ota 	unsigned long i;
340c0dd49bdSEiji Ota 	unsigned long off;
341c0dd49bdSEiji Ota 
342c0dd49bdSEiji Ota 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
343c0dd49bdSEiji Ota 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
344c0dd49bdSEiji Ota 
345c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
346c0dd49bdSEiji Ota 	    ntohs(port), i, off);
347c0dd49bdSEiji Ota 
348cadbfdc3SEiji Ota 	return (test_le_bit(off, (void *)map->m_page_addrs[i]));
349c0dd49bdSEiji Ota }
350c0dd49bdSEiji Ota 
351c0dd49bdSEiji Ota void
rdsv3_cong_add_socket(struct rdsv3_sock * rs)352c0dd49bdSEiji Ota rdsv3_cong_add_socket(struct rdsv3_sock *rs)
353c0dd49bdSEiji Ota {
354c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);
355c0dd49bdSEiji Ota 
356c0dd49bdSEiji Ota 	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
357c0dd49bdSEiji Ota 	if (!list_link_active(&rs->rs_cong_list))
358c0dd49bdSEiji Ota 		list_insert_head(&rdsv3_cong_monitor, rs);
359c0dd49bdSEiji Ota 	rw_exit(&rdsv3_cong_monitor_lock);
360c0dd49bdSEiji Ota }
361c0dd49bdSEiji Ota 
362c0dd49bdSEiji Ota void
rdsv3_cong_remove_socket(struct rdsv3_sock * rs)363c0dd49bdSEiji Ota rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
364c0dd49bdSEiji Ota {
365c0dd49bdSEiji Ota 	struct rdsv3_cong_map *map;
366c0dd49bdSEiji Ota 
367c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
368c0dd49bdSEiji Ota 
369c0dd49bdSEiji Ota 	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
370c0dd49bdSEiji Ota 	list_remove_node(&rs->rs_cong_list);
371c0dd49bdSEiji Ota 	rw_exit(&rdsv3_cong_monitor_lock);
372c0dd49bdSEiji Ota 
373c0dd49bdSEiji Ota 	/* update congestion map for now-closed port */
374c0dd49bdSEiji Ota 	mutex_enter(&rdsv3_cong_lock);
375c0dd49bdSEiji Ota 	map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
376c0dd49bdSEiji Ota 	mutex_exit(&rdsv3_cong_lock);
377c0dd49bdSEiji Ota 
378c0dd49bdSEiji Ota 	if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
379c0dd49bdSEiji Ota 		rdsv3_cong_clear_bit(map, rs->rs_bound_port);
380c0dd49bdSEiji Ota 		rdsv3_cong_queue_updates(map);
381c0dd49bdSEiji Ota 	}
382c0dd49bdSEiji Ota }
383c0dd49bdSEiji Ota 
384c0dd49bdSEiji Ota int
rdsv3_cong_wait(struct rdsv3_cong_map * map,uint16_be_t port,int nonblock,struct rdsv3_sock * rs)385c0dd49bdSEiji Ota rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
386c0dd49bdSEiji Ota     struct rdsv3_sock *rs)
387c0dd49bdSEiji Ota {
3886e18d381Sagiri 	int ret = 0;
389c0dd49bdSEiji Ota 
390c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
391c0dd49bdSEiji Ota 	    rs, nonblock);
392c0dd49bdSEiji Ota 
393c0dd49bdSEiji Ota 	if (!rdsv3_cong_test_bit(map, port))
394c0dd49bdSEiji Ota 		return (0);
395c0dd49bdSEiji Ota 	if (nonblock) {
396c0dd49bdSEiji Ota 		if (rs && rs->rs_cong_monitor) {
397c0dd49bdSEiji Ota 			/*
398c0dd49bdSEiji Ota 			 * It would have been nice to have an atomic set_bit on
399c0dd49bdSEiji Ota 			 * a uint64_t.
400c0dd49bdSEiji Ota 			 */
401c0dd49bdSEiji Ota 			mutex_enter(&rs->rs_lock);
402c0dd49bdSEiji Ota 			rs->rs_cong_mask |=
403fe817b60SEiji Ota 			    RDS_CONG_MONITOR_MASK(ntohs(port));
404c0dd49bdSEiji Ota 			mutex_exit(&rs->rs_lock);
405c0dd49bdSEiji Ota 
406c0dd49bdSEiji Ota 			/*
407c0dd49bdSEiji Ota 			 * Test again - a congestion update may have arrived in
408c0dd49bdSEiji Ota 			 * the meantime.
409c0dd49bdSEiji Ota 			 */
410c0dd49bdSEiji Ota 			if (!rdsv3_cong_test_bit(map, port))
411c0dd49bdSEiji Ota 				return (0);
412c0dd49bdSEiji Ota 		}
413c0dd49bdSEiji Ota 		rdsv3_stats_inc(s_cong_send_error);
414c0dd49bdSEiji Ota 		return (-ENOBUFS);
415c0dd49bdSEiji Ota 	}
416c0dd49bdSEiji Ota 
417c0dd49bdSEiji Ota 	rdsv3_stats_inc(s_cong_send_blocked);
418c0dd49bdSEiji Ota 	RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
419c0dd49bdSEiji Ota 	    map, ntohs(port));
420c0dd49bdSEiji Ota 
4216e18d381Sagiri #if 0
4226e18d381Sagiri 	ret = rdsv3_wait_sig(&map->m_waitq, !rdsv3_cong_test_bit(map, port));
4236e18d381Sagiri 	if (ret == 0)
4246e18d381Sagiri 		return (-ERESTART);
4256e18d381Sagiri 	return (0);
4266e18d381Sagiri #else
427c0dd49bdSEiji Ota 	mutex_enter(&map->m_waitq.waitq_mutex);
4286e18d381Sagiri 	map->m_waitq.waitq_waiters++;
429c0dd49bdSEiji Ota 	while (rdsv3_cong_test_bit(map, port)) {
4306e18d381Sagiri 		ret = cv_wait_sig(&map->m_waitq.waitq_cv,
4316e18d381Sagiri 		    &map->m_waitq.waitq_mutex);
4326e18d381Sagiri 		if (ret == 0) {
4335e12ddadSEiji Ota 			ret = -EINTR;
434c0dd49bdSEiji Ota 			break;
435c0dd49bdSEiji Ota 		}
436c0dd49bdSEiji Ota 	}
4376e18d381Sagiri 	map->m_waitq.waitq_waiters--;
438c0dd49bdSEiji Ota 	mutex_exit(&map->m_waitq.waitq_mutex);
439c0dd49bdSEiji Ota 	return (ret);
4406e18d381Sagiri #endif
441c0dd49bdSEiji Ota }
442c0dd49bdSEiji Ota 
443c0dd49bdSEiji Ota void
rdsv3_cong_exit(void)444c0dd49bdSEiji Ota rdsv3_cong_exit(void)
445c0dd49bdSEiji Ota {
446c0dd49bdSEiji Ota 	struct rdsv3_cong_map *map;
447c0dd49bdSEiji Ota 	unsigned long i;
448c0dd49bdSEiji Ota 
449c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
450c0dd49bdSEiji Ota 
451c0dd49bdSEiji Ota 	while ((map = avl_first(&rdsv3_cong_tree))) {
452c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
453c0dd49bdSEiji Ota 		avl_remove(&rdsv3_cong_tree, map);
454c0dd49bdSEiji Ota 		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
455c0dd49bdSEiji Ota 		    i++)
456c0dd49bdSEiji Ota 			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
457c0dd49bdSEiji Ota 		kmem_free(map, sizeof (*map));
458c0dd49bdSEiji Ota 	}
459c0dd49bdSEiji Ota 
460c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
461c0dd49bdSEiji Ota }
462c0dd49bdSEiji Ota 
463c0dd49bdSEiji Ota /*
464c0dd49bdSEiji Ota  * Allocate a RDS message containing a congestion update.
465c0dd49bdSEiji Ota  */
466c0dd49bdSEiji Ota struct rdsv3_message *
rdsv3_cong_update_alloc(struct rdsv3_connection * conn)467c0dd49bdSEiji Ota rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
468c0dd49bdSEiji Ota {
469c0dd49bdSEiji Ota 	struct rdsv3_cong_map *map = conn->c_lcong;
470c0dd49bdSEiji Ota 	struct rdsv3_message *rm;
471c0dd49bdSEiji Ota 
472c0dd49bdSEiji Ota 	rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
473c0dd49bdSEiji Ota 	if (!IS_ERR(rm))
474c0dd49bdSEiji Ota 		rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
475c0dd49bdSEiji Ota 
476c0dd49bdSEiji Ota 	return (rm);
477c0dd49bdSEiji Ota }
478c0dd49bdSEiji Ota 
479c0dd49bdSEiji Ota static int
rdsv3_cong_compare(const void * map1,const void * map2)480c0dd49bdSEiji Ota rdsv3_cong_compare(const void *map1, const void *map2)
481c0dd49bdSEiji Ota {
482c0dd49bdSEiji Ota #define	addr1	((struct rdsv3_cong_map *)map1)->m_addr
483c0dd49bdSEiji Ota #define	addr2	((struct rdsv3_cong_map *)map2)->m_addr
484c0dd49bdSEiji Ota 
485c0dd49bdSEiji Ota 	if (addr1 < addr2)
486c0dd49bdSEiji Ota 		return (-1);
487c0dd49bdSEiji Ota 	if (addr1 > addr2)
488c0dd49bdSEiji Ota 		return (1);
489c0dd49bdSEiji Ota 	return (0);
490c0dd49bdSEiji Ota }
491c0dd49bdSEiji Ota 
492c0dd49bdSEiji Ota void
rdsv3_cong_init(void)493c0dd49bdSEiji Ota rdsv3_cong_init(void)
494c0dd49bdSEiji Ota {
495c0dd49bdSEiji Ota 	list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
496c0dd49bdSEiji Ota 	    offsetof(struct rdsv3_sock, rs_cong_list));
497c0dd49bdSEiji Ota 	rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
498c0dd49bdSEiji Ota 	mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
499c0dd49bdSEiji Ota 	avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
500c0dd49bdSEiji Ota 	    sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
501c0dd49bdSEiji Ota 	    m_rb_node));
502c0dd49bdSEiji Ota }
503